swps3
DynProgr_SPE_functions.cc
Go to the documentation of this file.
1 
5 /*
6  * Copyright (c) 2007-2008 ETH Zürich, Institute of Computational Science
7  *
8  * Permission is hereby granted, free of charge, to any person
9  * obtaining a copy of this software and associated documentation
10  * files (the "Software"), to deal in the Software without
11  * restriction, including without limitation the rights to use,
12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the
14  * Software is furnished to do so, subject to the following
15  * conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27  * OTHER DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include "DynProgr_SPE_functions.h"
31 #include "DynProgr_SPE.h"
32 #include "matrix.h"
33 #include <cstdlib>
34 #include <malloc.h>
35 #include <float.h>
36 #include <cstdio>
37 #include <string.h>
38 #include <spu_intrinsics.h>
39 #include <sys/types.h>
40 
/**
 * Returns the smaller of two values of the same type.
 *
 * When neither argument compares less than the other, the second
 * argument is returned.
 */
template<typename T> static inline T min( T a, T b ){
    if( a < b )
        return a;
    return b;
}
/**
 * Returns the larger of two values of the same type.
 *
 * When neither argument compares greater than the other, the second
 * argument is returned.
 */
template<typename T> static inline T max( T a, T b ){
    if( a > b )
        return a;
    return b;
}
43 
/**
 * Compile-time type trait.
 *
 * IsInteger<T>::value is true for the signed fixed-width integer types
 * specialised below (int8_t, int16_t, int32_t) and false for every other
 * type. It is used to select integer-only code paths inside templates.
 */
template<typename T>
struct IsInteger {
    static const bool value = false;
};

template<>
struct IsInteger<int8_t> {
    static const bool value = true;
};

template<>
struct IsInteger<int16_t> {
    static const bool value = true;
};

template<>
struct IsInteger<int32_t> {
    static const bool value = true;
};
48 
/**
 * Largest finite value representable by T, available as MaxValue<T>::value.
 *
 * The primary template serves the signed integer types: it builds the bit
 * pattern with every bit set except the sign bit. The computation is done
 * in long long and then narrowed to T explicitly.
 *
 * float and double have explicit specialisations. Their values are defined
 * out of class because an in-class initialiser on a non-integral
 * `static const` member is only a GNU extension and is rejected by
 * ISO C++ compilers.
 */
template<typename T> struct MaxValue {
    static const T value = (T)( -1 ^ (1ll<<(sizeof(T)*8-1)) );
};

template<> struct MaxValue<float> {
    static const float value;
};
template<> struct MaxValue<double> {
    static const double value;
};

const float  MaxValue<float>::value  = FLT_MAX;
const double MaxValue<double>::value = DBL_MAX;
52 
/**
 * Lowest (most negative) finite value representable by T, available as
 * MinValue<T>::value.
 *
 * The primary template serves the signed integer types: only the sign bit
 * is set. The computation is done in long long and then narrowed to T
 * explicitly.
 *
 * For float and double the lowest value is -FLT_MAX / -DBL_MAX. Note that
 * FLT_MIN / DBL_MIN are the smallest *positive* normalised values and are
 * therefore not suitable as a lower bound. The values are defined out of
 * class because an in-class initialiser on a non-integral `static const`
 * member is only a GNU extension.
 */
template<typename T> struct MinValue {
    static const T value = (T)( 1ll<<(sizeof(T)*8-1) );
};

template<> struct MinValue<float> {
    static const float value;
};
template<> struct MinValue<double> {
    static const double value;
};

const float  MinValue<float>::value  = -FLT_MAX;
const double MinValue<double>::value = -DBL_MAX;
56 
/* ---- Shared state read by the alignment routines in this file ---- */

/* s1 is the sequence the profile is built from; s2 is the sequence whose
 * symbols select a profile row for each column of the computation. */
char * s1;
char * s2;

/* Lengths of s1 and s2. */
int ls1;
int ls2;

/* mn and mx are subtracted from the integer zero and goal values. */
double mn;
double mx;

/* Gap penalties: fixedDel is added when a gap is opened, incDel when an
 * existing gap is extended. */
double fixedDel;
double incDel;

/* Untyped buffers; the templated routines cast them to the element or
 * vector type they are instantiated with. */
void *simi;      /* similarity matrix, indexed [symbol*MATRIX_DIM + symbol] */
void *profile;   /* profile of the current block */
void *loadOpt;   /* scores of the previous column */
void *storeOpt;  /* scores of the current column */
void *rD;        /* row deletion scores */
void *maxS;      /* per-column score carried from one block to the next */
void *delS;      /* per-column deletion score carried between blocks */
64 
/**
 * Element-wise maximum of two SPU vectors of the same type.
 *
 * Lanes in which b compares greater than a are taken from b; all other
 * lanes (including equal ones) are taken from a.
 */
template< class V >
static inline V spu_max( V a, V b ){
    return spu_sel( a, b, spu_cmpgt( b, a ) );
}
/**
 * Element-wise minimum of two SPU vectors of the same type.
 *
 * Lanes in which a compares greater than b are taken from b; all other
 * lanes (including equal ones) are taken from a.
 */
template< class V >
static inline V spu_min( V a, V b ){
    return spu_sel( a, b, spu_cmpgt( a, b ) );
}
77 
78 #undef SHORTCUT
79 
97 template< class T, class V > static inline T dynProgrLocalBlock(
98  int currentBlockSize,
99  T zero, T goal,
100  T * maxS, T* delS,
101  const V * profile,
102  V * loadOpt,
103  V * storeOpt,
104  V * rD){
105  /**********************************************************************
106  * This version of the code implements the idea presented in
107  *
108  ***********************************************************************
109  * Striped Smith-Waterman speeds database searches six times over other
110  * SIMD implementations
111  *
112  * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
113  **********************************************************************/
114 
115  const V vZero = spu_splats( zero );
116  const V vGoal = spu_splats( goal );
117  const V vDelFixed = spu_splats( (T)fixedDel );
118  const V vDelInc = spu_splats( (T)incDel );
119 
120  T maxScore = zero;
121  const int nSeg = sizeof(V)/sizeof(T); // the number of segments
122  const int segLen = currentBlockSize/nSeg; // the segment length
123 
124  V vMaxScore = vZero; // The maximum score
125  T prevMax = zero;
126  /* Initialize the other arrays */
127  /*******************************/
128  for(int i=0; LIKELY(i<segLen); i++)
129  loadOpt[i] = storeOpt[i] = rD[i] = vZero;
130 
131  /* looping through all the columns */
132  /***********************************/
133  for( int i=0; LIKELY(i<ls2); i++ ){
134 
135  /* compute the opt and cd score depending on the previous column */
136  /*******************************************************************/
137  // set the column deletion score to zero, has to be fixed later on
138  V vCD = spu_insert( delS[i], vZero, 0);
139 
140  // set the opt score to the elements computed in the previous column
141  // set the low of storeOpt to MaxS[j]
142  // spu_rlmaskqwbyte is a complicated way to say right shift
143  V vStoreOpt = spu_rlmaskqwbyte(storeOpt[segLen-1], -sizeof(T));
144  vStoreOpt = spu_insert( prevMax, vStoreOpt, 0 );
145 
146  /* compute the current profile, depending on the character in s2 */
147  /*****************************************************************/
148  const V * currentProfile = profile + s2[i]*segLen;
149 
150 #if 0
151  for(int ii=0; ii<nSeg; ++ii) {
152  for(int jj=0; jj<segLen; ++jj) {
153  if(ii*segLen+jj < ls1)
154  printf("\t%d",(int)((T*)currentProfile)[ii+jj*nSeg]);
155  }
156  }
157  printf("\n");
158 #endif
159 
160  /* swap the old optimal score with the new one */
161  /***********************************************/
162  V * swap = storeOpt;
163  storeOpt = loadOpt;
164  loadOpt = swap;
165 
166  /* main loop computing the max, precomputing etc. */
167  /**************************************************/
168  for( int j=0; LIKELY(j<segLen); j++ ){
169  // Load the the rd value
170  V vRD = rD[j];
171  V vTmp = loadOpt[j];
172  vRD += vDelInc;
173  vTmp += vDelFixed;
174  if(IsInteger<T>::value) {
175  vRD = spu_max(vRD,vZero);
176  }
177  vRD = spu_max(vTmp,vRD);
178  rD[j] = vRD;
179 
180  // add the profile the prev. opt
181  vStoreOpt += currentProfile[j];
182 
183  // To provide saturated arithmetics
185  vStoreOpt = spu_min( vStoreOpt, vGoal );
186 
187  // update the maxscore found so far
188  vMaxScore = spu_max( vMaxScore, vStoreOpt );
189  // precompute the maximum here
190  vTmp = spu_max( vCD, vRD );
191  // compute the correct opt score of the cell
192  vStoreOpt = spu_max( vStoreOpt, vTmp );
193  vStoreOpt = spu_max( vStoreOpt, vZero );
194 
195  // store the opt score of the cell
196  storeOpt[j] = vStoreOpt;
197 
198  // precompute rd and cd for next iteration
199  vStoreOpt += vDelFixed;
200  vCD += vDelInc;
202  vStoreOpt = spu_max( vStoreOpt, vZero );
203  vCD = spu_max( vStoreOpt, vCD );
204 
205  // load precomputed opt for next iteration
206  vStoreOpt = loadOpt[j];
207  }
208 
209  /* set totcells */
210  /****************/
211 // totcells += ls1;
212  /* check for a changed MaxScore */
213  /********************************/
214  for( T* tmp = (T*)&vMaxScore; tmp<(T*)(&vMaxScore+1); tmp++ )
215  if (UNLIKELY(maxScore < *tmp))
216  maxScore = *tmp;
217  // if the goal was reached, exit
218  if ( UNLIKELY(maxScore >= goal) )
219  return MaxValue<T>::value;
220 
221  /* cleaning up with missed arrows */
222  /**********************************/
223  delS[i] = spu_extract( vCD, nSeg-1 );
224 
225  V vStoreOptx = storeOpt[0];
226  vStoreOptx = spu_max(vStoreOptx + (vDelFixed - vDelInc),vZero);
227  V vCDx = spu_rlmaskqwbyte(vCD, -sizeof(T));
228  vCDx = spu_insert( zero, vCDx, 0 );
229 
230  if( spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCDx,vStoreOptx)),0) != 0) {
231  for(int j=0; LIKELY(j<nSeg); ++j) {
232  // set everything up for the next iteration
233  vCD = spu_rlmaskqwbyte(vCD, -sizeof(T));
234  vCD = spu_insert( zero, vCD, 0 );
235 
236  for(int k=0; k<segLen-1; ++k) {
237  // compute the current optimal value of the cell
238  vStoreOpt = storeOpt[k];
239  vStoreOpt = spu_max( vStoreOpt, vCD );
240  storeOpt[k] = vStoreOpt;
241 
242  // precompute the scores for the next cell
243  vCD = spu_max( vCD + vDelInc, vZero );
244  vStoreOpt = spu_max( vStoreOpt + vDelFixed, vZero );
245 
246  #ifdef SHORTCUT
247  if(UNLIKELY(spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0))
248  goto shortcut;
249  #endif
250 
251  }
252 
253  // compute the current optimal value of the cell
254  vStoreOpt = storeOpt[segLen-1];
255  vStoreOpt = spu_max( vStoreOpt, vCD );
256  storeOpt[segLen-1] = vStoreOpt;
257 
258  // precompute the cd value for the next cell
259  vCD = spu_max( vCD + vDelInc, vZero );
260  vStoreOpt = spu_max( vStoreOpt + vDelFixed, vZero );
261 
262  // Update the del Score
263  T temp = spu_extract( vCD, nSeg-1 );
264  if ( UNLIKELY(delS[i] < temp) )
265  delS[i] = temp;
266 
267  if(UNLIKELY(spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0)) break;
268  }
269  #ifdef SHORTCUT
270  shortcut:
271  (void)1;
272  #endif
273  }
274 
275  /* store the new MaxScore for the next line block */
276  /**************************************************/
277  prevMax = maxS[i];
278  maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
279 
280 #ifdef DEBUG
281  printf("%c\t",s2[i]);
282  for(int ii=0; ii<nSeg; ++ii) {
283  for(int jj=0; jj<segLen; ++jj) {
284  if(ii*segLen+jj < ls1)
285  printf("%d\t",(int)(((T*)storeOpt)[ii+jj*nSeg]-zero));
286  }
287  }
288  printf("\n");
289 #endif
290  }
291  return maxScore;
292 }
293 
#ifdef UNROLL

/**
 * Two-way unrolled variant of the striped block computation: every segment
 * is split into two sub-segments that are processed side by side in the
 * inner loop (the *1 variables cover rows [0, subSegLen), the *2 variables
 * cover rows [subSegLen, segLen)).
 *
 * Besides its arguments, the function reads the file-scope variables ls2,
 * s2, fixedDel and incDel (and ls1 in DEBUG builds). Note that several
 * parameters shadow file-scope variables of the same name.
 *
 * @param currentBlockSize number of rows handled by this block; the segment
 *                         length is currentBlockSize / (sizeof(V)/sizeof(T))
 *                         rounded up to an even number
 * @param zero             baseline score; all scores are clamped to be >= zero
 * @param goal             threshold at which the computation is abandoned
 * @param maxS             per-column array (ls2 entries). On entry maxS[i]
 *                         is used as the predecessor score coming from the
 *                         preceding block; on exit it holds the score of
 *                         this block's last row in column i.
 * @param delS             per-column array (ls2 entries) with the column
 *                         deletion score carried into / out of this block
 * @param profile          profile for this block; the row for symbol c
 *                         starts at profile + c*segLen
 * @param loadOpt          scratch array of segLen vectors (previous column)
 * @param storeOpt         scratch array of segLen vectors (current column)
 * @param rD               scratch array of segLen vectors (row deletion)
 * @return the highest score found in this block, or MaxValue<T>::value as
 *         soon as a score reaches goal
 */
template< class T, class V > static inline T dynProgrLocalBlock2(
    int currentBlockSize,
    T zero, T goal,
    T * maxS, T* delS,
    const V * profile,
    V * loadOpt,
    V * storeOpt,
    V * rD){
    /**********************************************************************
     * This version of the code implements the idea presented in
     *
     ***********************************************************************
     * Striped Smith-Waterman speeds database searches six times over other
     * SIMD implementations
     *
     * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
     **********************************************************************/

    const V vZero = spu_splats( zero );
    const V vGoal = spu_splats( goal );
    const V vDelFixed = spu_splats( (T)fixedDel );
    const V vDelInc = spu_splats( (T)incDel );

    T maxScore = zero;
    const int nSeg = sizeof(V)/sizeof(T);                // the number of segments
    const int segLen = (currentBlockSize/nSeg + 1) & ~1; // the segment length
    const int subSegLen = segLen / 2;                    // the sub segment length
    V vMaxScore1 = vZero,vMaxScore2 = vZero;             // The maximum score
    T prevMax = zero;

    /* Initialize the other arrays */
    /*******************************/
    for(int i=0; LIKELY(i<segLen); i++)
        loadOpt[i] = storeOpt[i] = rD[i] = vZero;

    /* looping through all the columns */
    /***********************************/
    for( int i=0; LIKELY(i<ls2); i++ ){

        /* compute the opt and cd score depending on the previous column */
        /*******************************************************************/
        // only lane 0 of the first sub-segment gets the carried-in column
        // deletion score; everything else starts at zero and is fixed up
        // by the correction loop below
        V vCD1 = spu_insert( delS[i], vZero, 0);
        V vCD2 = vZero;

        // set the opt score to the elements computed in the previous column:
        // the first sub-segment continues from the shifted last vector with
        // prevMax in lane 0, the second from the end of the first sub-segment
        // spu_rlmaskqwbyte is a complicated way to say right shift
        V vStoreOpt1 = spu_rlmaskqwbyte(storeOpt[segLen-1], -sizeof(T));
        vStoreOpt1 = spu_insert( prevMax, vStoreOpt1, 0 );
        V vStoreOpt2 = storeOpt[subSegLen-1];
        /* compute the current profile, depending on the character in s2 */
        /*****************************************************************/
        const V * currentProfile = profile + s2[i]*segLen;

#if 0
        for(int ii=0; ii<nSeg; ++ii) {
            for(int jj=0; jj<segLen; ++jj) {
                if(ii*segLen+jj < ls1)
                    printf("\t%d",(int)((T*)currentProfile)[ii+jj*nSeg]);
            }
        }
        printf("\n");
#endif

        /* swap the old optimal score with the new one */
        /***********************************************/
        V * swap = storeOpt;
        storeOpt = loadOpt;
        loadOpt = swap;

        /* main loop computing the max, precomputing etc. */
        /**************************************************/
        for( int j=0; LIKELY(j<subSegLen); j++ ){
            // load the row deletion score
            V vRD1 = rD[j];
            V vRD2 = rD[j+subSegLen];
            V vTmp1 = loadOpt[j];
            V vTmp2 = loadOpt[j+subSegLen];
            vRD1 += vDelInc;
            vRD2 += vDelInc;
            vTmp1 += vDelFixed;
            vTmp2 += vDelFixed;
            if(IsInteger<T>::value) {
                vRD1 = spu_max(vRD1,vZero);
                vRD2 = spu_max(vRD2,vZero);
            }
            vRD1 = spu_max(vTmp1,vRD1);
            vRD2 = spu_max(vTmp2,vRD2);
            rD[j] = vRD1;
            rD[j+subSegLen] = vRD2;

            // add the profile to the prev. opt
            vStoreOpt1 += currentProfile[j];
            vStoreOpt2 += currentProfile[j+subSegLen];

            // To provide saturated arithmetics (integer types only)
            if (IsInteger<T>::value){
                vStoreOpt1 = spu_min( vStoreOpt1, vGoal );
                vStoreOpt2 = spu_min( vStoreOpt2, vGoal );
            }
            // update the maxscore found so far
            vMaxScore1 = spu_max( vMaxScore1, vStoreOpt1 );
            vMaxScore2 = spu_max( vMaxScore2, vStoreOpt2 );

            // precompute the maximum here (gives about 5% speedup)
            vTmp1 = spu_max( vCD1, vRD1 );
            vTmp2 = spu_max( vCD2, vRD2 );
            // compute the correct opt score of the cell
            vStoreOpt1 = spu_max( vStoreOpt1, vTmp1 );
            vStoreOpt2 = spu_max( vStoreOpt2, vTmp2 );
            vStoreOpt1 = spu_max( vStoreOpt1, vZero );
            vStoreOpt2 = spu_max( vStoreOpt2, vZero );

            // store the opt score of the cell
            storeOpt[j          ] = vStoreOpt1;
            storeOpt[j+subSegLen] = vStoreOpt2;

            // precompute rd and cd for next iteration
            vStoreOpt1 += vDelFixed;
            vStoreOpt2 += vDelFixed;
            vCD1 += vDelInc;
            vCD2 += vDelInc;
            if(IsInteger<T>::value) {
                vStoreOpt1 = spu_max( vStoreOpt1, vZero );
                vStoreOpt2 = spu_max( vStoreOpt2, vZero );
            }
            vCD1 = spu_max( vStoreOpt1, vCD1 );
            vCD2 = spu_max( vStoreOpt2, vCD2 );

            // load precomputed opt for next iteration
            vStoreOpt1 = loadOpt[j];
            vStoreOpt2 = loadOpt[j+subSegLen];
        }

        /* set totcells */
        /****************/
//      totcells += ls1;
        /* check for a changed MaxScore */
        /********************************/
        // combine both accumulators, then take the horizontal maximum
        V vMaxScore = spu_max( vMaxScore1, vMaxScore2 );
        for( T* tmp = (T*)&vMaxScore; tmp<(T*)(&vMaxScore+1); tmp++ )
            if (UNLIKELY(maxScore < *tmp))
                maxScore = *tmp;
        // if the goal was reached, exit
        if ( UNLIKELY(maxScore >= goal) )
            return MaxValue<T>::value;

        /* cleaning up with missed arrows */
        /**********************************/
        delS[i] = spu_extract( vCD2, nSeg-1 );

        // Decide whether the correction pass is needed. The comparison must
        // use the scores just stored for the CURRENT column (storeOpt[0] and
        // storeOpt[subSegLen]); vStoreOpt1/vStoreOpt2 still hold values of
        // the previous column at this point.
        V vStoreOptx1 = storeOpt[0        ];
        V vStoreOptx2 = storeOpt[subSegLen];
        vStoreOptx1 = spu_max(vStoreOptx1 + (vDelFixed - vDelInc),vZero);
        vStoreOptx2 = spu_max(vStoreOptx2 + (vDelFixed - vDelInc),vZero);
        // the first sub-segment feeds the second; the second feeds the first
        // one lane further on
        V vCDx2 = vCD1;
        V vCDx1 = spu_rlmaskqwbyte(vCD2, -sizeof(T));
        vCDx1 = spu_insert( zero, vCDx1, 0 );
        if( spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCDx1,vStoreOptx1),(vector unsigned char)spu_cmpgt(vCDx2,vStoreOptx2))),0) != 0) {
            for(int j=0; LIKELY(j<nSeg+1); ++j) {
                // set everything up for the next iteration
                V vRotate = vCD2;
                vCD2 = vCD1;
                vCD1 = spu_rlmaskqwbyte(vRotate, -sizeof(T));
                vCD1 = spu_insert( zero, vCD1, 0 );

                for(int k=0; k<subSegLen-1; ++k) {
                    // compute the current optimal value of the cell
                    vStoreOpt1 = storeOpt[k            ];
                    vStoreOpt2 = storeOpt[k + subSegLen];
                    vStoreOpt1 = spu_max( vStoreOpt1, vCD1 );
                    vStoreOpt2 = spu_max( vStoreOpt2, vCD2 );
                    storeOpt[k            ] = vStoreOpt1;
                    storeOpt[k + subSegLen] = vStoreOpt2;

                    // precompute the scores for the next cell
                    vStoreOpt1 = spu_max( vStoreOpt1 + vDelFixed, vZero );
                    vStoreOpt2 = spu_max( vStoreOpt2 + vDelFixed, vZero );
                    vCD1 = spu_max( vCD1 + vDelInc, vZero );
                    vCD2 = spu_max( vCD2 + vDelInc, vZero );

                    #ifdef SHORTCUT
                    if(UNLIKELY(spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0))
                        goto shortcut;
                    #endif

                }

                // compute the current optimal value of the cell
                vStoreOpt1 = storeOpt[subSegLen - 1];
                vStoreOpt2 = storeOpt[segLen    - 1];
                vStoreOpt1 = spu_max( vStoreOpt1, vCD1 );
                vStoreOpt2 = spu_max( vStoreOpt2, vCD2 );
                storeOpt[subSegLen - 1] = vStoreOpt1;
                storeOpt[segLen    - 1] = vStoreOpt2;

                // precompute the scores for the next cell
                vStoreOpt1 = spu_max( vStoreOpt1 + vDelFixed, vZero );
                vStoreOpt2 = spu_max( vStoreOpt2 + vDelFixed, vZero );
                vCD1 = spu_max( vCD1 + vDelInc, vZero );
                vCD2 = spu_max( vCD2 + vDelInc, vZero );

                // Update the del Score
                T temp = spu_extract( vCD2, nSeg-1 );
                if ( UNLIKELY(delS[i] < temp) )
                    delS[i] = temp;

                // stop as soon as no lane can be improved any further
                if(UNLIKELY(spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0)) break;
            }
            #ifdef SHORTCUT
            shortcut:
            (void)1;
            #endif
        }

        /* store the new MaxScore for the next line block */
        /**************************************************/
        prevMax = maxS[i];
        maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );

#ifdef DEBUG
        printf("%c\t",s2[i]);
        for(int ii=0; ii<nSeg; ++ii) {
            for(int jj=0; jj<segLen; ++jj) {
                if(ii*segLen+jj < ls1)
                    printf("%d\t",(int)(((T*)storeOpt)[ii+jj*nSeg]-zero));
            }
        }
        printf("\n");
#endif
    }
    return maxScore;
}
#endif
549 
558 template< class T, class V >
559 static void doCreateProfile( int blockStart, int currentBlockSize, const T* simi, V* currentBlock){
560  const int nSeg = sizeof(V)/sizeof(T); // the number of segments
561  const int segLen = currentBlockSize/nSeg; // the segment length
562 
563  for( int i=0; i<MATRIX_DIM; i++ ){
564  T *currentProfile = ((T*)currentBlock)+i*currentBlockSize;
565  for( int j=0; j<segLen; j++ ){
566  T *tmp = currentProfile + j*nSeg;
567  for( int k=0; k<nSeg; k++ )
568  if( j + k*segLen + blockStart < ls1 )
569  tmp[k] = simi[ s1[j + k*segLen + blockStart] * MATRIX_DIM + i ];
570  else
571  tmp[k] = 0;
572  }
573  }
574 }
575 
582 template< class T, class V >
583 static void TcreateProfile(void){
585 }
586 
594 template< class T, class V >
595 static double TdynProgLocal(void){
596  T zero, goal;
597  /* A vectorized template version */
598  if (IsInteger<T>::value){
599  // adjust the zero and goal values...
600  zero = MinValue<T>::value;
601  zero-= mn;
602  goal = MaxValue<T>::value;
603  goal-= mx;
604  } else {
605  zero = 0.0;
606  goal = MaxValue<T>::value;
607  }
608 
609  /* Set the stored max and del score to zero */
610  /********************************************/
611  for( int i=0; i<ls2; i++ )
612  ((T*)maxS)[i] = ((T*)delS)[i] = (T)zero;
613 
614  T maxScore=zero;
615  blockStart = 0;
616  do {
617  const int currentBlockSize = ALIGN16(min(ls1-blockStart,blockSize));
618  /* initialize the profile for the current iteration */
619  if(blockStart != 0) { /* when blockStart==0 then the profile has been initialized already */
620  if(remote_profile == 0) {
621 #ifdef DEBUG_FETCH
622  printf(">>>> creating profile\n");
623 #endif
624  doCreateProfile<T,V>( blockStart, currentBlockSize, (T*)simi, (V*)profile);
625  } else {
626 #ifdef DEBUG_FETCH
627  printf(">>>> fetching profile (%lu bytes)\n", currentBlockSize * MATRIX_DIM * sizeof(T));
628 #endif
629  for( int64_t bs=0; bs<currentBlockSize * MATRIX_DIM * sizeof(T); bs+=MAX_TRANSFER ) {
630  mfc_get( ((char*)profile)+bs, remote_profile+blockStart*MATRIX_DIM*sizeof(T)+bs, ALIGN16(min(currentBlockSize*MATRIX_DIM*sizeof(T)-bs, (int64_t)MAX_TRANSFER)), 0, 0, 0 );
631 
632  /* wait for DMA to finish */
633  mfc_write_tag_mask(1<<0);
634  mfc_read_tag_status_all();
635  }
636  }
637  }
638 
639 #ifdef UNROLL
640  T currentScore;
641  if (sizeof(T) < 2)
642  currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
643  else
644  currentScore = dynProgrLocalBlock2<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
645 #else
646  T currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
647 #endif
648  if( maxScore < currentScore)
649  maxScore = currentScore;
650 
651  if(maxScore >= goal)
652  return DBL_MAX;
653 
654  if(blockStart+blockSize >= ls1) break;
655 
657  } while(1);
658  /* Finally free all the memory we allocated */
659  /********************************************/
660  return (double)(maxScore-zero);
661 }
662 
667  TdynProgLocal<int8_t, vector int8_t>,
668  TdynProgLocal<int16_t, vector int16_t>,
669  TdynProgLocal<int32_t, vector int32_t>,
670  TdynProgLocal<float, vector float>,
671  TdynProgLocal<double, vector double>
672 };
673 
678  TcreateProfile<int8_t, vector int8_t>,
679  TcreateProfile<int16_t, vector int16_t>,
680  TcreateProfile<int32_t, vector int32_t>,
681  TcreateProfile<float, vector float>,
682  TcreateProfile<double, vector double>
683 };
684