00001
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "DynProgr_SPE_functions.h"
00031 #include "DynProgr_SPE.h"
00032 #include "matrix.h"
00033 #include <cstdlib>
00034 #include <malloc.h>
00035 #include <float.h>
00036 #include <cstdio>
00037 #include <string.h>
00038 #include <spu_intrinsics.h>
00039 #include <sys/types.h>
00040
00041 template<typename T> static inline T min( T a, T b ){ return a<b?a:b; }
00042 template<typename T> static inline T max( T a, T b ){ return a>b?a:b; }
00043
00044 template<typename T> struct IsInteger { static const bool value = false; };
00045 template<> struct IsInteger<int8_t> { static const bool value = true; };
00046 template<> struct IsInteger<int16_t> { static const bool value = true; };
00047 template<> struct IsInteger<int32_t> { static const bool value = true; };
00048
00049 template<typename T> struct MaxValue { static const T value = -1 ^ (1ll<<(sizeof(T)*8-1)); };
00050 template<> struct MaxValue<float> { static const float value = FLT_MAX; };
00051 template<> struct MaxValue<double> { static const double value = DBL_MAX; };
00052
00053 template<typename T> struct MinValue { static const T value = 1ll<<(sizeof(T)*8-1); };
00054 template<> struct MinValue<float> { static const float value = FLT_MIN; };
00055 template<> struct MinValue<double> { static const double value = DBL_MIN; };
00056
00057 char * s1, * s2;
00058 int ls1, ls2;
00059 int blockStart, blockSize;
00060 int maxDbLen;
00061 double mn, mx, fixedDel, incDel;
00062 void *simi, *profile, *loadOpt, *storeOpt, *rD, *maxS, *delS;
00063 ppu_addr_t remote_profile;
00064
00068 template< class V > static inline V spu_max( V a, V b ){
00069 return spu_sel(a,b,spu_cmpgt(b,a));
00070 }
00074 template< class V > static inline V spu_min( V a, V b ){
00075 return spu_sel(a,b,spu_cmpgt(a,b));
00076 }
00077
00078 #undef SHORTCUT
00079
00097 template< class T, class V > static inline T dynProgrLocalBlock(
00098 int currentBlockSize,
00099 T zero, T goal,
00100 T * maxS, T* delS,
00101 const V * profile,
00102 V * loadOpt,
00103 V * storeOpt,
00104 V * rD){
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115 const V vZero = spu_splats( zero );
00116 const V vGoal = spu_splats( goal );
00117 const V vDelFixed = spu_splats( (T)fixedDel );
00118 const V vDelInc = spu_splats( (T)incDel );
00119
00120 T maxScore = zero;
00121 const int nSeg = sizeof(V)/sizeof(T);
00122 const int segLen = currentBlockSize/nSeg;
00123
00124 V vMaxScore = vZero;
00125 T prevMax = zero;
00126
00127
00128 for(int i=0; LIKELY(i<segLen); i++)
00129 loadOpt[i] = storeOpt[i] = rD[i] = vZero;
00130
00131
00132
00133 for( int i=0; LIKELY(i<ls2); i++ ){
00134
00135
00136
00137
00138 V vCD = spu_insert( delS[i], vZero, 0);
00139
00140
00141
00142
00143 V vStoreOpt = spu_rlmaskqwbyte(storeOpt[segLen-1], -sizeof(T));
00144 vStoreOpt = spu_insert( prevMax, vStoreOpt, 0 );
00145
00146
00147
00148 const V * currentProfile = profile + s2[i]*segLen;
00149
00150 #if 0
00151 for(int ii=0; ii<nSeg; ++ii) {
00152 for(int jj=0; jj<segLen; ++jj) {
00153 if(ii*segLen+jj < ls1)
00154 printf("\t%d",(int)((T*)currentProfile)[ii+jj*nSeg]);
00155 }
00156 }
00157 printf("\n");
00158 #endif
00159
00160
00161
00162 V * swap = storeOpt;
00163 storeOpt = loadOpt;
00164 loadOpt = swap;
00165
00166
00167
00168 for( int j=0; LIKELY(j<segLen); j++ ){
00169
00170 V vRD = rD[j];
00171 V vTmp = loadOpt[j];
00172 vRD += vDelInc;
00173 vTmp += vDelFixed;
00174 if(IsInteger<T>::value) {
00175 vRD = spu_max(vRD,vZero);
00176 }
00177 vRD = spu_max(vTmp,vRD);
00178 rD[j] = vRD;
00179
00180
00181 vStoreOpt += currentProfile[j];
00182
00183
00184 if (IsInteger<T>::value)
00185 vStoreOpt = spu_min( vStoreOpt, vGoal );
00186
00187
00188 vMaxScore = spu_max( vMaxScore, vStoreOpt );
00189
00190 vTmp = spu_max( vCD, vRD );
00191
00192 vStoreOpt = spu_max( vStoreOpt, vTmp );
00193 vStoreOpt = spu_max( vStoreOpt, vZero );
00194
00195
00196 storeOpt[j] = vStoreOpt;
00197
00198
00199 vStoreOpt += vDelFixed;
00200 vCD += vDelInc;
00201 if(IsInteger<T>::value)
00202 vStoreOpt = spu_max( vStoreOpt, vZero );
00203 vCD = spu_max( vStoreOpt, vCD );
00204
00205
00206 vStoreOpt = loadOpt[j];
00207 }
00208
00209
00210
00211
00212
00213
00214 for( T* tmp = (T*)&vMaxScore; tmp<(T*)(&vMaxScore+1); tmp++ )
00215 if (UNLIKELY(maxScore < *tmp))
00216 maxScore = *tmp;
00217
00218 if ( UNLIKELY(maxScore >= goal) )
00219 return MaxValue<T>::value;
00220
00221
00222
00223 delS[i] = spu_extract( vCD, nSeg-1 );
00224
00225 V vStoreOptx = storeOpt[0];
00226 vStoreOptx = spu_max(vStoreOptx + (vDelFixed - vDelInc),vZero);
00227 V vCDx = spu_rlmaskqwbyte(vCD, -sizeof(T));
00228 vCDx = spu_insert( zero, vCDx, 0 );
00229
00230 if( spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCDx,vStoreOptx)),0) != 0) {
00231 for(int j=0; LIKELY(j<nSeg); ++j) {
00232
00233 vCD = spu_rlmaskqwbyte(vCD, -sizeof(T));
00234 vCD = spu_insert( zero, vCD, 0 );
00235
00236 for(int k=0; k<segLen-1; ++k) {
00237
00238 vStoreOpt = storeOpt[k];
00239 vStoreOpt = spu_max( vStoreOpt, vCD );
00240 storeOpt[k] = vStoreOpt;
00241
00242
00243 vCD = spu_max( vCD + vDelInc, vZero );
00244 vStoreOpt = spu_max( vStoreOpt + vDelFixed, vZero );
00245
00246 #ifdef SHORTCUT
00247 if(UNLIKELY(spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0))
00248 goto shortcut;
00249 #endif
00250
00251 }
00252
00253
00254 vStoreOpt = storeOpt[segLen-1];
00255 vStoreOpt = spu_max( vStoreOpt, vCD );
00256 storeOpt[segLen-1] = vStoreOpt;
00257
00258
00259 vCD = spu_max( vCD + vDelInc, vZero );
00260 vStoreOpt = spu_max( vStoreOpt + vDelFixed, vZero );
00261
00262
00263 T temp = spu_extract( vCD, nSeg-1 );
00264 if ( UNLIKELY(delS[i] < temp) )
00265 delS[i] = temp;
00266
00267 if(UNLIKELY(spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0)) break;
00268 }
00269 #ifdef SHORTCUT
00270 shortcut:
00271 (void)1;
00272 #endif
00273 }
00274
00275
00276
00277 prevMax = maxS[i];
00278 maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
00279
00280 #ifdef DEBUG
00281 printf("%c\t",s2[i]);
00282 for(int ii=0; ii<nSeg; ++ii) {
00283 for(int jj=0; jj<segLen; ++jj) {
00284 if(ii*segLen+jj < ls1)
00285 printf("%d\t",(int)(((T*)storeOpt)[ii+jj*nSeg]-zero));
00286 }
00287 }
00288 printf("\n");
00289 #endif
00290 }
00291 return maxScore;
00292 }
00293
00294 #ifdef UNROLL
00295
00314 template< class T, class V > static inline T dynProgrLocalBlock2(
00315 int currentBlockSize,
00316 T zero, T goal,
00317 T * maxS, T* delS,
00318 const V * profile,
00319 V * loadOpt,
00320 V * storeOpt,
00321 V * rD){
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332 const V vZero = spu_splats( zero );
00333 const V vGoal = spu_splats( goal );
00334 const V vDelFixed = spu_splats( (T)fixedDel );
00335 const V vDelInc = spu_splats( (T)incDel );
00336
00337 T maxScore = zero;
00338 const int nSeg = sizeof(V)/sizeof(T);
00339 const int segLen = (currentBlockSize/nSeg + 1) & ~1;
00340 const int subSegLen = segLen / 2;
00341 V vMaxScore1 = vZero,vMaxScore2 = vZero;
00342 T prevMax = zero;
00343
00344
00345
00346 for(int i=0; LIKELY(i<segLen); i++)
00347 loadOpt[i] = storeOpt[i] = rD[i] = vZero;
00348
00349
00350
00351 for( int i=0; LIKELY(i<ls2); i++ ){
00352
00353
00354
00355
00356 V vCD1 = spu_insert( delS[i], vZero, 0);
00357 V vCD2 = vZero;
00358
00359
00360
00361
00362 V vStoreOpt1 = spu_rlmaskqwbyte(storeOpt[segLen-1], -sizeof(T));
00363 vStoreOpt1 = spu_insert( prevMax, vStoreOpt1, 0 );
00364 V vStoreOpt2 = storeOpt[subSegLen-1];
00365
00366
00367 const V * currentProfile = profile + s2[i]*segLen;
00368
00369 #if 0
00370 for(int ii=0; ii<nSeg; ++ii) {
00371 for(int jj=0; jj<segLen; ++jj) {
00372 if(ii*segLen+jj < ls1)
00373 printf("\t%d",(int)((T*)currentProfile)[ii+jj*nSeg]);
00374 }
00375 }
00376 printf("\n");
00377 #endif
00378
00379
00380
00381 V * swap = storeOpt;
00382 storeOpt = loadOpt;
00383 loadOpt = swap;
00384
00385
00386
00387 for( int j=0; LIKELY(j<subSegLen); j++ ){
00388
00389 V vRD1 = rD[j];
00390 V vRD2 = rD[j+subSegLen];
00391 V vTmp1 = loadOpt[j];
00392 V vTmp2 = loadOpt[j+subSegLen];
00393 vRD1 += vDelInc;
00394 vRD2 += vDelInc;
00395 vTmp1 += vDelFixed;
00396 vTmp2 += vDelFixed;
00397 if(IsInteger<T>::value) {
00398 vRD1 = spu_max(vRD1,vZero);
00399 vRD2 = spu_max(vRD2,vZero);
00400 }
00401 vRD1 = spu_max(vTmp1,vRD1);
00402 vRD2 = spu_max(vTmp2,vRD2);
00403 rD[j] = vRD1;
00404 rD[j+subSegLen] = vRD2;
00405
00406
00407 vStoreOpt1 += currentProfile[j];
00408 vStoreOpt2 += currentProfile[j+subSegLen];
00409
00410
00411 if (IsInteger<T>::value){
00412 vStoreOpt1 = spu_min( vStoreOpt1, vGoal );
00413 vStoreOpt2 = spu_min( vStoreOpt2, vGoal );
00414 }
00415
00416 vMaxScore1 = spu_max( vMaxScore1, vStoreOpt1 );
00417 vMaxScore2 = spu_max( vMaxScore2, vStoreOpt2 );
00418
00419
00420 vTmp1 = spu_max( vCD1, vRD1 );
00421 vTmp2 = spu_max( vCD2, vRD2 );
00422
00423 vStoreOpt1 = spu_max( vStoreOpt1, vTmp1 );
00424 vStoreOpt2 = spu_max( vStoreOpt2, vTmp2 );
00425 vStoreOpt1 = spu_max( vStoreOpt1, vZero );
00426 vStoreOpt2 = spu_max( vStoreOpt2, vZero );
00427
00428
00429 storeOpt[j ] = vStoreOpt1;
00430 storeOpt[j+subSegLen] = vStoreOpt2;
00431
00432
00433 vStoreOpt1 += vDelFixed;
00434 vStoreOpt2 += vDelFixed;
00435 vCD1 += vDelInc;
00436 vCD2 += vDelInc;
00437 if(IsInteger<T>::value) {
00438 vStoreOpt1 = spu_max( vStoreOpt1, vZero );
00439 vStoreOpt2 = spu_max( vStoreOpt2, vZero );
00440 }
00441 vCD1 = spu_max( vStoreOpt1, vCD1 );
00442 vCD2 = spu_max( vStoreOpt2, vCD2 );
00443
00444
00445 vStoreOpt1 = loadOpt[j];
00446 vStoreOpt2 = loadOpt[j+subSegLen];
00447 }
00448
00449
00450
00451
00452
00453
00454 V vMaxScore = spu_max( vMaxScore1, vMaxScore2 );
00455 for( T* tmp = (T*)&vMaxScore; tmp<(T*)(&vMaxScore+1); tmp++ )
00456 if (UNLIKELY(maxScore < *tmp))
00457 maxScore = *tmp;
00458
00459 if ( UNLIKELY(maxScore >= goal) )
00460 return MaxValue<T>::value;
00461
00462
00463
00464 delS[i] = spu_extract( vCD2, nSeg-1 );
00465
00466 V vStoreOptx1 = storeOpt[0 ];
00467 V vStoreOptx2 = storeOpt[subSegLen];
00468 vStoreOptx1 = spu_max(vStoreOpt1 + (vDelFixed - vDelInc),vZero);
00469 vStoreOptx2 = spu_max(vStoreOpt2 + (vDelFixed - vDelInc),vZero);
00470 V vCDx2 = vCD1;
00471 V vCDx1 = spu_rlmaskqwbyte(vCD2, -sizeof(T));
00472 vCDx1 = spu_insert( zero, vCDx1, 0 );
00473 if( spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCDx1,vStoreOptx1),(vector unsigned char)spu_cmpgt(vCDx2,vStoreOptx2))),0) != 0) {
00474 for(int j=0; LIKELY(j<nSeg+1); ++j) {
00475
00476 V vRotate = vCD2;
00477 vCD2 = vCD1;
00478 vCD1 = spu_rlmaskqwbyte(vRotate, -sizeof(T));
00479 vCD1 = spu_insert( zero, vCD1, 0 );
00480
00481 for(int k=0; k<subSegLen-1; ++k) {
00482
00483 vStoreOpt1 = storeOpt[k ];
00484 vStoreOpt2 = storeOpt[k + subSegLen];
00485 vStoreOpt1 = spu_max( vStoreOpt1, vCD1 );
00486 vStoreOpt2 = spu_max( vStoreOpt2, vCD2 );
00487 storeOpt[k ] = vStoreOpt1;
00488 storeOpt[k + subSegLen] = vStoreOpt2;
00489
00490
00491 vStoreOpt1 = spu_max( vStoreOpt1 + vDelFixed, vZero );
00492 vStoreOpt2 = spu_max( vStoreOpt2 + vDelFixed, vZero );
00493 vCD1 = spu_max( vCD1 + vDelInc, vZero );
00494 vCD2 = spu_max( vCD2 + vDelInc, vZero );
00495
00496 #ifdef SHORTCUT
00497 if(UNLIKELY(spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0))
00498 goto shortcut;
00499 #endif
00500
00501 }
00502
00503
00504 vStoreOpt1 = storeOpt[subSegLen - 1];
00505 vStoreOpt2 = storeOpt[segLen - 1];
00506 vStoreOpt1 = spu_max( vStoreOpt1, vCD1 );
00507 vStoreOpt2 = spu_max( vStoreOpt2, vCD2 );
00508 storeOpt[subSegLen - 1] = vStoreOpt1;
00509 storeOpt[segLen - 1] = vStoreOpt2;
00510
00511
00512 vStoreOpt1 = spu_max( vStoreOpt1 + vDelFixed, vZero );
00513 vStoreOpt2 = spu_max( vStoreOpt2 + vDelFixed, vZero );
00514 vCD1 = spu_max( vCD1 + vDelInc, vZero );
00515 vCD2 = spu_max( vCD2 + vDelInc, vZero );
00516
00517
00518 T temp = spu_extract( vCD2, nSeg-1 );
00519 if ( UNLIKELY(delS[i] < temp) )
00520 delS[i] = temp;
00521
00522 if(UNLIKELY(spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0)) break;
00523 }
00524 #ifdef SHORTCUT
00525 shortcut:
00526 (void)1;
00527 #endif
00528 }
00529
00530
00531
00532 prevMax = maxS[i];
00533 maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
00534
00535 #ifdef DEBUG
00536 printf("%c\t",s2[i]);
00537 for(int ii=0; ii<nSeg; ++ii) {
00538 for(int jj=0; jj<segLen; ++jj) {
00539 if(ii*segLen+jj < ls1)
00540 printf("%d\t",(int)(((T*)storeOpt)[ii+jj*nSeg]-zero));
00541 }
00542 }
00543 printf("\n");
00544 #endif
00545 }
00546 return maxScore;
00547 }
00548 #endif
00549
00558 template< class T, class V >
00559 static void doCreateProfile( int blockStart, int currentBlockSize, const T* simi, V* currentBlock){
00560 const int nSeg = sizeof(V)/sizeof(T);
00561 const int segLen = currentBlockSize/nSeg;
00562
00563 for( int i=0; i<MATRIX_DIM; i++ ){
00564 T *currentProfile = ((T*)currentBlock)+i*currentBlockSize;
00565 for( int j=0; j<segLen; j++ ){
00566 T *tmp = currentProfile + j*nSeg;
00567 for( int k=0; k<nSeg; k++ )
00568 if( j + k*segLen + blockStart < ls1 )
00569 tmp[k] = simi[ s1[j + k*segLen + blockStart] * MATRIX_DIM + i ];
00570 else
00571 tmp[k] = 0;
00572 }
00573 }
00574 }
00575
00582 template< class T, class V >
00583 static void TcreateProfile(void){
00584 doCreateProfile( blockStart, ALIGN16(min(ls1-blockStart, blockSize)), (T*)simi, (V*)profile);
00585 }
00586
00594 template< class T, class V >
00595 static double TdynProgLocal(void){
00596 T zero, goal;
00597
00598 if (IsInteger<T>::value){
00599
00600 zero = MinValue<T>::value;
00601 zero-= mn;
00602 goal = MaxValue<T>::value;
00603 goal-= mx;
00604 } else {
00605 zero = 0.0;
00606 goal = MaxValue<T>::value;
00607 }
00608
00609
00610
00611 for( int i=0; i<ls2; i++ )
00612 ((T*)maxS)[i] = ((T*)delS)[i] = (T)zero;
00613
00614 T maxScore=zero;
00615 blockStart = 0;
00616 do {
00617 const int currentBlockSize = ALIGN16(min(ls1-blockStart,blockSize));
00618
00619 if(blockStart != 0) {
00620 if(remote_profile == 0) {
00621 #ifdef DEBUG_FETCH
00622 printf(">>>> creating profile\n");
00623 #endif
00624 doCreateProfile<T,V>( blockStart, currentBlockSize, (T*)simi, (V*)profile);
00625 } else {
00626 #ifdef DEBUG_FETCH
00627 printf(">>>> fetching profile (%lu bytes)\n", currentBlockSize * MATRIX_DIM * sizeof(T));
00628 #endif
00629 for( int64_t bs=0; bs<currentBlockSize * MATRIX_DIM * sizeof(T); bs+=MAX_TRANSFER ) {
00630 mfc_get( ((char*)profile)+bs, remote_profile+blockStart*MATRIX_DIM*sizeof(T)+bs, ALIGN16(min(currentBlockSize*MATRIX_DIM*sizeof(T)-bs, (int64_t)MAX_TRANSFER)), 0, 0, 0 );
00631
00632
00633 mfc_write_tag_mask(1<<0);
00634 mfc_read_tag_status_all();
00635 }
00636 }
00637 }
00638
00639 #ifdef UNROLL
00640 T currentScore;
00641 if (sizeof(T) < 2)
00642 currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
00643 else
00644 currentScore = dynProgrLocalBlock2<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
00645 #else
00646 T currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
00647 #endif
00648 if( maxScore < currentScore)
00649 maxScore = currentScore;
00650
00651 if(maxScore >= goal)
00652 return DBL_MAX;
00653
00654 if(blockStart+blockSize >= ls1) break;
00655
00656 blockStart += blockSize;
00657 } while(1);
00658
00659
00660 return (double)(maxScore-zero);
00661 }
00662
00666 dvf_t dynProgLocal[] = {
00667 TdynProgLocal<int8_t, vector int8_t>,
00668 TdynProgLocal<int16_t, vector int16_t>,
00669 TdynProgLocal<int32_t, vector int32_t>,
00670 TdynProgLocal<float, vector float>,
00671 TdynProgLocal<double, vector double>
00672 };
00673
00677 vvf_t createProfile[] = {
00678 TcreateProfile<int8_t, vector int8_t>,
00679 TcreateProfile<int16_t, vector int16_t>,
00680 TcreateProfile<int32_t, vector int32_t>,
00681 TcreateProfile<float, vector float>,
00682 TcreateProfile<double, vector double>
00683 };
00684