38 #include <spu_intrinsics.h>
39 #include <sys/types.h>
/**
 * Returns the smaller of two values of the same type.
 * When neither argument compares less than the other, b is returned.
 */
template<typename T>
static inline T min(T a, T b)
{
	if (a < b)
		return a;
	return b;
}
/**
 * Returns the larger of two values of the same type.
 * When neither argument compares greater than the other, b is returned.
 */
template<typename T>
static inline T max(T a, T b)
{
	if (a > b)
		return a;
	return b;
}
/**
 * Largest value of the scalar type T, exposed as MaxValue<T>::value.
 *
 * The primary template builds the value by clearing the top bit of an
 * all-ones pattern that is sizeof(T)*8 bits wide, which is the maximum of a
 * signed two's-complement integer of that width. For an unsigned T the same
 * formula yields only half of the real maximum, so it should be used with
 * signed integer types only.
 */
template<typename T>
struct MaxValue {
	// The pattern is computed in long long and then narrowed to T on purpose.
	static const T value = static_cast<T>(~(1ll << (sizeof(T) * 8 - 1)));
};

/**
 * float specialisation: FLT_MAX.
 *
 * ISO C++ only allows an in-class initializer on a static const member of
 * integral or enumeration type, so the member is declared here and defined
 * once right after the class.
 */
template<>
struct MaxValue<float> {
	static const float value;
};
const float MaxValue<float>::value = FLT_MAX;

/** double specialisation: DBL_MAX (defined out of class, see MaxValue<float>). */
template<>
struct MaxValue<double> {
	static const double value;
};
const double MaxValue<double>::value = DBL_MAX;
/**
 * Smallest (most negative) value of the scalar type T, exposed as
 * MinValue<T>::value.
 *
 * The primary template sets only the top bit of a pattern that is
 * sizeof(T)*8 bits wide, which is the minimum of a signed two's-complement
 * integer of that width. It should be used with signed integer types only.
 */
template<typename T>
struct MinValue {
	// The pattern is computed in long long and then narrowed to T on purpose.
	static const T value = static_cast<T>(1ll << (sizeof(T) * 8 - 1));
};

/**
 * float specialisation: the most negative finite float.
 *
 * FLT_MIN is the smallest *positive* normalised float, so it is not the
 * counterpart of the integer minimum above; -FLT_MAX is.
 *
 * ISO C++ only allows an in-class initializer on a static const member of
 * integral or enumeration type, so the member is declared here and defined
 * once right after the class.
 */
template<>
struct MinValue<float> {
	static const float value;
};
const float MinValue<float>::value = -FLT_MAX;

/**
 * double specialisation: the most negative finite double (-DBL_MAX, not the
 * tiny positive DBL_MIN). Defined out of class, see MinValue<float>.
 */
template<>
struct MinValue<double> {
	static const double value;
};
const double MinValue<double>::value = -DBL_MAX;
// Lane-wise maximum of two SPU vectors of type V.
// spu_cmpgt(b,a) produces a mask that is set in the lanes where b > a;
// spu_sel then takes b in those lanes and a in all others.
68 template<
class V >
static inline V
spu_max( V a, V b ){
69 return spu_sel(a,b,spu_cmpgt(b,a));
// Lane-wise minimum of two SPU vectors of type V.
// spu_cmpgt(a,b) produces a mask that is set in the lanes where a > b;
// spu_sel then takes b in those lanes and a in all others.
74 template<
class V >
static inline V
spu_min( V a, V b ){
75 return spu_sel(a,b,spu_cmpgt(a,b));
// NOTE(review): the enclosing function header is not visible in this excerpt;
// presumably this is the body of dynProgrLocalBlock<T,V> — confirm.

// Replicate the scalar parameters into every lane of a vector.
115 const V vZero = spu_splats( zero );
116 const V vGoal = spu_splats( goal );
117 const V vDelFixed = spu_splats( (
T)
fixedDel );
118 const V vDelInc = spu_splats( (
T)
incDel );
// nSeg: number of T lanes in one vector V.
121 const int nSeg =
sizeof(V)/
sizeof(
T);
// segLen: number of vectors needed to hold currentBlockSize elements.
122 const int segLen = currentBlockSize/nSeg;
// Reset the three work arrays to vZero.
128 for(
int i=0;
LIKELY(i<segLen); i++)
129 loadOpt[i] = storeOpt[i] = rD[i] = vZero;
// Start vCD with delS[i] in lane 0 and vZero in every other lane.
138 V vCD = spu_insert( delS[i], vZero, 0);
// Take the last vector of storeOpt, shift it by one lane (sizeof(T) bytes)
// and put prevMax into lane 0.
143 V vStoreOpt = spu_rlmaskqwbyte(storeOpt[segLen-1], -
sizeof(
T));
144 vStoreOpt = spu_insert( prevMax, vStoreOpt, 0 );
// Profile row for the current character s2[i]; each row is segLen vectors.
148 const V * currentProfile = profile +
s2[i]*segLen;
// Diagnostic dump of the profile row: position ii*segLen+jj is read from
// scalar index ii+jj*nSeg, and only positions below ls1 are printed.
151 for(
int ii=0; ii<nSeg; ++ii) {
152 for(
int jj=0; jj<segLen; ++jj) {
153 if(ii*segLen+jj <
ls1)
154 printf(
"\t%d",(
int)((
T*)currentProfile)[ii+jj*nSeg]);
// Main pass over the segLen vectors of the column.
168 for(
int j=0;
LIKELY(j<segLen); j++ ){
// Add the profile score for this vector.
181 vStoreOpt += currentProfile[j];
// Cap the score at vGoal.
185 vStoreOpt =
spu_min( vStoreOpt, vGoal );
// Keep the lane-wise running maximum of the capped scores.
188 vMaxScore =
spu_max( vMaxScore, vStoreOpt );
// Take the better of the score and vTmp (defined outside this excerpt),
// never going below vZero.
192 vStoreOpt =
spu_max( vStoreOpt, vTmp );
193 vStoreOpt =
spu_max( vStoreOpt, vZero );
196 storeOpt[j] = vStoreOpt;
// Apply vDelFixed, floor at vZero, and fold the result into vCD.
199 vStoreOpt += vDelFixed;
202 vStoreOpt =
spu_max( vStoreOpt, vZero );
203 vCD =
spu_max( vStoreOpt, vCD );
// Fetch the value for the next iteration from loadOpt.
206 vStoreOpt = loadOpt[j];
// Walk over the individual T lanes of vMaxScore (loop body not visible here).
214 for(
T* tmp = (
T*)&vMaxScore; tmp<(
T*)(&vMaxScore+1); tmp++ )
// Save the last lane of vCD into delS[i].
223 delS[i] = spu_extract( vCD, nSeg-1 );
225 V vStoreOptx = storeOpt[0];
226 vStoreOptx =
spu_max(vStoreOptx + (vDelFixed - vDelInc),vZero);
// vCDx is vCD shifted by one lane with `zero` placed in lane 0.
227 V vCDx = spu_rlmaskqwbyte(vCD, -
sizeof(
T));
228 vCDx = spu_insert( zero, vCDx, 0 );
// Enter the correction loop only if vCDx exceeds vStoreOptx in at least one
// lane (spu_gather collapses the comparison mask into a scalar bit field).
230 if( spu_extract(spu_gather((vector
unsigned char)spu_cmpgt(vCDx,vStoreOptx)),0) != 0) {
// At most nSeg rounds; each round shifts vCD by one lane.
231 for(
int j=0;
LIKELY(j<nSeg); ++j) {
233 vCD = spu_rlmaskqwbyte(vCD, -
sizeof(
T));
234 vCD = spu_insert( zero, vCD, 0 );
// Vectors 0 .. segLen-2: merge vCD into the stored scores.
236 for(
int k=0; k<segLen-1; ++k) {
238 vStoreOpt = storeOpt[k];
239 vStoreOpt =
spu_max( vStoreOpt, vCD );
240 storeOpt[k] = vStoreOpt;
243 vCD =
spu_max( vCD + vDelInc, vZero );
244 vStoreOpt =
spu_max( vStoreOpt + vDelFixed, vZero );
// Tests whether no lane of vCD exceeds vStoreOpt any more (the statement
// controlled by this condition is not visible here).
247 if(
UNLIKELY(spu_extract(spu_gather((vector
unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0))
// Last vector (segLen-1) gets the same treatment outside the k loop.
254 vStoreOpt = storeOpt[segLen-1];
255 vStoreOpt =
spu_max( vStoreOpt, vCD );
256 storeOpt[segLen-1] = vStoreOpt;
259 vCD =
spu_max( vCD + vDelInc, vZero );
260 vStoreOpt =
spu_max( vStoreOpt + vDelFixed, vZero );
263 T temp = spu_extract( vCD, nSeg-1 );
// Leave the correction loop once no lane of vCD exceeds vStoreOpt.
267 if(
UNLIKELY(spu_extract(spu_gather((vector
unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0))
break;
// Save the last lane of the last stored vector into maxS[i].
278 maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
// Diagnostic dump of the stored scores relative to `zero`, limited to
// positions below ls1.
281 printf(
"%c\t",
s2[i]);
282 for(
int ii=0; ii<nSeg; ++ii) {
283 for(
int jj=0; jj<segLen; ++jj) {
284 if(ii*segLen+jj <
ls1)
285 printf(
"%d\t",(
int)(((
T*)storeOpt)[ii+jj*nSeg]-zero));
// Variant of the block scoring kernel that processes the column as two
// halves (index j and j+subSegLen) in the same loop iteration, keeping two
// independent sets of working vectors (suffix 1 and 2).
// T is the scalar score type, V the matching SPU vector type.
// NOTE(review): only part of the parameter list and body is visible here.
314 template<
class T,
class V >
static inline T dynProgrLocalBlock2(
315 int currentBlockSize,
// Replicate the scalar parameters into every lane of a vector.
332 const V vZero = spu_splats( zero );
333 const V vGoal = spu_splats( goal );
334 const V vDelFixed = spu_splats( (
T)
fixedDel );
335 const V vDelInc = spu_splats( (
T)
incDel );
// nSeg: number of T lanes in one vector V.
338 const int nSeg =
sizeof(V)/
sizeof(
T);
// segLen is the vector count rounded up to an even number so that it can be
// split into two halves of subSegLen vectors each.
339 const int segLen = (currentBlockSize/nSeg + 1) & ~1;
340 const int subSegLen = segLen / 2;
341 V vMaxScore1 = vZero,vMaxScore2 = vZero;
// Reset the three work arrays to vZero.
346 for(
int i=0;
LIKELY(i<segLen); i++)
347 loadOpt[i] = storeOpt[i] = rD[i] = vZero;
// Start vCD1 with delS[i] in lane 0 and vZero in every other lane.
356 V vCD1 = spu_insert( delS[i], vZero, 0);
// First half starts from the last vector of storeOpt shifted by one lane with
// prevMax in lane 0; second half starts from the last vector of the first half.
362 V vStoreOpt1 = spu_rlmaskqwbyte(storeOpt[segLen-1], -
sizeof(
T));
363 vStoreOpt1 = spu_insert( prevMax, vStoreOpt1, 0 );
364 V vStoreOpt2 = storeOpt[subSegLen-1];
// Profile row for the current character s2[i]; each row is segLen vectors.
367 const V * currentProfile = profile +
s2[i]*segLen;
// Diagnostic dump of the profile row, limited to positions below ls1.
370 for(
int ii=0; ii<nSeg; ++ii) {
371 for(
int jj=0; jj<segLen; ++jj) {
372 if(ii*segLen+jj <
ls1)
373 printf(
"\t%d",(
int)((
T*)currentProfile)[ii+jj*nSeg]);
// Main pass: iteration j handles vector j and vector j+subSegLen.
387 for(
int j=0;
LIKELY(j<subSegLen); j++ ){
390 V vRD2 = rD[j+subSegLen];
391 V vTmp1 = loadOpt[j];
392 V vTmp2 = loadOpt[j+subSegLen];
404 rD[j+subSegLen] = vRD2;
// Add the profile scores.
407 vStoreOpt1 += currentProfile[j];
408 vStoreOpt2 += currentProfile[j+subSegLen];
// Cap the scores at vGoal.
412 vStoreOpt1 =
spu_min( vStoreOpt1, vGoal );
413 vStoreOpt2 =
spu_min( vStoreOpt2, vGoal );
// Keep the lane-wise running maxima of the capped scores.
416 vMaxScore1 =
spu_max( vMaxScore1, vStoreOpt1 );
417 vMaxScore2 =
spu_max( vMaxScore2, vStoreOpt2 );
// Take the better of the score and the value loaded from loadOpt, never
// going below vZero.
423 vStoreOpt1 =
spu_max( vStoreOpt1, vTmp1 );
424 vStoreOpt2 =
spu_max( vStoreOpt2, vTmp2 );
425 vStoreOpt1 =
spu_max( vStoreOpt1, vZero );
426 vStoreOpt2 =
spu_max( vStoreOpt2, vZero );
429 storeOpt[j ] = vStoreOpt1;
430 storeOpt[j+subSegLen] = vStoreOpt2;
// Apply vDelFixed, floor at vZero, and fold the results into vCD1/vCD2.
433 vStoreOpt1 += vDelFixed;
434 vStoreOpt2 += vDelFixed;
438 vStoreOpt1 =
spu_max( vStoreOpt1, vZero );
439 vStoreOpt2 =
spu_max( vStoreOpt2, vZero );
441 vCD1 =
spu_max( vStoreOpt1, vCD1 );
442 vCD2 =
spu_max( vStoreOpt2, vCD2 );
// Fetch the values for the next iteration from loadOpt.
445 vStoreOpt1 = loadOpt[j];
446 vStoreOpt2 = loadOpt[j+subSegLen];
// Combine the two running maxima, then walk over the individual T lanes
// (loop body not visible here).
454 V vMaxScore =
spu_max( vMaxScore1, vMaxScore2 );
455 for(
T* tmp = (
T*)&vMaxScore; tmp<(
T*)(&vMaxScore+1); tmp++ )
// Save the last lane of vCD2 into delS[i].
464 delS[i] = spu_extract( vCD2, nSeg-1 );
466 V vStoreOptx1 = storeOpt[0 ];
467 V vStoreOptx2 = storeOpt[subSegLen];
// NOTE(review): the values just loaded into vStoreOptx1/vStoreOptx2 are
// overwritten below by expressions built from vStoreOpt1/vStoreOpt2 (last
// visibly assigned from loadOpt[] in the main loop), not from the loaded
// values. The single-stream code earlier in this file uses vStoreOptx on
// both sides of the corresponding statement — confirm whether
// vStoreOptx1/vStoreOptx2 were intended on the right-hand side here.
468 vStoreOptx1 =
spu_max(vStoreOpt1 + (vDelFixed - vDelInc),vZero);
469 vStoreOptx2 =
spu_max(vStoreOpt2 + (vDelFixed - vDelInc),vZero);
// vCDx1 is vCD2 shifted by one lane with `zero` placed in lane 0.
471 V vCDx1 = spu_rlmaskqwbyte(vCD2, -
sizeof(
T));
472 vCDx1 = spu_insert( zero, vCDx1, 0 );
// Enter the correction loop if either comparison is true in at least one
// lane. vCDx2 is defined outside the visible lines.
473 if( spu_extract(spu_gather(spu_or((vector
unsigned char)spu_cmpgt(vCDx1,vStoreOptx1),(vector
unsigned char)spu_cmpgt(vCDx2,vStoreOptx2))),0) != 0) {
// At most nSeg+1 rounds (the single-stream code earlier in this file uses nSeg).
474 for(
int j=0;
LIKELY(j<nSeg+1); ++j) {
// vCD1 becomes vRotate (defined outside the visible lines) shifted by one
// lane with `zero` in lane 0.
478 vCD1 = spu_rlmaskqwbyte(vRotate, -
sizeof(
T));
479 vCD1 = spu_insert( zero, vCD1, 0 );
// All but the last vector of each half: merge vCD1/vCD2 into the stored scores.
481 for(
int k=0; k<subSegLen-1; ++k) {
483 vStoreOpt1 = storeOpt[k ];
484 vStoreOpt2 = storeOpt[k + subSegLen];
485 vStoreOpt1 =
spu_max( vStoreOpt1, vCD1 );
486 vStoreOpt2 =
spu_max( vStoreOpt2, vCD2 );
487 storeOpt[k ] = vStoreOpt1;
488 storeOpt[k + subSegLen] = vStoreOpt2;
491 vStoreOpt1 =
spu_max( vStoreOpt1 + vDelFixed, vZero );
492 vStoreOpt2 =
spu_max( vStoreOpt2 + vDelFixed, vZero );
493 vCD1 =
spu_max( vCD1 + vDelInc, vZero );
494 vCD2 =
spu_max( vCD2 + vDelInc, vZero );
// Tests whether neither vCD exceeds its vStoreOpt in any lane (the statement
// controlled by this condition is not visible here).
497 if(
UNLIKELY(spu_extract(spu_gather(spu_or((vector
unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector
unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0))
// Last vector of each half gets the same treatment outside the k loop.
504 vStoreOpt1 = storeOpt[subSegLen - 1];
505 vStoreOpt2 = storeOpt[segLen - 1];
506 vStoreOpt1 =
spu_max( vStoreOpt1, vCD1 );
507 vStoreOpt2 =
spu_max( vStoreOpt2, vCD2 );
508 storeOpt[subSegLen - 1] = vStoreOpt1;
509 storeOpt[segLen - 1] = vStoreOpt2;
512 vStoreOpt1 =
spu_max( vStoreOpt1 + vDelFixed, vZero );
513 vStoreOpt2 =
spu_max( vStoreOpt2 + vDelFixed, vZero );
514 vCD1 =
spu_max( vCD1 + vDelInc, vZero );
515 vCD2 =
spu_max( vCD2 + vDelInc, vZero );
518 T temp = spu_extract( vCD2, nSeg-1 );
// Leave the correction loop once neither vCD exceeds its vStoreOpt in any lane.
522 if(
UNLIKELY(spu_extract(spu_gather(spu_or((vector
unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector
unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0))
break;
// Save the last lane of the last stored vector into maxS[i].
533 maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
// Diagnostic dump of the stored scores relative to `zero`, limited to
// positions below ls1.
536 printf(
"%c\t",
s2[i]);
537 for(
int ii=0; ii<nSeg; ++ii) {
538 for(
int jj=0; jj<segLen; ++jj) {
539 if(ii*segLen+jj <
ls1)
540 printf(
"%d\t",(
int)(((
T*)storeOpt)[ii+jj*nSeg]-zero));
// NOTE(review): the function signature is not visible in this excerpt;
// presumably this is TcreateProfile<T,V> — confirm.
// The visible code fills currentBlock with similarity scores taken from simi.
558 template<
class T,
class V >
// nSeg: number of T lanes in one vector V; segLen: vectors per profile row.
560 const int nSeg =
sizeof(V)/
sizeof(
T);
561 const int segLen = currentBlockSize/nSeg;
// Row i of the profile starts i*currentBlockSize scalars into currentBlock.
564 T *currentProfile = ((
T*)currentBlock)+i*currentBlockSize;
// Lane k of vector j receives the score for sequence position
// j + k*segLen + blockStart; positions at or beyond ls1 are left untouched.
565 for(
int j=0; j<segLen; j++ ){
566 T *tmp = currentProfile + j*nSeg;
567 for(
int k=0; k<nSeg; k++ )
568 if( j + k*segLen + blockStart <
ls1 )
// simi is indexed as [s1 character * MATRIX_DIM + i].
569 tmp[k] = simi[
s1[j + k*segLen +
blockStart] * MATRIX_DIM + i ];
582 template<
class T,
class V >
// NOTE(review): the function signature is not visible in this excerpt;
// presumably this is TdynProgLocal<T,V> — confirm.
// The visible code resets the per-column state, waits for a DMA transfer,
// runs a block scoring kernel and returns the best score relative to `zero`.
594 template<
class T,
class V >
// Initialise maxS and delS to `zero` for all ls2 entries.
611 for(
int i=0; i<
ls2; i++ )
612 ((
T*)maxS)[i] = ((
T*)delS)[i] = (
T)zero;
622 printf(
">>>> creating profile\n");
627 printf(
">>>> fetching profile (%lu bytes)\n", currentBlockSize *
MATRIX_DIM *
sizeof(
T));
// Select DMA tag group 0 and block until every transfer in it has completed.
633 mfc_write_tag_mask(1<<0);
634 mfc_read_tag_status_all();
// NOTE(review): three alternative kernel invocations follow; the code that
// chooses between them (presumably preprocessor conditionals) is not visible.
642 currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (
T*)maxS, (
T*)
delS, (V*)profile, (V*)
loadOpt, (V*)storeOpt, (V*)
rD );
644 currentScore = dynProgrLocalBlock2<T,V> ( currentBlockSize, zero, goal, (
T*)maxS, (
T*)
delS, (V*)profile, (V*)
loadOpt, (V*)storeOpt, (V*)
rD );
646 T currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (
T*)maxS, (
T*)
delS, (V*)profile, (V*)
loadOpt, (V*)storeOpt, (V*)
rD );
// Keep the best score seen so far.
648 if( maxScore < currentScore)
649 maxScore = currentScore;
// Report the score as an offset from `zero`.
660 return (
double)(maxScore-zero);
// Instantiations of TdynProgLocal for each supported scalar/vector pair.
// NOTE(review): the enclosing declaration is not visible; presumably these
// are the entries of a function table indexed by data type — confirm.
667 TdynProgLocal<int8_t, vector int8_t>,
668 TdynProgLocal<int16_t, vector int16_t>,
669 TdynProgLocal<int32_t, vector int32_t>,
670 TdynProgLocal<float, vector float>,
671 TdynProgLocal<double, vector double>
// Matching instantiations of TcreateProfile, listed in the same type order.
678 TcreateProfile<int8_t, vector int8_t>,
679 TcreateProfile<int16_t, vector int16_t>,
680 TcreateProfile<int32_t, vector int32_t>,
681 TcreateProfile<float, vector float>,
682 TcreateProfile<double, vector double>