DynProgr_SPE_functions.cc

Go to the documentation of this file.
00001 
00005 /*
00006  * Copyright (c) 2007-2008 ETH Zürich, Institute of Computational Science
00007  *
00008  * Permission is hereby granted, free of charge, to any person
00009  * obtaining a copy of this software and associated documentation
00010  * files (the "Software"), to deal in the Software without
00011  * restriction, including without limitation the rights to use,
00012  * copy, modify, merge, publish, distribute, sublicense, and/or sell
00013  * copies of the Software, and to permit persons to whom the
00014  * Software is furnished to do so, subject to the following
00015  * conditions:
00016  *
00017  * The above copyright notice and this permission notice shall be
00018  * included in all copies or substantial portions of the Software.
00019  *
00020  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00021  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
00022  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00023  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
00024  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
00025  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
00026  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
00027  * OTHER DEALINGS IN THE SOFTWARE.
00028  */
00029 
00030 #include "DynProgr_SPE_functions.h"
00031 #include "DynProgr_SPE.h"
00032 #include "matrix.h"
00033 #include <cstdlib>
00034 #include <malloc.h>
00035 #include <float.h>
00036 #include <cstdio>
00037 #include <string.h>
00038 #include <spu_intrinsics.h>
00039 #include <sys/types.h>
00040 
00041 template<typename T> static inline T min( T a, T b ){ return a<b?a:b; }
00042 template<typename T> static inline T max( T a, T b ){ return a>b?a:b; }
00043 
00044 template<typename T> struct IsInteger   { static const bool value = false; };
00045 template<> struct IsInteger<int8_t>     { static const bool value = true; };
00046 template<> struct IsInteger<int16_t>    { static const bool value = true; };
00047 template<> struct IsInteger<int32_t>    { static const bool value = true; };
00048 
00049 template<typename T> struct MaxValue    { static const T value = -1 ^ (1ll<<(sizeof(T)*8-1)); };
00050 template<> struct MaxValue<float>       { static const float value = FLT_MAX; };
00051 template<> struct MaxValue<double>      { static const double value = DBL_MAX; };
00052 
00053 template<typename T> struct MinValue    { static const T value = 1ll<<(sizeof(T)*8-1); };
00054 template<> struct MinValue<float>       { static const float value = FLT_MIN; };
00055 template<> struct MinValue<double>      { static const double value = DBL_MIN; };
00056 
00057 char * s1, * s2;
00058 int ls1, ls2;
00059 int blockStart, blockSize;
00060 int maxDbLen;
00061 double mn, mx, fixedDel, incDel;
00062 void *simi, *profile, *loadOpt, *storeOpt, *rD, *maxS, *delS;
00063 ppu_addr_t remote_profile;
00064 
00068 template< class V > static inline V spu_max( V a, V b ){
00069         return spu_sel(a,b,spu_cmpgt(b,a));
00070 }
00074 template< class V > static inline V spu_min( V a, V b ){
00075         return spu_sel(a,b,spu_cmpgt(a,b));
00076 }
00077 
00078 #undef SHORTCUT
00079 
00097 template< class T, class V > static inline T dynProgrLocalBlock(
00098                                                                                 int currentBlockSize,
00099                                                                                 T zero, T goal,
00100                                                                                 T * maxS, T* delS,
00101                                                                                 const V * profile,
00102                                                                                 V * loadOpt,
00103                                                                                 V * storeOpt,
00104                                                                                 V * rD){
00105         /**********************************************************************
00106         * This version of the code implements the idea presented in
00107         *
00108         ***********************************************************************
00109         * Striped Smith-Waterman speeds database searches six times over other
00110         * SIMD implementations
00111         *
00112         * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
00113         **********************************************************************/
00114 
00115         const V vZero = spu_splats( zero );
00116         const V vGoal = spu_splats( goal );
00117         const V vDelFixed = spu_splats( (T)fixedDel );
00118         const V vDelInc   = spu_splats( (T)incDel   );
00119 
00120         T maxScore = zero;
00121         const int nSeg = sizeof(V)/sizeof(T);    // the number of segments
00122         const int segLen = currentBlockSize/nSeg; // the segment length
00123 
00124         V vMaxScore = vZero;                     // The maximum score
00125         T prevMax   = zero;
00126         /* Initialize the other arrays */
00127         /*******************************/
00128         for(int i=0; LIKELY(i<segLen); i++)
00129                 loadOpt[i] = storeOpt[i] = rD[i] = vZero;
00130 
00131         /* looping through all the columns */
00132         /***********************************/
00133         for( int i=0; LIKELY(i<ls2); i++ ){
00134 
00135                 /* compute the opt and cd score depending on the previous column */
00136                 /*******************************************************************/
00137                 // set the column deletion score to zero, has to be fixed later on
00138                 V vCD = spu_insert( delS[i], vZero, 0);
00139 
00140                 // set the opt score to the elements computed in the previous column
00141                 // set the low of storeOpt to MaxS[j]
00142                 // spu_rlmaskqwbyte is a complicated way to say right shift
00143                 V vStoreOpt = spu_rlmaskqwbyte(storeOpt[segLen-1], -sizeof(T));
00144                 vStoreOpt = spu_insert( prevMax, vStoreOpt, 0 );
00145 
00146                 /* compute the current profile, depending on the character in s2 */
00147                 /*****************************************************************/
00148                 const V * currentProfile = profile + s2[i]*segLen;
00149 
00150 #if 0
00151                 for(int ii=0; ii<nSeg; ++ii) {
00152                         for(int jj=0; jj<segLen; ++jj) {
00153                                 if(ii*segLen+jj < ls1)
00154                                         printf("\t%d",(int)((T*)currentProfile)[ii+jj*nSeg]);
00155                         }
00156                 }
00157                 printf("\n");
00158 #endif
00159 
00160                 /* swap the old optimal score with the new one */
00161                 /***********************************************/
00162                 V * swap = storeOpt;
00163                 storeOpt = loadOpt;
00164                 loadOpt  = swap;
00165 
00166                 /* main loop computing the max, precomputing etc. */
00167                 /**************************************************/
00168                 for( int j=0; LIKELY(j<segLen); j++ ){
00169                         // Load the the rd value
00170                         V vRD           = rD[j];
00171                         V vTmp          = loadOpt[j];
00172                         vRD            += vDelInc;
00173                         vTmp           += vDelFixed;
00174                         if(IsInteger<T>::value) {
00175                                 vRD     = spu_max(vRD,vZero);
00176                         }
00177                         vRD             = spu_max(vTmp,vRD);
00178                         rD[j]           = vRD;
00179 
00180                         // add the profile the prev. opt
00181                         vStoreOpt += currentProfile[j];
00182 
00183                         // To provide saturated arithmetics
00184                         if (IsInteger<T>::value)
00185                                 vStoreOpt = spu_min( vStoreOpt, vGoal );
00186 
00187                         // update the maxscore found so far
00188                         vMaxScore = spu_max( vMaxScore, vStoreOpt );
00189                         // precompute the maximum here
00190                         vTmp = spu_max( vCD, vRD );
00191                         // compute the correct opt score of the cell
00192                         vStoreOpt = spu_max( vStoreOpt, vTmp );
00193                         vStoreOpt = spu_max( vStoreOpt, vZero );
00194 
00195                         // store the opt score of the cell
00196                         storeOpt[j] = vStoreOpt;
00197 
00198                         // precompute rd and cd for next iteration
00199                         vStoreOpt += vDelFixed;
00200                         vCD       += vDelInc;
00201                         if(IsInteger<T>::value)
00202                                 vStoreOpt = spu_max( vStoreOpt, vZero );
00203                         vCD       = spu_max( vStoreOpt, vCD );
00204 
00205                         // load precomputed opt for next iteration
00206                         vStoreOpt = loadOpt[j];
00207                 }
00208 
00209                 /* set totcells */
00210                 /****************/
00211 //         totcells += ls1;
00212                 /* check for a changed MaxScore */
00213                 /********************************/
00214                 for( T* tmp = (T*)&vMaxScore; tmp<(T*)(&vMaxScore+1); tmp++ )
00215                         if (UNLIKELY(maxScore < *tmp))
00216                                 maxScore = *tmp;
00217                 // if the goal was reached, exit
00218                 if ( UNLIKELY(maxScore >= goal) )
00219                         return MaxValue<T>::value;
00220 
00221                 /* cleaning up with missed arrows */
00222                 /**********************************/
00223                 delS[i] = spu_extract( vCD, nSeg-1 );
00224 
00225                 V vStoreOptx = storeOpt[0];
00226                 vStoreOptx = spu_max(vStoreOptx + (vDelFixed - vDelInc),vZero);
00227                 V vCDx = spu_rlmaskqwbyte(vCD, -sizeof(T));
00228                 vCDx = spu_insert( zero, vCDx, 0 );
00229 
00230                 if( spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCDx,vStoreOptx)),0) != 0) {
00231                         for(int j=0; LIKELY(j<nSeg); ++j) {
00232                                 // set everything up for the next iteration
00233                                 vCD = spu_rlmaskqwbyte(vCD, -sizeof(T));
00234                                 vCD = spu_insert( zero, vCD, 0 );
00235 
00236                                 for(int k=0; k<segLen-1; ++k) {
00237                                         // compute the current optimal value of the cell
00238                                         vStoreOpt = storeOpt[k];
00239                                         vStoreOpt = spu_max( vStoreOpt, vCD );
00240                                         storeOpt[k] = vStoreOpt;
00241 
00242                                         // precompute the scores for the next cell
00243                                         vCD = spu_max( vCD + vDelInc, vZero );
00244                                         vStoreOpt = spu_max( vStoreOpt + vDelFixed, vZero );
00245 
00246                                         #ifdef SHORTCUT
00247                                         if(UNLIKELY(spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0))
00248                                                 goto shortcut;
00249                                         #endif
00250 
00251                                 }
00252 
00253                                 // compute the current optimal value of the cell
00254                                 vStoreOpt = storeOpt[segLen-1];
00255                                 vStoreOpt = spu_max( vStoreOpt, vCD );
00256                                 storeOpt[segLen-1] = vStoreOpt;
00257 
00258                                 // precompute the cd value for the next cell
00259                                 vCD = spu_max( vCD + vDelInc, vZero );
00260                                 vStoreOpt = spu_max( vStoreOpt + vDelFixed, vZero );
00261 
00262                                 // Update the del Score
00263                                 T temp = spu_extract( vCD, nSeg-1 );
00264                                 if ( UNLIKELY(delS[i] < temp) )
00265                                         delS[i] = temp;
00266 
00267                                 if(UNLIKELY(spu_extract(spu_gather((vector unsigned char)spu_cmpgt(vCD,vStoreOpt)),0) == 0)) break;
00268                         }
00269                         #ifdef SHORTCUT
00270                         shortcut:
00271                         (void)1;
00272                         #endif
00273                 }
00274 
00275                 /* store the new MaxScore for the next line block */
00276                 /**************************************************/
00277                 prevMax = maxS[i];
00278                 maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
00279 
00280 #ifdef DEBUG
00281                 printf("%c\t",s2[i]);
00282                 for(int ii=0; ii<nSeg; ++ii) {
00283                         for(int jj=0; jj<segLen; ++jj) {
00284                                 if(ii*segLen+jj < ls1)
00285                                         printf("%d\t",(int)(((T*)storeOpt)[ii+jj*nSeg]-zero));
00286                         }
00287                 }
00288                 printf("\n");
00289 #endif
00290         }
00291         return maxScore;
00292 }
00293 
00294 #ifdef UNROLL
00295 
00314 template< class T, class V > static inline T dynProgrLocalBlock2(
00315                                                                                 int currentBlockSize,
00316                                                                                 T zero, T goal,
00317                                                                                 T * maxS, T* delS,
00318                                                                                 const V * profile,
00319                                                                                 V * loadOpt,
00320                                                                                 V * storeOpt,
00321                                                                                 V * rD){
00322         /**********************************************************************
00323         * This version of the code implements the idea presented in
00324         *
00325         ***********************************************************************
00326         * Striped Smith-Waterman speeds database searches six times over other
00327         * SIMD implementations
00328         *
00329         * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
00330         **********************************************************************/
00331 
00332         const V vZero = spu_splats( zero );
00333         const V vGoal = spu_splats( goal );
00334         const V vDelFixed = spu_splats( (T)fixedDel );
00335         const V vDelInc   = spu_splats( (T)incDel   );
00336 
00337         T maxScore = zero;
00338         const int nSeg = sizeof(V)/sizeof(T);             // the number of segments
00339         const int segLen = (currentBlockSize/nSeg + 1) & ~1; // the segment length
00340         const int subSegLen = segLen / 2;                 // the sub segment length
00341         V vMaxScore1 = vZero,vMaxScore2 = vZero;  // The maximum score
00342         T prevMax   = zero;
00343 
00344         /* Initialize the other arrays */
00345         /*******************************/
00346         for(int i=0; LIKELY(i<segLen); i++)
00347                 loadOpt[i] = storeOpt[i] = rD[i] = vZero;
00348 
00349         /* looping through all the columns */
00350         /***********************************/
00351         for( int i=0; LIKELY(i<ls2); i++ ){
00352 
00353                 /* compute the opt and cd score depending on the previous column */
00354                 /*******************************************************************/
00355                 // set the column deletion score to zero, has to be fixed later on
00356                 V vCD1 = spu_insert( delS[i], vZero, 0);
00357                 V vCD2 = vZero;
00358 
00359                 // set the opt score to the elements computed in the previous column
00360                 // set the low of storeOpt to MaxS[j]
00361                 // spu_rlmaskqwbyte is a complicated way to say right shift
00362                 V vStoreOpt1 = spu_rlmaskqwbyte(storeOpt[segLen-1], -sizeof(T));
00363                 vStoreOpt1 = spu_insert( prevMax, vStoreOpt1, 0 );
00364                 V vStoreOpt2 = storeOpt[subSegLen-1];
00365                 /* compute the current profile, depending on the character in s2 */
00366                 /*****************************************************************/
00367                 const V * currentProfile = profile + s2[i]*segLen;
00368 
00369 #if 0
00370                 for(int ii=0; ii<nSeg; ++ii) {
00371                         for(int jj=0; jj<segLen; ++jj) {
00372                                 if(ii*segLen+jj < ls1)
00373                                         printf("\t%d",(int)((T*)currentProfile)[ii+jj*nSeg]);
00374                         }
00375                 }
00376                 printf("\n");
00377 #endif
00378 
00379                 /* swap the old optimal score with the new one */
00380                 /***********************************************/
00381                 V * swap     = storeOpt;
00382                 storeOpt = loadOpt;
00383                 loadOpt  = swap;
00384 
00385                 /* main loop computing the max, precomputing etc. */
00386                 /**************************************************/
00387                 for( int j=0; LIKELY(j<subSegLen); j++ ){
00388                         // lead the row deletion score
00389                         V vRD1           = rD[j];
00390                         V vRD2           = rD[j+subSegLen];
00391                         V vTmp1          = loadOpt[j];
00392                         V vTmp2          = loadOpt[j+subSegLen];
00393                         vRD1            += vDelInc;
00394                         vRD2            += vDelInc;
00395                         vTmp1           += vDelFixed;
00396                         vTmp2           += vDelFixed;
00397                         if(IsInteger<T>::value) {
00398                                 vRD1     = spu_max(vRD1,vZero);
00399                                 vRD2     = spu_max(vRD2,vZero);
00400                         }
00401                         vRD1             = spu_max(vTmp1,vRD1);
00402                         vRD2             = spu_max(vTmp2,vRD2);
00403                         rD[j]            = vRD1;
00404                         rD[j+subSegLen]  = vRD2;
00405 
00406                         // add the profile the prev. opt
00407                         vStoreOpt1 += currentProfile[j];
00408                         vStoreOpt2 += currentProfile[j+subSegLen];
00409 
00410                         // To avoid saturated arithmetics
00411                         if (IsInteger<T>::value){
00412                                 vStoreOpt1 = spu_min( vStoreOpt1, vGoal );
00413                                 vStoreOpt2 = spu_min( vStoreOpt2, vGoal );
00414                         }
00415                         // update the maxscore found so far
00416                         vMaxScore1 = spu_max( vMaxScore1, vStoreOpt1 );
00417                         vMaxScore2 = spu_max( vMaxScore2, vStoreOpt2 );
00418 
00419                         // precompute the maximum here (gives about 5% speedup)
00420                         vTmp1 = spu_max( vCD1, vRD1 );
00421                         vTmp2 = spu_max( vCD2, vRD2 );
00422                         // compute the correct opt score of the cell
00423                         vStoreOpt1 = spu_max( vStoreOpt1, vTmp1 );
00424                         vStoreOpt2 = spu_max( vStoreOpt2, vTmp2 );
00425                         vStoreOpt1 = spu_max( vStoreOpt1, vZero );
00426                         vStoreOpt2 = spu_max( vStoreOpt2, vZero );
00427 
00428                         // store the opt score of the cell
00429                         storeOpt[j          ] = vStoreOpt1;
00430                         storeOpt[j+subSegLen] = vStoreOpt2;
00431 
00432                         // precompute rd and cd for next iteration
00433                         vStoreOpt1 += vDelFixed;
00434                         vStoreOpt2 += vDelFixed;
00435                         vCD1       += vDelInc;
00436                         vCD2       += vDelInc;
00437                         if(IsInteger<T>::value) {
00438                                 vStoreOpt1 = spu_max( vStoreOpt1, vZero );
00439                                 vStoreOpt2 = spu_max( vStoreOpt2, vZero );
00440                         }
00441                         vCD1 = spu_max( vStoreOpt1, vCD1 );
00442                         vCD2 = spu_max( vStoreOpt2, vCD2 );
00443 
00444                         // load precomputed opt for next iteration
00445                         vStoreOpt1 = loadOpt[j];
00446                         vStoreOpt2 = loadOpt[j+subSegLen];
00447                 }
00448 
00449                 /* set totcells */
00450                 /****************/
00451 //         totcells += ls1;
00452                 /* check for a changed MaxScore */
00453                 /********************************/
00454                 V vMaxScore = spu_max( vMaxScore1, vMaxScore2 );
00455                 for( T* tmp = (T*)&vMaxScore; tmp<(T*)(&vMaxScore+1); tmp++ )
00456                         if (UNLIKELY(maxScore < *tmp))
00457                                 maxScore = *tmp;
00458                 // if the goal was reached, exit
00459                 if ( UNLIKELY(maxScore >= goal) )
00460                         return MaxValue<T>::value;
00461 
00462                 /* cleaning up with missed arrows */
00463                 /**********************************/
00464                 delS[i] = spu_extract( vCD2, nSeg-1 );
00465 
00466                 V vStoreOptx1 = storeOpt[0        ];
00467                 V vStoreOptx2 = storeOpt[subSegLen];
00468                 vStoreOptx1 = spu_max(vStoreOpt1 + (vDelFixed - vDelInc),vZero);
00469                 vStoreOptx2 = spu_max(vStoreOpt2 + (vDelFixed - vDelInc),vZero);
00470                 V vCDx2 = vCD1;
00471                 V vCDx1 = spu_rlmaskqwbyte(vCD2, -sizeof(T));
00472                 vCDx1 = spu_insert( zero, vCDx1, 0 );
00473                 if( spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCDx1,vStoreOptx1),(vector unsigned char)spu_cmpgt(vCDx2,vStoreOptx2))),0) != 0) {
00474                         for(int j=0; LIKELY(j<nSeg+1); ++j) {
00475                                 // set everything up for the next iteration
00476                                 V vRotate = vCD2;
00477                                 vCD2 = vCD1;
00478                                 vCD1 = spu_rlmaskqwbyte(vRotate, -sizeof(T));
00479                                 vCD1 = spu_insert( zero, vCD1, 0 );
00480 
00481                                 for(int k=0; k<subSegLen-1; ++k) {
00482                                         // compute the current optimal value of the cell
00483                                         vStoreOpt1 = storeOpt[k            ];
00484                                         vStoreOpt2 = storeOpt[k + subSegLen];
00485                                         vStoreOpt1 = spu_max( vStoreOpt1, vCD1 );
00486                                         vStoreOpt2 = spu_max( vStoreOpt2, vCD2 );
00487                                         storeOpt[k            ] = vStoreOpt1;
00488                                         storeOpt[k + subSegLen] = vStoreOpt2;
00489 
00490                                         // precompute the scores for the next cell
00491                                         vStoreOpt1 = spu_max( vStoreOpt1 + vDelFixed, vZero );
00492                                         vStoreOpt2 = spu_max( vStoreOpt2 + vDelFixed, vZero );
00493                                         vCD1 = spu_max( vCD1 + vDelInc, vZero );
00494                                         vCD2 = spu_max( vCD2 + vDelInc, vZero );
00495 
00496                                         #ifdef SHORTCUT
00497                                         if(UNLIKELY(spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0))
00498                                                 goto shortcut;
00499                                         #endif
00500 
00501                                 }
00502 
00503                                 // compute the current optimal value of the cell
00504                                 vStoreOpt1 = storeOpt[subSegLen - 1];
00505                                 vStoreOpt2 = storeOpt[segLen    - 1];
00506                                 vStoreOpt1 = spu_max( vStoreOpt1, vCD1 );
00507                                 vStoreOpt2 = spu_max( vStoreOpt2, vCD2 );
00508                                 storeOpt[subSegLen - 1] = vStoreOpt1;
00509                                 storeOpt[segLen    - 1] = vStoreOpt2;
00510 
00511                                 // precompute the scores for the next cell
00512                                 vStoreOpt1 = spu_max( vStoreOpt1 + vDelFixed, vZero );
00513                                 vStoreOpt2 = spu_max( vStoreOpt2 + vDelFixed, vZero );
00514                                 vCD1 = spu_max( vCD1 + vDelInc, vZero );
00515                                 vCD2 = spu_max( vCD2 + vDelInc, vZero );
00516 
00517                                 // Update the del Score
00518                                 T temp = spu_extract( vCD2, nSeg-1 );
00519                                 if ( UNLIKELY(delS[i] < temp) )
00520                                         delS[i] = temp;
00521 
00522                                 if(UNLIKELY(spu_extract(spu_gather(spu_or((vector unsigned char)spu_cmpgt(vCD1,vStoreOpt1),(vector unsigned char)spu_cmpgt(vCD2,vStoreOpt2))),0) == 0)) break;
00523                         }
00524                         #ifdef SHORTCUT
00525                         shortcut:
00526                         (void)1;
00527                         #endif
00528                 }
00529 
00530                 /* store the new MaxScore for the next line block */
00531                 /**************************************************/
00532                 prevMax = maxS[i];
00533                 maxS[i] = spu_extract( storeOpt[segLen-1], nSeg-1 );
00534 
00535 #ifdef DEBUG
00536                 printf("%c\t",s2[i]);
00537                 for(int ii=0; ii<nSeg; ++ii) {
00538                         for(int jj=0; jj<segLen; ++jj) {
00539                                 if(ii*segLen+jj < ls1)
00540                                         printf("%d\t",(int)(((T*)storeOpt)[ii+jj*nSeg]-zero));
00541                         }
00542                 }
00543                 printf("\n");
00544 #endif
00545         }
00546         return maxScore;
00547 }
00548 #endif
00549 
00558 template< class T, class V >
00559 static void doCreateProfile( int blockStart, int currentBlockSize, const T* simi, V* currentBlock){
00560         const int nSeg = sizeof(V)/sizeof(T);    // the number of segments
00561         const int segLen = currentBlockSize/nSeg; // the segment length
00562 
00563         for( int i=0; i<MATRIX_DIM; i++ ){
00564                 T *currentProfile = ((T*)currentBlock)+i*currentBlockSize;
00565                 for( int j=0; j<segLen; j++ ){
00566                         T *tmp = currentProfile + j*nSeg;
00567                         for( int k=0; k<nSeg; k++ )
00568                                 if( j + k*segLen + blockStart < ls1 )
00569                                         tmp[k] = simi[ s1[j + k*segLen + blockStart] * MATRIX_DIM + i ];
00570                                 else
00571                                         tmp[k] = 0;
00572                 }
00573         }
00574 }
00575 
00582 template< class T, class V >
00583 static void TcreateProfile(void){
00584         doCreateProfile( blockStart, ALIGN16(min(ls1-blockStart, blockSize)), (T*)simi, (V*)profile);
00585 }
00586 
00594 template< class T, class V >
00595 static double TdynProgLocal(void){
00596         T zero, goal;
00597         /* A vectorized template version */
00598         if (IsInteger<T>::value){
00599                 // adjust the zero and goal values...
00600                 zero = MinValue<T>::value;
00601                 zero-= mn;
00602                 goal = MaxValue<T>::value;
00603                 goal-= mx;
00604         } else {
00605                 zero = 0.0;
00606                 goal = MaxValue<T>::value;
00607         }
00608 
00609         /* Set the stored max and del score to zero */
00610         /********************************************/
00611         for( int i=0; i<ls2; i++ )
00612                 ((T*)maxS)[i] = ((T*)delS)[i] = (T)zero;
00613 
00614         T maxScore=zero;
00615         blockStart = 0;
00616         do {
00617                 const int currentBlockSize = ALIGN16(min(ls1-blockStart,blockSize));
00618                 /* initialize the profile for the current iteration */
00619                 if(blockStart != 0) { /* when blockStart==0 then the profile has been initialized already */
00620                         if(remote_profile == 0) {
00621 #ifdef DEBUG_FETCH
00622                                 printf(">>>> creating profile\n");
00623 #endif
00624                                 doCreateProfile<T,V>( blockStart, currentBlockSize, (T*)simi, (V*)profile);
00625                         } else {
00626 #ifdef DEBUG_FETCH
00627                                 printf(">>>> fetching profile (%lu bytes)\n",  currentBlockSize * MATRIX_DIM * sizeof(T));
00628 #endif
00629                                 for( int64_t bs=0; bs<currentBlockSize * MATRIX_DIM * sizeof(T); bs+=MAX_TRANSFER ) {
00630                                         mfc_get( ((char*)profile)+bs, remote_profile+blockStart*MATRIX_DIM*sizeof(T)+bs, ALIGN16(min(currentBlockSize*MATRIX_DIM*sizeof(T)-bs, (int64_t)MAX_TRANSFER)), 0, 0, 0 );
00631 
00632                                         /* wait for DMA to finish */
00633                                         mfc_write_tag_mask(1<<0);
00634                                         mfc_read_tag_status_all();
00635                                 }
00636                         }
00637                 }
00638 
00639 #ifdef UNROLL
00640                 T currentScore;
00641                 if (sizeof(T) < 2)
00642                         currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
00643                 else
00644                         currentScore = dynProgrLocalBlock2<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
00645 #else
00646                 T currentScore = dynProgrLocalBlock<T,V> ( currentBlockSize, zero, goal, (T*)maxS, (T*)delS, (V*)profile, (V*)loadOpt, (V*)storeOpt, (V*)rD );
00647 #endif
00648                 if( maxScore < currentScore)
00649                         maxScore = currentScore;
00650 
00651                 if(maxScore >= goal)
00652                         return DBL_MAX;
00653 
00654                 if(blockStart+blockSize >= ls1) break;
00655 
00656                 blockStart += blockSize;
00657         } while(1);
00658         /* Finally free all the memory we allocated */
00659         /********************************************/
00660         return (double)(maxScore-zero);
00661 }
00662 
00666 dvf_t dynProgLocal[] = {
00667         TdynProgLocal<int8_t, vector int8_t>,
00668         TdynProgLocal<int16_t, vector int16_t>,
00669         TdynProgLocal<int32_t, vector int32_t>,
00670         TdynProgLocal<float, vector float>,
00671         TdynProgLocal<double, vector double>
00672 };
00673 
00677 vvf_t createProfile[] = {
00678         TcreateProfile<int8_t, vector int8_t>,
00679         TcreateProfile<int16_t, vector int16_t>,
00680         TcreateProfile<int32_t, vector int32_t>,
00681         TcreateProfile<float, vector float>,
00682         TcreateProfile<double, vector double>
00683 };
00684 

Generated on Thu Jun 5 12:44:37 2008 for swps3 by  doxygen 1.5.4