DynProgr_sse_short.c

Go to the documentation of this file.
00001 
00005 /*
00006  * Copyright (c) 2007-2008 ETH Zürich, Institute of Computational Science
00007  *
00008  * Permission is hereby granted, free of charge, to any person
00009  * obtaining a copy of this software and associated documentation
00010  * files (the "Software"), to deal in the Software without
00011  * restriction, including without limitation the rights to use,
00012  * copy, modify, merge, publish, distribute, sublicense, and/or sell
00013  * copies of the Software, and to permit persons to whom the
00014  * Software is furnished to do so, subject to the following
00015  * conditions:
00016  *
00017  * The above copyright notice and this permission notice shall be
00018  * included in all copies or substantial portions of the Software.
00019  *
00020  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00021  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
00022  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00023  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
00024  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
00025  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
00026  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
00027  * OTHER DEALINGS IN THE SOFTWARE.
00028  */
00029 
00030 #include "DynProgr_sse_short.h"
00031 #include <stdio.h>
00032 #include <unistd.h>
00033 #include <float.h>
00034 
00035 #define PAGE_ALIGN(x) (((size_t)(x)+sysconf(_SC_PAGESIZE)-1)&~(sysconf(_SC_PAGESIZE)-1))
00036 EXPORT ProfileShort * swps3_createProfileShortSSE( const char * query, int queryLen, SBMatrix matrix ){
00037         /* int segLen  = ((queryLen+7)/8 + 1) & ~1; */
00038         int segLen  = (queryLen+7)/8;
00039         int i,j,k;
00040         int bias = 0;
00041         char * pprofile;
00042         ProfileShort * profile = malloc( sizeof(ProfileShort)+((segLen*MATRIX_DIM+1) & ~(0x1))*sizeof(__m64)+segLen*3*sizeof(__m128i)+64+2*sysconf(_SC_PAGESIZE) );
00043 
00044         profile->loadOpt     = (__m128i*) ((size_t) (profile->data + 15) & ~(0xf)) ;
00045         profile->storeOpt    = profile->loadOpt + segLen;
00046         profile->rD          = profile->storeOpt + segLen;
00047         profile->profile     = (__m64*) PAGE_ALIGN(profile->rD + segLen);
00048 
00049         /* Init the profile */
00050         profile->len = queryLen;
00051         /* Init the byte profile */
00052         for(i=0; i<MATRIX_DIM; i++)
00053                 for(j=0; j<MATRIX_DIM; j++)
00054                         if (bias < -matrix[ i*MATRIX_DIM+j ])
00055                                 bias = -matrix[ i*MATRIX_DIM+j ];
00056         pprofile = (char*)profile->profile;
00057 
00058         for(i=0; i<MATRIX_DIM; i++)
00059                 for(j=0; j<segLen; j++)
00060                         for(k=0; k<8; k++)
00061                                 if(j+k*segLen < queryLen)
00062                                         *(pprofile++) = matrix[query[j+k*segLen]*MATRIX_DIM+i]+bias;
00063                                 else
00064                                         *(pprofile++) = 0;
00065         profile->bias = bias;
00066         return profile;
00067 }
00068 
00069 double swps3_alignmentShort2SSE( ProfileShort * query, const char * db, int dbLen, Options * options )
00070 {
00071 
00072         /**********************************************************************
00073         * This version of the code implements the idea presented in
00074         *
00075         ***********************************************************************
00076         * Striped Smith-Waterman speeds database searches six times over other
00077         * SIMD implementations
00078         *
00079         * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
00080         **********************************************************************/
00081 
00082         int i, j;
00083         u_int16_t MaxScore = 0x8000;
00084         int segLength = ((query->len+7)/8 + 1) & ~1; /* the segment length */
00085         int subSegLen = segLength / 2;
00086 
00087         __m128i * loadOpt  = query->loadOpt;
00088         __m128i * storeOpt = query->storeOpt;
00089         __m128i * rD       = query->rD;
00090         __m64   * current_profile;
00091         __m128i * swap;
00092 
00093         __m128i vMinimums = _mm_set1_epi16(0x8000);
00094 
00095         __m128i vDelIncr  = _mm_set1_epi16(options->gapExt);
00096         __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
00097         __m128i vBias     = _mm_set1_epi16(query->bias);
00098 
00099         __m128i vMaxScore = vMinimums;
00100         __m128i vMaxScore1 = vMinimums, vMaxScore2 = vMinimums;
00101 
00102         __m128i vProfile1 = vMinimums, vProfile2 = vMinimums;   /* the score profile */
00103         __m128i vStoreOpt1, vStoreOpt2; /* the new optimal score */
00104         __m128i vRD1, vRD2;             /* the new row deletion score */
00105         __m128i vCD1, vCD2;             /* the column deletion score */
00106         __m128i vTmp1, vTmp2;           /* the column deletion score */
00107 
00108         /* initialize the other arrays used for the dynProg code */
00109         /*********************************************************/
00110         for(i=0; LIKELY(i<segLength); i++){
00111                 _mm_store_si128(loadOpt+i,vMinimums);
00112                 _mm_store_si128(storeOpt+i,vMinimums);
00113                 _mm_store_si128(rD+i,vMinimums);
00114         }
00115 
00116         /* looping through all the columns */
00117         /***********************************/
00118         for(j=0; LIKELY(j<dbLen); j++){
00119                 /* compute the opt and cd score depending on the previous column */
00120                 /*******************************************************************/
00121                 /* set the column deletion score to zero, has to be fixed later on */
00122                 vCD1 = vCD2 = vMinimums;
00123 
00124                 /* set the opt score to the elements computed in the previous column */
00125                 /* set the low of storeOpt to MaxS[j] */
00126                 vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
00127                 vStoreOpt1 = _mm_slli_si128(vStoreOpt1, 2);
00128                 vStoreOpt1 = _mm_insert_epi16(vStoreOpt1,(int)0x8000,0);
00129                 vStoreOpt2 = _mm_load_si128(storeOpt+subSegLen-1);
00130 
00131                 /* compute the current profile, depending on the character in s2 */
00132                 /*****************************************************************/
00133 
00134                 current_profile = query->profile + db[j]*segLength;
00135                 /* swap the old optimal score with the new one */
00136                 /***********************************************/
00137                 swap = storeOpt;
00138                 storeOpt = loadOpt;
00139                 loadOpt = swap;
00140 
00141                 /* main loop computing the max, precomputing etc. */
00142                 /**************************************************/
00143                 for(i=0; LIKELY(i<subSegLen); i++){
00144 
00145                         vRD1 = _mm_load_si128(rD+i);
00146                         vRD2 = _mm_load_si128(rD+i+subSegLen);
00147 
00148                         /* load the current profile */
00149                         /*vProfile = _mm_movpi64_epi64(current_profile[i]);*/
00150                         /*vProfile = _mm_loadl_epi64((__m128i*)(current_profile+i));*/
00151                         __asm__("movq (%1),%0" : "=x" (vProfile1) : "r" (current_profile+i));
00152                         vProfile1 = _mm_unpacklo_epi8(vProfile1, _mm_xor_si128(vProfile1,vProfile1));
00153                         vProfile1 = _mm_subs_epi16(vProfile1, vBias);
00154                         __asm__("movq (%1),%0" : "=x" (vProfile2) : "r" (current_profile+i+subSegLen));
00155                         vProfile2 = _mm_unpacklo_epi8(vProfile2, _mm_xor_si128(vProfile2,vProfile2));
00156                         vProfile2 = _mm_subs_epi16(vProfile2, vBias);
00157 
00158                         /* add the profile the prev. opt */
00159                         vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vProfile1);
00160                         vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vProfile2);
00161 
00162                         /* update the maxscore found so far */
00163                         vMaxScore1 = _mm_max_epi16(vMaxScore1, vStoreOpt1);
00164                         vMaxScore2 = _mm_max_epi16(vMaxScore2, vStoreOpt2);
00165 
00166                         /* compute the correct opt score of the cell */
00167                         vTmp1 = _mm_max_epi16(vRD1, vCD1);
00168                         vTmp2 = _mm_max_epi16(vRD2, vCD2);
00169                         vStoreOpt1 = _mm_max_epi16(vStoreOpt1, vTmp1);
00170                         vStoreOpt2 = _mm_max_epi16(vStoreOpt2, vTmp2);
00171 
00172                         /* store the opt score of the cell */
00173                         _mm_store_si128(storeOpt+i,           vStoreOpt1);
00174                         _mm_store_si128(storeOpt+i+subSegLen, vStoreOpt2);
00175 
00176                         /* precompute rd and cd for next iteration */
00177                         vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vDelFixed);
00178                         vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vDelFixed);
00179                         vRD1 = _mm_adds_epi16(vRD1, vDelIncr);
00180                         vRD2 = _mm_adds_epi16(vRD2, vDelIncr);
00181                         vRD1 = _mm_max_epi16(vRD1, vStoreOpt1);
00182                         vRD2 = _mm_max_epi16(vRD2, vStoreOpt2);
00183 
00184                         vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
00185                         vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
00186                         vCD1 = _mm_max_epi16(vCD1, vStoreOpt1);
00187                         vCD2 = _mm_max_epi16(vCD2, vStoreOpt2);
00188 
00189                         /* store precomputed rd */
00190                         _mm_store_si128(rD+i,           vRD1);
00191                         _mm_store_si128(rD+i+subSegLen, vRD2);
00192 
00193                         /* load precomputed opt for next iteration */
00194                         vStoreOpt1 = _mm_load_si128(loadOpt+i);
00195                         vStoreOpt2 = _mm_load_si128(loadOpt+i+subSegLen);
00196                 }
00197 
00198 
00199                 for(i=0;LIKELY(i<9);++i){
00200                         int k;
00201                         /* compute the gap extend penalty for the current cell */
00202                         __m128i vRotate = vCD2;
00203                         vCD2 = vCD1;
00204                         vCD1 = _mm_slli_si128(vRotate,2);
00205                         vCD1 = _mm_insert_epi16(vCD1,0x8000,0);
00206 
00207                         for(k=0;LIKELY(k<subSegLen);++k) {
00208                            /* compute the current optimal value of the cell */
00209                            vStoreOpt1 = _mm_load_si128(storeOpt+k);
00210                            vStoreOpt2 = _mm_load_si128(storeOpt+k+subSegLen);
00211                            vStoreOpt1 = _mm_max_epi16(vStoreOpt1,vCD1);
00212                            vStoreOpt2 = _mm_max_epi16(vStoreOpt2,vCD2);
00213                            _mm_store_si128(storeOpt+k,vStoreOpt1);
00214                            _mm_store_si128(storeOpt+k+subSegLen,vStoreOpt2);
00215 
00216                            /* precompute the scores for the next cell */
00217                            vStoreOpt1 = _mm_adds_epi16(vStoreOpt1,vDelFixed);
00218                            vStoreOpt2 = _mm_adds_epi16(vStoreOpt2,vDelFixed);
00219                            vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
00220                            vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
00221 
00222                            /* compute the current optimal rd value */
00223                            vRD1 = _mm_load_si128(rD+k);
00224                            vRD2 = _mm_load_si128(rD+k+subSegLen);
00225                            vRD1 = _mm_max_epi16(vRD1,vStoreOpt1);
00226                            vRD2 = _mm_max_epi16(vRD2,vStoreOpt2);
00227                            _mm_store_si128(rD+k,vRD1);
00228                            _mm_store_si128(rD+k+subSegLen,vRD2);
00229 
00230                            if(UNLIKELY(!_mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi16(vCD1,vStoreOpt1),_mm_cmpgt_epi16(vCD2,vStoreOpt2))))) goto shortcut;
00231                         }
00232                 }
00233                 shortcut:
00234                 /* store the new MaxScore for the next line block */
00235                 /**************************************************/
00236 
00237                 /* store the element of storeOpt in MaxS */
00238                 vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
00239                 vStoreOpt2 = _mm_load_si128(storeOpt+segLength+subSegLen-1);
00240         }
00241         vMaxScore = _mm_max_epi16(vMaxScore1, vMaxScore2);
00242         vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
00243         vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
00244         vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
00245         MaxScore = _mm_extract_epi16(vMaxScore,0);
00246         return (double)(MaxScore-0x8000);
00247 }
00248 
00249 EXPORT double swps3_alignmentShortSSE( ProfileShort * query, const char * db, int dbLen, Options * options )
00250 {
00251 
00252         /**********************************************************************
00253         * This version of the code implements the idea presented in
00254         *
00255         ***********************************************************************
00256         * Striped Smith-Waterman speeds database searches six times over other
00257         * SIMD implementations
00258         *
00259         * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
00260         **********************************************************************/
00261 
00262         int i, j;
00263         u_int16_t MaxScore = 0x8000;
00264         int segLength = (query->len+7)/8; /* the segment length */
00265 
00266         __m128i * loadOpt  = query->loadOpt;
00267         __m128i * storeOpt = query->storeOpt;
00268         __m128i * rD       = query->rD;
00269         __m64   * current_profile;
00270         __m128i * swap;
00271 
00272         __m128i vMinimums = _mm_set1_epi16(0x8000);
00273 
00274         __m128i vDelIncr  = _mm_set1_epi16(options->gapExt);
00275         __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
00276         __m128i vBias     = _mm_set1_epi16(query->bias);
00277 
00278         __m128i vMaxScore = vMinimums;  /* vMaxScore = [0,0] */
00279 
00280         __m128i vProfile = vMinimums;   /* the score profile */
00281         __m128i vStoreOpt;              /* the new optimal score */
00282         __m128i vRD;                    /* the new row deletion score */
00283         __m128i vCD;                    /* the column deletion score */
00284         __m128i vTmp;
00285 
00286         /* initialize the other arrays used for the dynProg code */
00287         /*********************************************************/
00288         for(i=0; LIKELY(i<segLength); i++){
00289                 _mm_store_si128(loadOpt+i,vMinimums);
00290                 _mm_store_si128(storeOpt+i,vMinimums);
00291                 _mm_store_si128(rD+i,vMinimums);
00292         }
00293 
00294         /* looping through all the columns */
00295         /***********************************/
00296         for(j=0; LIKELY(j<dbLen); j++){
00297 
00298 
00299                 /* compute the opt and cd score depending on the previous column */
00300                 /*******************************************************************/
00301                 /* set the column deletion score to zero, has to be fixed later on */
00302                 vCD = vMinimums;
00303 
00304                 /* set the opt score to the elements computed in the previous column */
00305                 /* set the low of storeOpt to MaxS[j] */
00306                 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
00307                 vStoreOpt = _mm_slli_si128(vStoreOpt, 2);
00308                 vStoreOpt = _mm_insert_epi16(vStoreOpt,(int)0x8000,0);
00309 
00310                 /* compute the current profile, depending on the character in s2 */
00311                 /*****************************************************************/
00312 
00313                 current_profile = query->profile + db[j]*segLength;
00314                 /* swap the old optimal score with the new one */
00315                 /***********************************************/
00316                 swap = storeOpt;
00317                 storeOpt = loadOpt;
00318                 loadOpt = swap;
00319 
00320                 /* main loop computing the max, precomputing etc. */
00321                 /**************************************************/
00322                 for(i=0; LIKELY(i<segLength); i++){
00323 
00324                         vRD = _mm_load_si128(rD+i);
00325                         vRD = _mm_adds_epi16(vRD, vDelIncr);
00326                         vTmp = _mm_load_si128(loadOpt+i);
00327                         vTmp = _mm_adds_epi16(vTmp,vDelFixed);
00328                         vRD = _mm_max_epi16(vRD,vTmp);
00329                         _mm_store_si128(rD+i, vRD);
00330 
00331                         /* load the current profile */
00332                         /*vProfile = _mm_movpi64_epi64(current_profile[i]);*/
00333                         /*vProfile = _mm_loadl_epi64((__m128i*)(current_profile+i));*/
00334                         __asm__("movq (%1),%0" : "=x" (vProfile) : "r" (current_profile+i));
00335                         vProfile = _mm_unpacklo_epi8(vProfile, _mm_xor_si128(vProfile,vProfile));
00336                         vProfile = _mm_subs_epi16(vProfile, vBias);
00337 
00338                         /* add the profile the prev. opt */
00339                         vStoreOpt = _mm_adds_epi16(vStoreOpt, vProfile);
00340 
00341                         /* update the maxscore found so far */
00342                         vMaxScore = _mm_max_epi16(vMaxScore, vStoreOpt);
00343 
00344                         /* compute the correct opt score of the cell */
00345                         vStoreOpt = _mm_max_epi16(vStoreOpt, vCD);
00346                         vStoreOpt = _mm_max_epi16(vStoreOpt, vRD);
00347 
00348                         /* store the opt score of the cell */
00349                         _mm_store_si128(storeOpt+i, vStoreOpt);
00350 
00351                         /* precompute cd for next iteration */
00352                         vStoreOpt = _mm_adds_epi16(vStoreOpt, vDelFixed);
00353                         vCD = _mm_adds_epi16(vCD, vDelIncr);
00354                         vCD = _mm_max_epi16(vCD, vStoreOpt);
00355 
00356                         /* load precomputed opt for next iteration */
00357                         vStoreOpt = _mm_load_si128(loadOpt+i);
00358                 }
00359 
00360 
00361                 for(i=0;LIKELY(i<8);++i){
00362                         int k;
00363                         /* compute the gap extend penalty for the current cell */
00364                         vCD = _mm_slli_si128(vCD,2);
00365                         vCD = _mm_insert_epi16(vCD,0x8000,0);
00366 
00367                         for(k=0;LIKELY(k<segLength);++k) {
00368                            /* compute the current optimal value of the cell */
00369                            vStoreOpt = _mm_load_si128(storeOpt+k);
00370                            vStoreOpt = _mm_max_epi16(vStoreOpt,vCD);
00371                            _mm_store_si128(storeOpt+k,vStoreOpt);
00372 
00373                            /* precompute the scores for the next cell */
00374                            vStoreOpt = _mm_adds_epi16(vStoreOpt,vDelFixed);
00375                            vCD = _mm_adds_epi16(vCD, vDelIncr);
00376 
00377                            if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(vCD,vStoreOpt)))) goto shortcut;
00378                         }
00379                 }
00380                 shortcut:
00381                 /* store the new MaxScore for the next line block */
00382                 /**************************************************/
00383 
00384                 /* store the element of storeOpt in MaxS */
00385                 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
00386         }
00387         vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
00388         vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
00389         vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
00390         MaxScore = _mm_extract_epi16(vMaxScore,0);
00391         if (MaxScore == 0x7fff)
00392                 return DBL_MAX;
00393         return (double)(u_int16_t)(MaxScore-(u_int16_t)0x8000);
00394 }
00395 
00396 
00397 EXPORT void swps3_freeProfileShortSSE( ProfileShort * profile ){
00398         free( profile );
00399 }
00400 

Generated on Thu Jun 5 12:44:37 2008 for swps3 by  doxygen 1.5.4