00001
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "DynProgr_sse_short.h"
00031 #include <stdio.h>
00032 #include <unistd.h>
00033 #include <float.h>
00034
00035 #define PAGE_ALIGN(x) (((size_t)(x)+sysconf(_SC_PAGESIZE)-1)&~(sysconf(_SC_PAGESIZE)-1))
00036 EXPORT ProfileShort * swps3_createProfileShortSSE( const char * query, int queryLen, SBMatrix matrix ){
00037
00038 int segLen = (queryLen+7)/8;
00039 int i,j,k;
00040 int bias = 0;
00041 char * pprofile;
00042 ProfileShort * profile = malloc( sizeof(ProfileShort)+((segLen*MATRIX_DIM+1) & ~(0x1))*sizeof(__m64)+segLen*3*sizeof(__m128i)+64+2*sysconf(_SC_PAGESIZE) );
00043
00044 profile->loadOpt = (__m128i*) ((size_t) (profile->data + 15) & ~(0xf)) ;
00045 profile->storeOpt = profile->loadOpt + segLen;
00046 profile->rD = profile->storeOpt + segLen;
00047 profile->profile = (__m64*) PAGE_ALIGN(profile->rD + segLen);
00048
00049
00050 profile->len = queryLen;
00051
00052 for(i=0; i<MATRIX_DIM; i++)
00053 for(j=0; j<MATRIX_DIM; j++)
00054 if (bias < -matrix[ i*MATRIX_DIM+j ])
00055 bias = -matrix[ i*MATRIX_DIM+j ];
00056 pprofile = (char*)profile->profile;
00057
00058 for(i=0; i<MATRIX_DIM; i++)
00059 for(j=0; j<segLen; j++)
00060 for(k=0; k<8; k++)
00061 if(j+k*segLen < queryLen)
00062 *(pprofile++) = matrix[query[j+k*segLen]*MATRIX_DIM+i]+bias;
00063 else
00064 *(pprofile++) = 0;
00065 profile->bias = bias;
00066 return profile;
00067 }
00068
00069 double swps3_alignmentShort2SSE( ProfileShort * query, const char * db, int dbLen, Options * options )
00070 {
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082 int i, j;
00083 u_int16_t MaxScore = 0x8000;
00084 int segLength = ((query->len+7)/8 + 1) & ~1;
00085 int subSegLen = segLength / 2;
00086
00087 __m128i * loadOpt = query->loadOpt;
00088 __m128i * storeOpt = query->storeOpt;
00089 __m128i * rD = query->rD;
00090 __m64 * current_profile;
00091 __m128i * swap;
00092
00093 __m128i vMinimums = _mm_set1_epi16(0x8000);
00094
00095 __m128i vDelIncr = _mm_set1_epi16(options->gapExt);
00096 __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
00097 __m128i vBias = _mm_set1_epi16(query->bias);
00098
00099 __m128i vMaxScore = vMinimums;
00100 __m128i vMaxScore1 = vMinimums, vMaxScore2 = vMinimums;
00101
00102 __m128i vProfile1 = vMinimums, vProfile2 = vMinimums;
00103 __m128i vStoreOpt1, vStoreOpt2;
00104 __m128i vRD1, vRD2;
00105 __m128i vCD1, vCD2;
00106 __m128i vTmp1, vTmp2;
00107
00108
00109
00110 for(i=0; LIKELY(i<segLength); i++){
00111 _mm_store_si128(loadOpt+i,vMinimums);
00112 _mm_store_si128(storeOpt+i,vMinimums);
00113 _mm_store_si128(rD+i,vMinimums);
00114 }
00115
00116
00117
00118 for(j=0; LIKELY(j<dbLen); j++){
00119
00120
00121
00122 vCD1 = vCD2 = vMinimums;
00123
00124
00125
00126 vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
00127 vStoreOpt1 = _mm_slli_si128(vStoreOpt1, 2);
00128 vStoreOpt1 = _mm_insert_epi16(vStoreOpt1,(int)0x8000,0);
00129 vStoreOpt2 = _mm_load_si128(storeOpt+subSegLen-1);
00130
00131
00132
00133
00134 current_profile = query->profile + db[j]*segLength;
00135
00136
00137 swap = storeOpt;
00138 storeOpt = loadOpt;
00139 loadOpt = swap;
00140
00141
00142
00143 for(i=0; LIKELY(i<subSegLen); i++){
00144
00145 vRD1 = _mm_load_si128(rD+i);
00146 vRD2 = _mm_load_si128(rD+i+subSegLen);
00147
00148
00149
00150
00151 __asm__("movq (%1),%0" : "=x" (vProfile1) : "r" (current_profile+i));
00152 vProfile1 = _mm_unpacklo_epi8(vProfile1, _mm_xor_si128(vProfile1,vProfile1));
00153 vProfile1 = _mm_subs_epi16(vProfile1, vBias);
00154 __asm__("movq (%1),%0" : "=x" (vProfile2) : "r" (current_profile+i+subSegLen));
00155 vProfile2 = _mm_unpacklo_epi8(vProfile2, _mm_xor_si128(vProfile2,vProfile2));
00156 vProfile2 = _mm_subs_epi16(vProfile2, vBias);
00157
00158
00159 vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vProfile1);
00160 vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vProfile2);
00161
00162
00163 vMaxScore1 = _mm_max_epi16(vMaxScore1, vStoreOpt1);
00164 vMaxScore2 = _mm_max_epi16(vMaxScore2, vStoreOpt2);
00165
00166
00167 vTmp1 = _mm_max_epi16(vRD1, vCD1);
00168 vTmp2 = _mm_max_epi16(vRD2, vCD2);
00169 vStoreOpt1 = _mm_max_epi16(vStoreOpt1, vTmp1);
00170 vStoreOpt2 = _mm_max_epi16(vStoreOpt2, vTmp2);
00171
00172
00173 _mm_store_si128(storeOpt+i, vStoreOpt1);
00174 _mm_store_si128(storeOpt+i+subSegLen, vStoreOpt2);
00175
00176
00177 vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vDelFixed);
00178 vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vDelFixed);
00179 vRD1 = _mm_adds_epi16(vRD1, vDelIncr);
00180 vRD2 = _mm_adds_epi16(vRD2, vDelIncr);
00181 vRD1 = _mm_max_epi16(vRD1, vStoreOpt1);
00182 vRD2 = _mm_max_epi16(vRD2, vStoreOpt2);
00183
00184 vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
00185 vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
00186 vCD1 = _mm_max_epi16(vCD1, vStoreOpt1);
00187 vCD2 = _mm_max_epi16(vCD2, vStoreOpt2);
00188
00189
00190 _mm_store_si128(rD+i, vRD1);
00191 _mm_store_si128(rD+i+subSegLen, vRD2);
00192
00193
00194 vStoreOpt1 = _mm_load_si128(loadOpt+i);
00195 vStoreOpt2 = _mm_load_si128(loadOpt+i+subSegLen);
00196 }
00197
00198
00199 for(i=0;LIKELY(i<9);++i){
00200 int k;
00201
00202 __m128i vRotate = vCD2;
00203 vCD2 = vCD1;
00204 vCD1 = _mm_slli_si128(vRotate,2);
00205 vCD1 = _mm_insert_epi16(vCD1,0x8000,0);
00206
00207 for(k=0;LIKELY(k<subSegLen);++k) {
00208
00209 vStoreOpt1 = _mm_load_si128(storeOpt+k);
00210 vStoreOpt2 = _mm_load_si128(storeOpt+k+subSegLen);
00211 vStoreOpt1 = _mm_max_epi16(vStoreOpt1,vCD1);
00212 vStoreOpt2 = _mm_max_epi16(vStoreOpt2,vCD2);
00213 _mm_store_si128(storeOpt+k,vStoreOpt1);
00214 _mm_store_si128(storeOpt+k+subSegLen,vStoreOpt2);
00215
00216
00217 vStoreOpt1 = _mm_adds_epi16(vStoreOpt1,vDelFixed);
00218 vStoreOpt2 = _mm_adds_epi16(vStoreOpt2,vDelFixed);
00219 vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
00220 vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
00221
00222
00223 vRD1 = _mm_load_si128(rD+k);
00224 vRD2 = _mm_load_si128(rD+k+subSegLen);
00225 vRD1 = _mm_max_epi16(vRD1,vStoreOpt1);
00226 vRD2 = _mm_max_epi16(vRD2,vStoreOpt2);
00227 _mm_store_si128(rD+k,vRD1);
00228 _mm_store_si128(rD+k+subSegLen,vRD2);
00229
00230 if(UNLIKELY(!_mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi16(vCD1,vStoreOpt1),_mm_cmpgt_epi16(vCD2,vStoreOpt2))))) goto shortcut;
00231 }
00232 }
00233 shortcut:
00234
00235
00236
00237
00238 vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
00239 vStoreOpt2 = _mm_load_si128(storeOpt+segLength+subSegLen-1);
00240 }
00241 vMaxScore = _mm_max_epi16(vMaxScore1, vMaxScore2);
00242 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
00243 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
00244 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
00245 MaxScore = _mm_extract_epi16(vMaxScore,0);
00246 return (double)(MaxScore-0x8000);
00247 }
00248
00249 EXPORT double swps3_alignmentShortSSE( ProfileShort * query, const char * db, int dbLen, Options * options )
00250 {
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262 int i, j;
00263 u_int16_t MaxScore = 0x8000;
00264 int segLength = (query->len+7)/8;
00265
00266 __m128i * loadOpt = query->loadOpt;
00267 __m128i * storeOpt = query->storeOpt;
00268 __m128i * rD = query->rD;
00269 __m64 * current_profile;
00270 __m128i * swap;
00271
00272 __m128i vMinimums = _mm_set1_epi16(0x8000);
00273
00274 __m128i vDelIncr = _mm_set1_epi16(options->gapExt);
00275 __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
00276 __m128i vBias = _mm_set1_epi16(query->bias);
00277
00278 __m128i vMaxScore = vMinimums;
00279
00280 __m128i vProfile = vMinimums;
00281 __m128i vStoreOpt;
00282 __m128i vRD;
00283 __m128i vCD;
00284 __m128i vTmp;
00285
00286
00287
00288 for(i=0; LIKELY(i<segLength); i++){
00289 _mm_store_si128(loadOpt+i,vMinimums);
00290 _mm_store_si128(storeOpt+i,vMinimums);
00291 _mm_store_si128(rD+i,vMinimums);
00292 }
00293
00294
00295
00296 for(j=0; LIKELY(j<dbLen); j++){
00297
00298
00299
00300
00301
00302 vCD = vMinimums;
00303
00304
00305
00306 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
00307 vStoreOpt = _mm_slli_si128(vStoreOpt, 2);
00308 vStoreOpt = _mm_insert_epi16(vStoreOpt,(int)0x8000,0);
00309
00310
00311
00312
00313 current_profile = query->profile + db[j]*segLength;
00314
00315
00316 swap = storeOpt;
00317 storeOpt = loadOpt;
00318 loadOpt = swap;
00319
00320
00321
00322 for(i=0; LIKELY(i<segLength); i++){
00323
00324 vRD = _mm_load_si128(rD+i);
00325 vRD = _mm_adds_epi16(vRD, vDelIncr);
00326 vTmp = _mm_load_si128(loadOpt+i);
00327 vTmp = _mm_adds_epi16(vTmp,vDelFixed);
00328 vRD = _mm_max_epi16(vRD,vTmp);
00329 _mm_store_si128(rD+i, vRD);
00330
00331
00332
00333
00334 __asm__("movq (%1),%0" : "=x" (vProfile) : "r" (current_profile+i));
00335 vProfile = _mm_unpacklo_epi8(vProfile, _mm_xor_si128(vProfile,vProfile));
00336 vProfile = _mm_subs_epi16(vProfile, vBias);
00337
00338
00339 vStoreOpt = _mm_adds_epi16(vStoreOpt, vProfile);
00340
00341
00342 vMaxScore = _mm_max_epi16(vMaxScore, vStoreOpt);
00343
00344
00345 vStoreOpt = _mm_max_epi16(vStoreOpt, vCD);
00346 vStoreOpt = _mm_max_epi16(vStoreOpt, vRD);
00347
00348
00349 _mm_store_si128(storeOpt+i, vStoreOpt);
00350
00351
00352 vStoreOpt = _mm_adds_epi16(vStoreOpt, vDelFixed);
00353 vCD = _mm_adds_epi16(vCD, vDelIncr);
00354 vCD = _mm_max_epi16(vCD, vStoreOpt);
00355
00356
00357 vStoreOpt = _mm_load_si128(loadOpt+i);
00358 }
00359
00360
00361 for(i=0;LIKELY(i<8);++i){
00362 int k;
00363
00364 vCD = _mm_slli_si128(vCD,2);
00365 vCD = _mm_insert_epi16(vCD,0x8000,0);
00366
00367 for(k=0;LIKELY(k<segLength);++k) {
00368
00369 vStoreOpt = _mm_load_si128(storeOpt+k);
00370 vStoreOpt = _mm_max_epi16(vStoreOpt,vCD);
00371 _mm_store_si128(storeOpt+k,vStoreOpt);
00372
00373
00374 vStoreOpt = _mm_adds_epi16(vStoreOpt,vDelFixed);
00375 vCD = _mm_adds_epi16(vCD, vDelIncr);
00376
00377 if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(vCD,vStoreOpt)))) goto shortcut;
00378 }
00379 }
00380 shortcut:
00381
00382
00383
00384
00385 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
00386 }
00387 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
00388 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
00389 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
00390 MaxScore = _mm_extract_epi16(vMaxScore,0);
00391 if (MaxScore == 0x7fff)
00392 return DBL_MAX;
00393 return (double)(u_int16_t)(MaxScore-(u_int16_t)0x8000);
00394 }
00395
00396
00397 EXPORT void swps3_freeProfileShortSSE( ProfileShort * profile ){
00398 free( profile );
00399 }
00400