/*
 * Round a size or address up to the next multiple of the system page size.
 *
 * The page size is queried once per call through sysconf(_SC_PAGESIZE) and
 * all arithmetic is carried out in size_t, so no signed/unsigned mixing takes
 * place when the mask is built.  The rounding relies on the page size being a
 * power of two, exactly as the mask-based formula always did.
 *
 * value: the byte count or address (already converted to size_t) to round up.
 * Returns the smallest multiple of the page size that is >= value (modulo
 * size_t wrap-around for values within one page of SIZE_MAX).
 */
static inline size_t page_align_round_up(size_t value)
{
    const long reported = sysconf(_SC_PAGESIZE);
    /* sysconf() reports failure as -1; using that directly would produce a
     * meaningless mask, so fall back to the conventional 4 KiB page. */
    const size_t page = reported > 0 ? (size_t)reported : (size_t)4096;
    const size_t mask = page - 1;

    return (value + mask) & ~mask;
}

/* PAGE_ALIGN(x): x may be an integer or a pointer; the result is a size_t. */
#define PAGE_ALIGN(x) (page_align_round_up((size_t)(x)))
38 int segLen = (queryLen+7)/8;
44 profile->
loadOpt = (__m128i*) ((
size_t) (profile->
data + 15) & ~(0xf)) ;
50 profile->
len = queryLen;
54 if (bias < -matrix[ i*MATRIX_DIM+j ])
55 bias = -matrix[ i*MATRIX_DIM+j ];
56 pprofile = (
char*)profile->
profile;
58 for(i=0; i<MATRIX_DIM; i++)
59 for(j=0; j<segLen; j++)
61 if(j+k*segLen < queryLen)
62 *(pprofile++) = matrix[query[j+k*segLen]*MATRIX_DIM+i]+bias;
83 u_int16_t MaxScore = 0x8000;
84 int segLength = (query->
len+7)/8;
88 __m64 * current_profile;
91 __m128i vMinimums = _mm_set1_epi16(0x8000);
93 __m128i vDelFixed = _mm_set1_epi16(options->
gapOpen);
94 __m128i vBias = _mm_set1_epi16(query->
bias);
96 __m128i vMaxScore = vMinimums;
98 __m128i vProfile = vMinimums;
106 for(i=0;
LIKELY(i<segLength); i++){
107 _mm_store_si128(loadOpt+i,vMinimums);
108 _mm_store_si128(storeOpt+i,vMinimums);
113 for(j=0;
LIKELY(j<dbLen); j++){
123 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
124 vStoreOpt = _mm_slli_si128(vStoreOpt, 2);
125 vStoreOpt = _mm_insert_epi16(vStoreOpt,(
int)0x8000,0);
130 current_profile = query->
profile + db[j]*segLength;
139 for(i=0;
LIKELY(i<segLength); i++){
141 vTmp = _mm_load_si128(loadOpt+i);
142 vRD = _mm_adds_epi16(vTmp,vDelFixed);
147 __asm__(
"movq (%1),%0" :
"=x" (vProfile) :
"r" (current_profile+i));
148 vProfile = _mm_unpacklo_epi8(vProfile, _mm_xor_si128(vProfile,vProfile));
149 vProfile = _mm_subs_epi16(vProfile, vBias);
152 vStoreOpt = _mm_adds_epi16(vStoreOpt, vProfile);
155 vMaxScore = _mm_max_epi16(vMaxScore, vStoreOpt);
158 vStoreOpt = _mm_max_epi16(vStoreOpt, vCD);
159 vStoreOpt = _mm_max_epi16(vStoreOpt, vRD);
162 _mm_store_si128(storeOpt+i, vStoreOpt);
165 vCD = _mm_adds_epi16(vStoreOpt, vDelFixed);
175 vCD = _mm_slli_si128(vCD,2);
176 vCD = _mm_insert_epi16(vCD,0x8000,0);
178 for(k=0;
LIKELY(k<segLength);++k) {
180 vTmp = _mm_load_si128(storeOpt+k);
181 vStoreOpt = _mm_max_epi16(vTmp,vCD);
182 _mm_store_si128(storeOpt+k,vStoreOpt);
184 if(
UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(vStoreOpt,vTmp))))
goto shortcut;
187 vCD = _mm_adds_epi16(vStoreOpt,vDelFixed);
195 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
197 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
198 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
199 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
200 MaxScore = _mm_extract_epi16(vMaxScore,0);
201 if (MaxScore == 0x7fff)
203 return (
double)(u_int16_t)(MaxScore-(u_int16_t)0x8000);
220 u_int16_t MaxScore = 0x8000;
221 int segLength = ((query->
len+7)/8 + 1) & ~1;
222 int subSegLen = segLength / 2;
226 __m128i *
rD = query->
rD;
227 __m64 * current_profile;
230 __m128i vMinimums = _mm_set1_epi16(0x8000);
232 __m128i vDelIncr = _mm_set1_epi16(options->
gapExt);
233 __m128i vDelFixed = _mm_set1_epi16(options->
gapOpen);
234 __m128i vBias = _mm_set1_epi16(query->
bias);
236 __m128i vMaxScore = vMinimums;
237 __m128i vMaxScore1 = vMinimums, vMaxScore2 = vMinimums;
239 __m128i vProfile1 = vMinimums, vProfile2 = vMinimums;
240 __m128i vStoreOpt1, vStoreOpt2;
243 __m128i vTmp1, vTmp2;
251 for(i=0;
LIKELY(i<segLength); i++){
252 _mm_store_si128(loadOpt+i,vMinimums);
253 _mm_store_si128(storeOpt+i,vMinimums);
254 _mm_store_si128(rD+i,vMinimums);
259 for(j=0;
LIKELY(j<dbLen); j++){
263 vCD1 = vCD2 = vMinimums;
267 vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
268 vStoreOpt1 = _mm_slli_si128(vStoreOpt1, 2);
269 vStoreOpt1 = _mm_insert_epi16(vStoreOpt1,(
int)0x8000,0);
270 vStoreOpt2 = _mm_load_si128(storeOpt+subSegLen-1);
275 current_profile = query->
profile + db[j]*segLength;
284 for(i=0;
LIKELY(i<subSegLen); i++){
286 vRD1 = _mm_load_si128(rD+i);
287 vRD2 = _mm_load_si128(rD+i+subSegLen);
292 __asm__(
"movq (%1),%0" :
"=x" (vProfile1) :
"r" (current_profile+i));
293 vProfile1 = _mm_unpacklo_epi8(vProfile1, _mm_xor_si128(vProfile1,vProfile1));
294 vProfile1 = _mm_subs_epi16(vProfile1, vBias);
295 __asm__(
"movq (%1),%0" :
"=x" (vProfile2) :
"r" (current_profile+i+subSegLen));
296 vProfile2 = _mm_unpacklo_epi8(vProfile2, _mm_xor_si128(vProfile2,vProfile2));
297 vProfile2 = _mm_subs_epi16(vProfile2, vBias);
300 vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vProfile1);
301 vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vProfile2);
304 vMaxScore1 = _mm_max_epi16(vMaxScore1, vStoreOpt1);
305 vMaxScore2 = _mm_max_epi16(vMaxScore2, vStoreOpt2);
308 vTmp1 = _mm_max_epi16(vRD1, vCD1);
309 vTmp2 = _mm_max_epi16(vRD2, vCD2);
310 vStoreOpt1 = _mm_max_epi16(vStoreOpt1, vTmp1);
311 vStoreOpt2 = _mm_max_epi16(vStoreOpt2, vTmp2);
314 _mm_store_si128(storeOpt+i, vStoreOpt1);
315 _mm_store_si128(storeOpt+i+subSegLen, vStoreOpt2);
318 vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vDelFixed);
319 vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vDelFixed);
320 vRD1 = _mm_adds_epi16(vRD1, vDelIncr);
321 vRD2 = _mm_adds_epi16(vRD2, vDelIncr);
322 vRD1 = _mm_max_epi16(vRD1, vStoreOpt1);
323 vRD2 = _mm_max_epi16(vRD2, vStoreOpt2);
325 vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
326 vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
327 vCD1 = _mm_max_epi16(vCD1, vStoreOpt1);
328 vCD2 = _mm_max_epi16(vCD2, vStoreOpt2);
331 _mm_store_si128(rD+i, vRD1);
332 _mm_store_si128(rD+i+subSegLen, vRD2);
335 vStoreOpt1 = _mm_load_si128(loadOpt+i);
336 vStoreOpt2 = _mm_load_si128(loadOpt+i+subSegLen);
343 __m128i vRotate = vCD2;
345 vCD1 = _mm_slli_si128(vRotate,2);
346 vCD1 = _mm_insert_epi16(vCD1,0x8000,0);
348 for(k=0;
LIKELY(k<subSegLen);++k) {
350 vStoreOpt1 = _mm_load_si128(storeOpt+k);
351 vStoreOpt2 = _mm_load_si128(storeOpt+k+subSegLen);
352 vStoreOpt1 = _mm_max_epi16(vStoreOpt1,vCD1);
353 vStoreOpt2 = _mm_max_epi16(vStoreOpt2,vCD2);
354 _mm_store_si128(storeOpt+k,vStoreOpt1);
355 _mm_store_si128(storeOpt+k+subSegLen,vStoreOpt2);
358 vStoreOpt1 = _mm_adds_epi16(vStoreOpt1,vDelFixed);
359 vStoreOpt2 = _mm_adds_epi16(vStoreOpt2,vDelFixed);
360 vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
361 vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
364 vRD1 = _mm_load_si128(rD+k);
365 vRD2 = _mm_load_si128(rD+k+subSegLen);
366 vRD1 = _mm_max_epi16(vRD1,vStoreOpt1);
367 vRD2 = _mm_max_epi16(vRD2,vStoreOpt2);
368 _mm_store_si128(rD+k,vRD1);
369 _mm_store_si128(rD+k+subSegLen,vRD2);
371 if(
UNLIKELY(!_mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi16(vCD1,vStoreOpt1),_mm_cmpgt_epi16(vCD2,vStoreOpt2)))))
goto shortcut;
379 vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
380 vStoreOpt2 = _mm_load_si128(storeOpt+segLength+subSegLen-1);
382 vMaxScore = _mm_max_epi16(vMaxScore1, vMaxScore2);
383 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
384 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
385 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
386 MaxScore = _mm_extract_epi16(vMaxScore,0);
387 return (
double)(MaxScore-0x8000);
404 u_int16_t MaxScore = 0x8000;
405 int segLength = (query->
len+7)/8;
409 __m128i *
rD = query->
rD;
410 __m64 * current_profile;
413 __m128i vMinimums = _mm_set1_epi16(0x8000);
415 __m128i vDelIncr = _mm_set1_epi16(options->
gapExt);
416 __m128i vDelFixed = _mm_set1_epi16(options->
gapOpen);
417 __m128i vBias = _mm_set1_epi16(query->
bias);
419 __m128i vMaxScore = vMinimums;
421 __m128i vProfile = vMinimums;
433 for(i=0;
LIKELY(i<segLength); i++){
434 _mm_store_si128(loadOpt+i,vMinimums);
435 _mm_store_si128(storeOpt+i,vMinimums);
436 _mm_store_si128(rD+i,vMinimums);
441 for(j=0;
LIKELY(j<dbLen); j++){
451 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
452 vStoreOpt = _mm_slli_si128(vStoreOpt, 2);
453 vStoreOpt = _mm_insert_epi16(vStoreOpt,(
int)0x8000,0);
458 current_profile = query->
profile + db[j]*segLength;
467 for(i=0;
LIKELY(i<segLength); i++){
469 vRD = _mm_load_si128(rD+i);
470 vRD = _mm_adds_epi16(vRD, vDelIncr);
471 vTmp = _mm_load_si128(loadOpt+i);
472 vTmp = _mm_adds_epi16(vTmp,vDelFixed);
473 vRD = _mm_max_epi16(vRD,vTmp);
474 _mm_store_si128(rD+i, vRD);
479 __asm__(
"movq (%1),%0" :
"=x" (vProfile) :
"r" (current_profile+i));
480 vProfile = _mm_unpacklo_epi8(vProfile, _mm_xor_si128(vProfile,vProfile));
481 vProfile = _mm_subs_epi16(vProfile, vBias);
484 vStoreOpt = _mm_adds_epi16(vStoreOpt, vProfile);
487 vMaxScore = _mm_max_epi16(vMaxScore, vStoreOpt);
490 vStoreOpt = _mm_max_epi16(vStoreOpt, vCD);
491 vStoreOpt = _mm_max_epi16(vStoreOpt, vRD);
494 _mm_store_si128(storeOpt+i, vStoreOpt);
497 vStoreOpt = _mm_adds_epi16(vStoreOpt, vDelFixed);
498 vCD = _mm_adds_epi16(vCD, vDelIncr);
499 vCD = _mm_max_epi16(vCD, vStoreOpt);
502 vStoreOpt = _mm_load_si128(loadOpt+i);
509 vCD = _mm_slli_si128(vCD,2);
510 vCD = _mm_insert_epi16(vCD,0x8000,0);
512 for(k=0;
LIKELY(k<segLength);++k) {
514 vStoreOpt = _mm_load_si128(storeOpt+k);
515 vStoreOpt = _mm_max_epi16(vStoreOpt,vCD);
516 _mm_store_si128(storeOpt+k,vStoreOpt);
519 vStoreOpt = _mm_adds_epi16(vStoreOpt,vDelFixed);
520 vCD = _mm_adds_epi16(vCD, vDelIncr);
522 if(
UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(vCD,vStoreOpt))))
goto shortcut;
530 vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
532 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
533 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
534 vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
535 MaxScore = _mm_extract_epi16(vMaxScore,0);
536 if (MaxScore == 0x7fff)
538 return (
double)(u_int16_t)(MaxScore-(u_int16_t)0x8000);