swps3
DynProgr_sse_short.c
Go to the documentation of this file.
1 
5 /*
6  * Copyright (c) 2007-2008 ETH Zürich, Institute of Computational Science
7  *
8  * Permission is hereby granted, free of charge, to any person
9  * obtaining a copy of this software and associated documentation
10  * files (the "Software"), to deal in the Software without
11  * restriction, including without limitation the rights to use,
12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the
14  * Software is furnished to do so, subject to the following
15  * conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27  * OTHER DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include "DynProgr_sse_short.h"
31 #include <stdio.h>
32 #include <unistd.h>
33 #include <float.h>
34 
35 #define PAGE_ALIGN(x) (((size_t)(x)+sysconf(_SC_PAGESIZE)-1)&~(sysconf(_SC_PAGESIZE)-1))
36 EXPORT ProfileShort * swps3_createProfileShortSSE( const char * query, int queryLen, SBMatrix matrix ){
37  /* int segLen = ((queryLen+7)/8 + 1) & ~1; */
38  int segLen = (queryLen+7)/8;
39  int i,j,k;
40  int bias = 0;
41  char * pprofile;
42  ProfileShort * profile = malloc( sizeof(ProfileShort)+((segLen*MATRIX_DIM+1) & ~(0x1))*sizeof(__m64)+segLen*3*sizeof(__m128i)+64+2*sysconf(_SC_PAGESIZE) );
43 
44  profile->loadOpt = (__m128i*) ((size_t) (profile->data + 15) & ~(0xf)) ;
45  profile->storeOpt = profile->loadOpt + segLen;
46  profile->rD = profile->storeOpt + segLen;
47  profile->profile = (__m64*) PAGE_ALIGN(profile->rD + segLen);
48 
49  /* Init the profile */
50  profile->len = queryLen;
51  /* Init the byte profile */
52  for(i=0; i<MATRIX_DIM; i++)
53  for(j=0; j<MATRIX_DIM; j++)
54  if (bias < -matrix[ i*MATRIX_DIM+j ])
55  bias = -matrix[ i*MATRIX_DIM+j ];
56  pprofile = (char*)profile->profile;
57 
58  for(i=0; i<MATRIX_DIM; i++)
59  for(j=0; j<segLen; j++)
60  for(k=0; k<8; k++)
61  if(j+k*segLen < queryLen)
62  *(pprofile++) = matrix[query[j+k*segLen]*MATRIX_DIM+i]+bias;
63  else
64  *(pprofile++) = 0;
65  profile->bias = bias;
66  return profile;
67 }
68 
69 EXPORT double swps3_alignmentShortSSE_lin( ProfileShort * query, const char * db, int dbLen, Options * options )
70 {
71 
72  /**********************************************************************
73  * This version of the code implements the idea presented in
74  *
75  ***********************************************************************
76  * Striped Smith-Waterman speeds database searches six times over other
77  * SIMD implementations
78  *
79  * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
80  **********************************************************************/
81 
82  int i, j;
83  u_int16_t MaxScore = 0x8000;
84  int segLength = (query->len+7)/8; /* the segment length */
85 
86  __m128i * loadOpt = query->loadOpt;
87  __m128i * storeOpt = query->storeOpt;
88  __m64 * current_profile;
89  __m128i * swap;
90 
91  __m128i vMinimums = _mm_set1_epi16(0x8000);
92 
93  __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
94  __m128i vBias = _mm_set1_epi16(query->bias);
95 
96  __m128i vMaxScore = vMinimums; /* vMaxScore = [0,0] */
97 
98  __m128i vProfile = vMinimums; /* the score profile */
99  __m128i vStoreOpt; /* the new optimal score */
100  __m128i vRD; /* the new row deletion score */
101  __m128i vCD; /* the column deletion score */
102  __m128i vTmp;
103 
104  /* initialize the other arrays used for the dynProg code */
105  /*********************************************************/
106  for(i=0; LIKELY(i<segLength); i++){
107  _mm_store_si128(loadOpt+i,vMinimums);
108  _mm_store_si128(storeOpt+i,vMinimums);
109  }
110 
111  /* looping through all the columns */
112  /***********************************/
113  for(j=0; LIKELY(j<dbLen); j++){
114 
115 
116  /* compute the opt and cd score depending on the previous column */
117  /*******************************************************************/
118  /* set the column deletion score to zero, has to be fixed later on */
119  vCD = vMinimums;
120 
121  /* set the opt score to the elements computed in the previous column */
122  /* set the low of storeOpt to MaxS[j] */
123  vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
124  vStoreOpt = _mm_slli_si128(vStoreOpt, 2);
125  vStoreOpt = _mm_insert_epi16(vStoreOpt,(int)0x8000,0);
126 
127  /* compute the current profile, depending on the character in s2 */
128  /*****************************************************************/
129 
130  current_profile = query->profile + db[j]*segLength;
131  /* swap the old optimal score with the new one */
132  /***********************************************/
133  swap = storeOpt;
134  storeOpt = loadOpt;
135  loadOpt = swap;
136 
137  /* main loop computing the max, precomputing etc. */
138  /**************************************************/
139  for(i=0; LIKELY(i<segLength); i++){
140 
141  vTmp = _mm_load_si128(loadOpt+i);
142  vRD = _mm_adds_epi16(vTmp,vDelFixed);
143 
144  /* load the current profile */
145  /*vProfile = _mm_movpi64_epi64(current_profile[i]);*/
146  /*vProfile = _mm_loadl_epi64((__m128i*)(current_profile+i));*/
147  __asm__("movq (%1),%0" : "=x" (vProfile) : "r" (current_profile+i));
148  vProfile = _mm_unpacklo_epi8(vProfile, _mm_xor_si128(vProfile,vProfile));
149  vProfile = _mm_subs_epi16(vProfile, vBias);
150 
151  /* add the profile the prev. opt */
152  vStoreOpt = _mm_adds_epi16(vStoreOpt, vProfile);
153 
154  /* update the maxscore found so far */
155  vMaxScore = _mm_max_epi16(vMaxScore, vStoreOpt);
156 
157  /* compute the correct opt score of the cell */
158  vStoreOpt = _mm_max_epi16(vStoreOpt, vCD);
159  vStoreOpt = _mm_max_epi16(vStoreOpt, vRD);
160 
161  /* store the opt score of the cell */
162  _mm_store_si128(storeOpt+i, vStoreOpt);
163 
164  /* precompute cd for next iteration */
165  vCD = _mm_adds_epi16(vStoreOpt, vDelFixed);
166 
167  /* load precomputed opt for next iteration */
168  vStoreOpt = vTmp;
169  }
170 
171 
172  for(i=0;LIKELY(i<8);++i){
173  int k;
174  /* compute the gap extend penalty for the current cell */
175  vCD = _mm_slli_si128(vCD,2);
176  vCD = _mm_insert_epi16(vCD,0x8000,0);
177 
178  for(k=0;LIKELY(k<segLength);++k) {
179  /* compute the current optimal value of the cell */
180  vTmp = _mm_load_si128(storeOpt+k);
181  vStoreOpt = _mm_max_epi16(vTmp,vCD);
182  _mm_store_si128(storeOpt+k,vStoreOpt);
183 
184  if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(vStoreOpt,vTmp)))) goto shortcut;
185 
186  /* precompute the scores for the next cell */
187  vCD = _mm_adds_epi16(vStoreOpt,vDelFixed);
188  }
189  }
190 shortcut:
191  /* store the new MaxScore for the next line block */
192  /**************************************************/
193 
194  /* store the element of storeOpt in MaxS */
195  vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
196  }
197  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
198  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
199  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
200  MaxScore = _mm_extract_epi16(vMaxScore,0);
201  if (MaxScore == 0x7fff)
202  return DBL_MAX;
203  return (double)(u_int16_t)(MaxScore-(u_int16_t)0x8000);
204 }
205 
206 double swps3_alignmentShort2SSE( ProfileShort * query, const char * db, int dbLen, Options * options )
207 {
208 
209  /**********************************************************************
210  * This version of the code implements the idea presented in
211  *
212  ***********************************************************************
213  * Striped Smith-Waterman speeds database searches six times over other
214  * SIMD implementations
215  *
216  * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
217  **********************************************************************/
218 
219  int i, j;
220  u_int16_t MaxScore = 0x8000;
221  int segLength = ((query->len+7)/8 + 1) & ~1; /* the segment length */
222  int subSegLen = segLength / 2;
223 
224  __m128i * loadOpt = query->loadOpt;
225  __m128i * storeOpt = query->storeOpt;
226  __m128i * rD = query->rD;
227  __m64 * current_profile;
228  __m128i * swap;
229 
230  __m128i vMinimums = _mm_set1_epi16(0x8000);
231 
232  __m128i vDelIncr = _mm_set1_epi16(options->gapExt);
233  __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
234  __m128i vBias = _mm_set1_epi16(query->bias);
235 
236  __m128i vMaxScore = vMinimums;
237  __m128i vMaxScore1 = vMinimums, vMaxScore2 = vMinimums;
238 
239  __m128i vProfile1 = vMinimums, vProfile2 = vMinimums; /* the score profile */
240  __m128i vStoreOpt1, vStoreOpt2; /* the new optimal score */
241  __m128i vRD1, vRD2; /* the new row deletion score */
242  __m128i vCD1, vCD2; /* the column deletion score */
243  __m128i vTmp1, vTmp2; /* the column deletion score */
244 
245  if ( options->gapExt <= options->gapOpen ) {
246  return swps3_alignmentShortSSE_lin(query, db, dbLen, options);
247  }
248 
249  /* initialize the other arrays used for the dynProg code */
250  /*********************************************************/
251  for(i=0; LIKELY(i<segLength); i++){
252  _mm_store_si128(loadOpt+i,vMinimums);
253  _mm_store_si128(storeOpt+i,vMinimums);
254  _mm_store_si128(rD+i,vMinimums);
255  }
256 
257  /* looping through all the columns */
258  /***********************************/
259  for(j=0; LIKELY(j<dbLen); j++){
260  /* compute the opt and cd score depending on the previous column */
261  /*******************************************************************/
262  /* set the column deletion score to zero, has to be fixed later on */
263  vCD1 = vCD2 = vMinimums;
264 
265  /* set the opt score to the elements computed in the previous column */
266  /* set the low of storeOpt to MaxS[j] */
267  vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
268  vStoreOpt1 = _mm_slli_si128(vStoreOpt1, 2);
269  vStoreOpt1 = _mm_insert_epi16(vStoreOpt1,(int)0x8000,0);
270  vStoreOpt2 = _mm_load_si128(storeOpt+subSegLen-1);
271 
272  /* compute the current profile, depending on the character in s2 */
273  /*****************************************************************/
274 
275  current_profile = query->profile + db[j]*segLength;
276  /* swap the old optimal score with the new one */
277  /***********************************************/
278  swap = storeOpt;
279  storeOpt = loadOpt;
280  loadOpt = swap;
281 
282  /* main loop computing the max, precomputing etc. */
283  /**************************************************/
284  for(i=0; LIKELY(i<subSegLen); i++){
285 
286  vRD1 = _mm_load_si128(rD+i);
287  vRD2 = _mm_load_si128(rD+i+subSegLen);
288 
289  /* load the current profile */
290  /*vProfile = _mm_movpi64_epi64(current_profile[i]);*/
291  /*vProfile = _mm_loadl_epi64((__m128i*)(current_profile+i));*/
292  __asm__("movq (%1),%0" : "=x" (vProfile1) : "r" (current_profile+i));
293  vProfile1 = _mm_unpacklo_epi8(vProfile1, _mm_xor_si128(vProfile1,vProfile1));
294  vProfile1 = _mm_subs_epi16(vProfile1, vBias);
295  __asm__("movq (%1),%0" : "=x" (vProfile2) : "r" (current_profile+i+subSegLen));
296  vProfile2 = _mm_unpacklo_epi8(vProfile2, _mm_xor_si128(vProfile2,vProfile2));
297  vProfile2 = _mm_subs_epi16(vProfile2, vBias);
298 
299  /* add the profile the prev. opt */
300  vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vProfile1);
301  vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vProfile2);
302 
303  /* update the maxscore found so far */
304  vMaxScore1 = _mm_max_epi16(vMaxScore1, vStoreOpt1);
305  vMaxScore2 = _mm_max_epi16(vMaxScore2, vStoreOpt2);
306 
307  /* compute the correct opt score of the cell */
308  vTmp1 = _mm_max_epi16(vRD1, vCD1);
309  vTmp2 = _mm_max_epi16(vRD2, vCD2);
310  vStoreOpt1 = _mm_max_epi16(vStoreOpt1, vTmp1);
311  vStoreOpt2 = _mm_max_epi16(vStoreOpt2, vTmp2);
312 
313  /* store the opt score of the cell */
314  _mm_store_si128(storeOpt+i, vStoreOpt1);
315  _mm_store_si128(storeOpt+i+subSegLen, vStoreOpt2);
316 
317  /* precompute rd and cd for next iteration */
318  vStoreOpt1 = _mm_adds_epi16(vStoreOpt1, vDelFixed);
319  vStoreOpt2 = _mm_adds_epi16(vStoreOpt2, vDelFixed);
320  vRD1 = _mm_adds_epi16(vRD1, vDelIncr);
321  vRD2 = _mm_adds_epi16(vRD2, vDelIncr);
322  vRD1 = _mm_max_epi16(vRD1, vStoreOpt1);
323  vRD2 = _mm_max_epi16(vRD2, vStoreOpt2);
324 
325  vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
326  vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
327  vCD1 = _mm_max_epi16(vCD1, vStoreOpt1);
328  vCD2 = _mm_max_epi16(vCD2, vStoreOpt2);
329 
330  /* store precomputed rd */
331  _mm_store_si128(rD+i, vRD1);
332  _mm_store_si128(rD+i+subSegLen, vRD2);
333 
334  /* load precomputed opt for next iteration */
335  vStoreOpt1 = _mm_load_si128(loadOpt+i);
336  vStoreOpt2 = _mm_load_si128(loadOpt+i+subSegLen);
337  }
338 
339 
340  for(i=0;LIKELY(i<9);++i){
341  int k;
342  /* compute the gap extend penalty for the current cell */
343  __m128i vRotate = vCD2;
344  vCD2 = vCD1;
345  vCD1 = _mm_slli_si128(vRotate,2);
346  vCD1 = _mm_insert_epi16(vCD1,0x8000,0);
347 
348  for(k=0;LIKELY(k<subSegLen);++k) {
349  /* compute the current optimal value of the cell */
350  vStoreOpt1 = _mm_load_si128(storeOpt+k);
351  vStoreOpt2 = _mm_load_si128(storeOpt+k+subSegLen);
352  vStoreOpt1 = _mm_max_epi16(vStoreOpt1,vCD1);
353  vStoreOpt2 = _mm_max_epi16(vStoreOpt2,vCD2);
354  _mm_store_si128(storeOpt+k,vStoreOpt1);
355  _mm_store_si128(storeOpt+k+subSegLen,vStoreOpt2);
356 
357  /* precompute the scores for the next cell */
358  vStoreOpt1 = _mm_adds_epi16(vStoreOpt1,vDelFixed);
359  vStoreOpt2 = _mm_adds_epi16(vStoreOpt2,vDelFixed);
360  vCD1 = _mm_adds_epi16(vCD1, vDelIncr);
361  vCD2 = _mm_adds_epi16(vCD2, vDelIncr);
362 
363  /* compute the current optimal rd value */
364  vRD1 = _mm_load_si128(rD+k);
365  vRD2 = _mm_load_si128(rD+k+subSegLen);
366  vRD1 = _mm_max_epi16(vRD1,vStoreOpt1);
367  vRD2 = _mm_max_epi16(vRD2,vStoreOpt2);
368  _mm_store_si128(rD+k,vRD1);
369  _mm_store_si128(rD+k+subSegLen,vRD2);
370 
371  if(UNLIKELY(!_mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi16(vCD1,vStoreOpt1),_mm_cmpgt_epi16(vCD2,vStoreOpt2))))) goto shortcut;
372  }
373  }
374 shortcut:
375  /* store the new MaxScore for the next line block */
376  /**************************************************/
377 
378  /* store the element of storeOpt in MaxS */
379  vStoreOpt1 = _mm_load_si128(storeOpt+segLength-1);
380  vStoreOpt2 = _mm_load_si128(storeOpt+segLength+subSegLen-1);
381  }
382  vMaxScore = _mm_max_epi16(vMaxScore1, vMaxScore2);
383  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
384  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
385  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
386  MaxScore = _mm_extract_epi16(vMaxScore,0);
387  return (double)(MaxScore-0x8000);
388 }
389 
390 EXPORT double swps3_alignmentShortSSE( ProfileShort * query, const char * db, int dbLen, Options * options )
391 {
392 
393  /**********************************************************************
394  * This version of the code implements the idea presented in
395  *
396  ***********************************************************************
397  * Striped Smith-Waterman speeds database searches six times over other
398  * SIMD implementations
399  *
400  * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
401  **********************************************************************/
402 
403  int i, j;
404  u_int16_t MaxScore = 0x8000;
405  int segLength = (query->len+7)/8; /* the segment length */
406 
407  __m128i * loadOpt = query->loadOpt;
408  __m128i * storeOpt = query->storeOpt;
409  __m128i * rD = query->rD;
410  __m64 * current_profile;
411  __m128i * swap;
412 
413  __m128i vMinimums = _mm_set1_epi16(0x8000);
414 
415  __m128i vDelIncr = _mm_set1_epi16(options->gapExt);
416  __m128i vDelFixed = _mm_set1_epi16(options->gapOpen);
417  __m128i vBias = _mm_set1_epi16(query->bias);
418 
419  __m128i vMaxScore = vMinimums; /* vMaxScore = [0,0] */
420 
421  __m128i vProfile = vMinimums; /* the score profile */
422  __m128i vStoreOpt; /* the new optimal score */
423  __m128i vRD; /* the new row deletion score */
424  __m128i vCD; /* the column deletion score */
425  __m128i vTmp;
426 
427  if ( options->gapExt <= options->gapOpen ) {
428  return swps3_alignmentShortSSE_lin(query, db, dbLen, options);
429  }
430 
431  /* initialize the other arrays used for the dynProg code */
432  /*********************************************************/
433  for(i=0; LIKELY(i<segLength); i++){
434  _mm_store_si128(loadOpt+i,vMinimums);
435  _mm_store_si128(storeOpt+i,vMinimums);
436  _mm_store_si128(rD+i,vMinimums);
437  }
438 
439  /* looping through all the columns */
440  /***********************************/
441  for(j=0; LIKELY(j<dbLen); j++){
442 
443 
444  /* compute the opt and cd score depending on the previous column */
445  /*******************************************************************/
446  /* set the column deletion score to zero, has to be fixed later on */
447  vCD = vMinimums;
448 
449  /* set the opt score to the elements computed in the previous column */
450  /* set the low of storeOpt to MaxS[j] */
451  vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
452  vStoreOpt = _mm_slli_si128(vStoreOpt, 2);
453  vStoreOpt = _mm_insert_epi16(vStoreOpt,(int)0x8000,0);
454 
455  /* compute the current profile, depending on the character in s2 */
456  /*****************************************************************/
457 
458  current_profile = query->profile + db[j]*segLength;
459  /* swap the old optimal score with the new one */
460  /***********************************************/
461  swap = storeOpt;
462  storeOpt = loadOpt;
463  loadOpt = swap;
464 
465  /* main loop computing the max, precomputing etc. */
466  /**************************************************/
467  for(i=0; LIKELY(i<segLength); i++){
468 
469  vRD = _mm_load_si128(rD+i);
470  vRD = _mm_adds_epi16(vRD, vDelIncr);
471  vTmp = _mm_load_si128(loadOpt+i);
472  vTmp = _mm_adds_epi16(vTmp,vDelFixed);
473  vRD = _mm_max_epi16(vRD,vTmp);
474  _mm_store_si128(rD+i, vRD);
475 
476  /* load the current profile */
477  /*vProfile = _mm_movpi64_epi64(current_profile[i]);*/
478  /*vProfile = _mm_loadl_epi64((__m128i*)(current_profile+i));*/
479  __asm__("movq (%1),%0" : "=x" (vProfile) : "r" (current_profile+i));
480  vProfile = _mm_unpacklo_epi8(vProfile, _mm_xor_si128(vProfile,vProfile));
481  vProfile = _mm_subs_epi16(vProfile, vBias);
482 
483  /* add the profile the prev. opt */
484  vStoreOpt = _mm_adds_epi16(vStoreOpt, vProfile);
485 
486  /* update the maxscore found so far */
487  vMaxScore = _mm_max_epi16(vMaxScore, vStoreOpt);
488 
489  /* compute the correct opt score of the cell */
490  vStoreOpt = _mm_max_epi16(vStoreOpt, vCD);
491  vStoreOpt = _mm_max_epi16(vStoreOpt, vRD);
492 
493  /* store the opt score of the cell */
494  _mm_store_si128(storeOpt+i, vStoreOpt);
495 
496  /* precompute cd for next iteration */
497  vStoreOpt = _mm_adds_epi16(vStoreOpt, vDelFixed);
498  vCD = _mm_adds_epi16(vCD, vDelIncr);
499  vCD = _mm_max_epi16(vCD, vStoreOpt);
500 
501  /* load precomputed opt for next iteration */
502  vStoreOpt = _mm_load_si128(loadOpt+i);
503  }
504 
505 
506  for(i=0;LIKELY(i<8);++i){
507  int k;
508  /* compute the gap extend penalty for the current cell */
509  vCD = _mm_slli_si128(vCD,2);
510  vCD = _mm_insert_epi16(vCD,0x8000,0);
511 
512  for(k=0;LIKELY(k<segLength);++k) {
513  /* compute the current optimal value of the cell */
514  vStoreOpt = _mm_load_si128(storeOpt+k);
515  vStoreOpt = _mm_max_epi16(vStoreOpt,vCD);
516  _mm_store_si128(storeOpt+k,vStoreOpt);
517 
518  /* precompute the scores for the next cell */
519  vStoreOpt = _mm_adds_epi16(vStoreOpt,vDelFixed);
520  vCD = _mm_adds_epi16(vCD, vDelIncr);
521 
522  if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(vCD,vStoreOpt)))) goto shortcut;
523  }
524  }
525 shortcut:
526  /* store the new MaxScore for the next line block */
527  /**************************************************/
528 
529  /* store the element of storeOpt in MaxS */
530  vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
531  }
532  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 8));
533  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 4));
534  vMaxScore = _mm_max_epi16(vMaxScore, _mm_srli_si128(vMaxScore, 2));
535  MaxScore = _mm_extract_epi16(vMaxScore,0);
536  if (MaxScore == 0x7fff)
537  return DBL_MAX;
538  return (double)(u_int16_t)(MaxScore-(u_int16_t)0x8000);
539 }
540 
541 
543  free( profile );
544 }
545