swps3
DynProgr_sse_byte.c
Go to the documentation of this file.
1 
5 /*
6  * Copyright (c) 2007-2008 ETH Zürich, Institute of Computational Science
7  *
8  * Permission is hereby granted, free of charge, to any person
9  * obtaining a copy of this software and associated documentation
10  * files (the "Software"), to deal in the Software without
11  * restriction, including without limitation the rights to use,
12  * copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the
14  * Software is furnished to do so, subject to the following
15  * conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27  * OTHER DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include "DynProgr_sse_byte.h"
31 #include "debug.h"
32 #include <unistd.h>
33 #include <stdio.h>
34 #include <float.h>
35 
/*
 * Round the address/size x up to a multiple of the system page size as
 * reported by sysconf(_SC_PAGESIZE). The mask trick relies on the page
 * size being a power of two. Note that sysconf() is evaluated twice per
 * expansion.
 */
#define PAGE_ALIGN(x) \
    (((size_t)(x) + sysconf(_SC_PAGESIZE) - 1) & ~(sysconf(_SC_PAGESIZE) - 1))
37 
40 EXPORT ProfileByte * swps3_createProfileByteSSE( const char * query, int queryLen, SBMatrix matrix ){
41  int segLen = (queryLen+15)/16;
42  int i,j,k;
43  int bias = 0;
44  u_int8_t * pprofile;
45  ProfileByte * profile = malloc( sizeof(ProfileByte)+segLen*(MATRIX_DIM+3)*sizeof(__m128i)+64+2*sysconf(_SC_PAGESIZE) );
46 
47  profile->loadOpt = (__m128i*) ((size_t) (profile->data + 15) & ~(0xf)) ;
48  profile->storeOpt = profile->loadOpt + segLen;
49  profile->rD = profile->storeOpt + segLen;
50  profile->profile = (__m128i*) PAGE_ALIGN(profile->rD + segLen);
51 
52  /* Init the profile */
53  profile->len = queryLen;
54  /* Init the byte profile */
55  for(i=0; i<MATRIX_DIM; i++)
56  for(j=0; j<MATRIX_DIM; j++)
57  if (bias < -matrix[ i*MATRIX_DIM+j ])
58  bias = -matrix[ i*MATRIX_DIM+j ];
59  pprofile = (u_int8_t*)profile->profile;
60 
61  for(i=0; i<MATRIX_DIM; i++)
62  for(j=0; j<segLen; j++)
63  for(k=0; k<16; k++)
64  if(j+k*segLen < queryLen)
65  *(pprofile++) = matrix[query[j+k*segLen]*MATRIX_DIM+i]+bias;
66  else
67  *(pprofile++) = bias;
68  profile->bias = bias;
69 
70 #ifdef DEBUG
71  for(i=0; i<queryLen; ++i) debug("\t%c",query[i]+'A');
72  debug("\n");
73 #endif
74  return profile;
75 }
76 
77 
78 EXPORT double swps3_alignmentByteSSE_lin( ProfileByte * query, const char * db, int dbLen, Options * options )
79 {
80 
81  /**********************************************************************
82  * This version of the code implements the idea presented in
83  *
84  ***********************************************************************
85  * Striped Smith-Waterman speeds database searches six times over other
86  * SIMD implementations
87  *
88  * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
89  **********************************************************************/
90 
91  int i, j;
92  unsigned char MaxScore = 0;
93  int segLength = (query->len+15)/16; /* the segment length */
94 
95  __m128i * loadOpt = query->loadOpt;
96  __m128i * storeOpt = query->storeOpt;
97  __m128i * current_profile;
98  __m128i * swap;
99 
100  __m128i vMinimums = _mm_set1_epi32(0);
101 
102  __m128i vDelFixed = _mm_set1_epi8(-options->gapOpen);
103  __m128i vBias = _mm_set1_epi8(query->bias);
104 
105  __m128i vMaxScore = vMinimums; /* vMaxScore = [0,0] */
106 
107  __m128i vStoreOpt; /* the new optimal score */
108  __m128i vRD; /* the new row deletion score */
109  __m128i vCD = vMinimums; /* the column deletion score */
110  __m128i zero = vMinimums; /* the column deletion score */
111  __m128i vTmp;
112 #ifdef DEBUG
113  int ii,jj;
114 #endif
115 
116  /* initialize the other arrays used for the dynProg code */
117  /*********************************************************/
118  for(i=0; LIKELY(i<segLength); i++){
119  _mm_store_si128(loadOpt+i,zero);
120  _mm_store_si128(storeOpt+i,zero);
121  }
122 
123  /* looping through all the columns */
124  /***********************************/
125 
126  for(j=0; LIKELY(j<dbLen); j++){
127 
128  /* compute the opt and cd score depending on the previous column
129  *******************************************************************
130  * set the column deletion score to zero, has to be fixed later on */
131  vCD = zero;
132 
133  /* set the opt score to the elements computed in the previous column*/
134  /* set the low of storeOpt to MaxS[j] */
135  vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
136  vStoreOpt = _mm_slli_si128(vStoreOpt, 1);
137 
138  /* compute the current profile, depending on the character in s2 */
139  /*****************************************************************/
140  current_profile = query->profile + db[j]*segLength;
141 
142  /* swap the old optimal score with the new one */
143  /***********************************************/
144  swap = storeOpt;
145  storeOpt = loadOpt;
146  loadOpt = swap;
147 
148  /* main loop computing the max, precomputing etc. */
149  /**************************************************/
150  for(i=0; LIKELY(i<segLength); i++){
151  vTmp = _mm_load_si128(loadOpt+i);
152  vRD = _mm_subs_epu8(vTmp,vDelFixed);
153 
154  /* add the profile the prev. opt */
155  vStoreOpt = _mm_adds_epu8(vStoreOpt, *(current_profile+i));
156  vStoreOpt = _mm_subs_epu8(vStoreOpt, vBias);
157 
158  /* update the maxscore found so far (gaps only decrease score) */
159  vMaxScore = _mm_max_epu8(vMaxScore, vStoreOpt);
160 
161  /* compute the correct opt score of the cell */
162  vStoreOpt = _mm_max_epu8(vStoreOpt, vRD);
163  vStoreOpt = _mm_max_epu8(vStoreOpt, vCD);
164 
165  /* store the opt score of the cell */
166  _mm_store_si128(storeOpt+i, vStoreOpt);
167 
168  /* precompute cd for next iteration */
169  vCD = _mm_subs_epu8(vStoreOpt, vDelFixed);
170 
171  /* load precomputed opt for next iteration */
172  vStoreOpt = vTmp;
173  }
174 
175 
176  for(i=0;LIKELY(i<16);++i) {
177  int k;
178  /* compute the gap extend penalty for the current cell */
179  vCD = _mm_slli_si128(vCD,1);
180 
181  for(k=0;LIKELY(k<segLength);++k) {
182  /* compute the current optimal value of the cell */
183  vTmp = _mm_load_si128(storeOpt+k);
184  vStoreOpt = _mm_max_epu8(vTmp,vCD);
185  _mm_store_si128(storeOpt+k,vStoreOpt);
186 
187  /* break if vStoreOpt unchanged */
188  if(UNLIKELY(_mm_movemask_epi8(_mm_cmpeq_epi8(vTmp,vStoreOpt)) == 0xFFFF)) goto shortcut;
189 
190  /* precompute the scores for the next cell */
191  vCD = _mm_subs_epu8(vStoreOpt,vDelFixed);
192  }
193  }
194 shortcut:
195 
196 #ifdef DEBUG
197  debug("%c\t",db[j]);
198  for(ii=0; ii<16;++ii) {
199  for(jj=0; jj<segLength;++jj) {
200  if(ii*segLength+jj < query->len)
201  debug("%d\t",(int)((unsigned char*)storeOpt)[ii+jj*16]);
202  }
203  }
204  debug("\n");
205 #else
206  i = 0;
207 #endif
208  }
209 
210  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 8));
211  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 4));
212  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 2));
213  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 1));
214  MaxScore = (unsigned char)_mm_extract_epi16(vMaxScore,0);
215  if ((int)MaxScore + (int)query->bias >=255)
216  return DBL_MAX;
217  return((double)MaxScore);
218 }
219 
220 
221 EXPORT double swps3_alignmentByteSSE( ProfileByte * query, const char * db, int dbLen, Options * options )
222 {
223 
224  /**********************************************************************
225  * This version of the code implements the idea presented in
226  *
227  ***********************************************************************
228  * Striped Smith-Waterman speeds database searches six times over other
229  * SIMD implementations
230  *
231  * Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
232  **********************************************************************/
233 
234  int i, j;
235  unsigned char MaxScore = 0;
236  int segLength = (query->len+15)/16; /* the segment length */
237 
238  __m128i * loadOpt = query->loadOpt;
239  __m128i * storeOpt = query->storeOpt;
240  __m128i * rD = query->rD;
241  __m128i * current_profile;
242  __m128i * swap;
243 
244  __m128i vMinimums = _mm_set1_epi32(0);
245 
246  __m128i vDelIncr = _mm_set1_epi8(-options->gapExt);
247  __m128i vDelFixed = _mm_set1_epi8(-options->gapOpen);
248  __m128i vBias = _mm_set1_epi8(query->bias);
249 
250  __m128i vMaxScore = vMinimums; /* vMaxScore = [0,0] */
251 
252  __m128i vStoreOpt; /* the new optimal score */
253  __m128i vRD; /* the new row deletion score */
254  __m128i vCD = vMinimums; /* the column deletion score */
255  __m128i zero = vMinimums; /* the column deletion score */
256  __m128i vTmp;
257 #ifdef DEBUG
258  int ii,jj;
259 #endif
260 
261  if ( options->gapExt <= options->gapOpen ) {
262  return swps3_alignmentByteSSE_lin(query, db, dbLen, options);
263  }
264 
265  /* initialize the other arrays used for the dynProg code */
266  /*********************************************************/
267  for(i=0; LIKELY(i<segLength); i++){
268  _mm_store_si128(loadOpt+i,zero);
269  _mm_store_si128(storeOpt+i,zero);
270  _mm_store_si128(rD+i,zero);
271  }
272 
273  /* looping through all the columns */
274  /***********************************/
275 
276  for(j=0; LIKELY(j<dbLen); j++) {
277  /* compute the opt and cd score depending on the previous column
278  *******************************************************************
279  * set the column deletion score to zero, has to be fixed later on */
280  vCD = zero;
281 
282  /* set the opt score to the elements computed in the previous column*/
283  vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
284  vStoreOpt = _mm_slli_si128(vStoreOpt, 1);
285 
286  /* compute the current profile, depending on the character in s2 */
287  /*****************************************************************/
288  current_profile = query->profile + db[j]*segLength;
289 
290  /* swap the old optimal score with the new one */
291  /***********************************************/
292  swap = storeOpt;
293  storeOpt = loadOpt;
294  loadOpt = swap;
295 
296  /* main loop computing the max, precomputing etc. */
297  /**************************************************/
298  for(i=0; LIKELY(i<segLength); i++){
299  vRD = _mm_load_si128(rD+i);
300  vRD = _mm_subs_epu8(vRD, vDelIncr);
301  vTmp = _mm_load_si128(loadOpt+i);
302  vTmp = _mm_subs_epu8(vTmp,vDelFixed);
303  vRD = _mm_max_epu8(vRD,vTmp);
304  _mm_store_si128(rD+i, vRD);
305 
306  /* add the profile the prev. opt */
307  vStoreOpt = _mm_adds_epu8(vStoreOpt, *(current_profile+i));
308  vStoreOpt = _mm_subs_epu8(vStoreOpt, vBias);
309 
310  /* update the maxscore found so far (gaps only decrease score) */
311  vMaxScore = _mm_max_epu8(vMaxScore, vStoreOpt);
312 
313  /* compute the correct opt score of the cell */
314  vStoreOpt = _mm_max_epu8(vStoreOpt, vRD);
315  vStoreOpt = _mm_max_epu8(vStoreOpt, vCD);
316 
317  /* store the opt score of the cell */
318  _mm_store_si128(storeOpt+i, vStoreOpt);
319 
320  /* precompute cd for next iteration */
321  vStoreOpt = _mm_subs_epu8(vStoreOpt, vDelFixed);
322  vCD = _mm_subs_epu8(vCD, vDelIncr);
323  vCD = _mm_max_epu8(vCD, vStoreOpt);
324 
325  /* load precomputed opt for next iteration */
326  vStoreOpt = _mm_load_si128(loadOpt+i);
327  }
328 
329 
330  for(i=0;LIKELY(i<16);++i) {
331  int k;
332  /* compute the gap extend penalty for the current cell */
333  vCD = _mm_slli_si128(vCD,1);
334 
335  for(k=0;LIKELY(k<segLength);++k) {
336  /* compute the current optimal value of the cell */
337  vStoreOpt = _mm_load_si128(storeOpt+k);
338  vStoreOpt = _mm_max_epu8(vStoreOpt,vCD);
339 
340  _mm_store_si128(storeOpt+k,vStoreOpt);
341 
342  /* precompute the scores for the next cell */
343  vStoreOpt = _mm_subs_epu8(vStoreOpt,vDelFixed);
344  vCD = _mm_subs_epu8(vCD, vDelIncr);
345 
346  if(UNLIKELY(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(vCD,vStoreOpt),zero)) == 0xFFFF)) goto shortcut;
347  }
348  }
349 shortcut:
350 
351 #ifdef DEBUG
352  debug("%c\t",db[j]);
353  for(ii=0; ii<16;++ii) {
354  for(jj=0; jj<segLength;++jj) {
355  if(ii*segLength+jj < query->len)
356  debug("%d\t",(int)((unsigned char*)storeOpt)[ii+jj*16]);
357  }
358  }
359  debug("\n");
360 #else
361  i = 0;
362 #endif
363  }
364 
365  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 8));
366  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 4));
367  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 2));
368  vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 1));
369  MaxScore = (unsigned char)_mm_extract_epi16(vMaxScore,0);
370  if ((int)MaxScore + (int)query->bias >=255)
371  return DBL_MAX;
372  return((double)MaxScore);
373 }
374 
375 
377  free( profile );
378 }
379