From: Michael R. Crusoe <crusoe@debian.org>
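Subject: Fix use of some intrinsics when compiling for SSE2 only

Summary of the changes made by this patch:

  * intersect-simd.c, intersect-uint2.c, intersect-approx-simd.c:
    _mm_lddqu_si128 is only used when HAVE_SSSE3 is defined (the
    intrinsic is actually SSE3, as noted in the added comments);
    otherwise the loads fall back to _mm_loadu_si128.
  * intersect-uint2.c: intersect_vector16 is also compiled when
    HAVE_SSE4_2 is not defined, not only when HAVE_STTNI is missing.
  * genomebits.h, bitvector.c, oligoindex_hr.c, gmap.c, gmapindex.c,
    gsnap.c: the HAVE_LZCNT / HAVE_TZCNT branches (_lzcnt_u32,
    _lzcnt_u64, _tzcnt_u32, _tzcnt_u64) are disabled with "#if 0", so
    the remaining __builtin_clz / __builtin_ctz or table-based branches
    are selected instead.
  * genomebits.h: the _mm_extract_epi64 fallback branches additionally
    require HAVE_SSE4_1.
  * popcount.c, popcount.h: the guards around mod_37_bit_position and
    clz_table no longer consult HAVE_TZCNT / HAVE_LZCNT, matching the
    disabled branches above.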
--- gmap.orig/src/intersect-simd.c
+++ gmap/src/intersect-simd.c
@@ -238,9 +238,12 @@
   maxFreq = freq[2 * 4 - 1];
 #ifdef HAVE_AVX2
   F = _mm256_loadu_si256((const __m256i *)(freq));
-#else
+#elif defined(HAVE_SSSE3)  // Actually SSE3
   F0 = _mm_lddqu_si128((const __m128i *)(freq));
   F1 = _mm_lddqu_si128((const __m128i *)(freq + 4));
+#else 
+  F0 = _mm_loadu_si128((const __m128i *)(freq));
+  F1 = _mm_loadu_si128((const __m128i *)(freq + 4));
 #endif
 
   if (COMPILER_RARELY(maxFreq < valRare))
@@ -281,8 +284,13 @@
     if (_mm_movemask_epi8(F0)) {
       matchOut++;
     }
+#ifdef HAVE_SSSE3  // Actually SSE3
     F0 = _mm_lddqu_si128((const __m128i *)(freq));
     F1 = _mm_lddqu_si128((const __m128i *)(freq + 4));
+#else
+    F0 = _mm_loadu_si128((const __m128i *)(freq));
+    F1 = _mm_loadu_si128((const __m128i *)(freq + 4));
+#endif
 #endif
 
   } while (maxFreq >= valRare);
@@ -305,9 +313,12 @@
 
 #ifdef HAVE_AVX2
   F = _mm256_loadu_si256((const __m256i *)(freq));
-#else
+#elif defined(HAVE_SSSE3)  // Actually SSE3
   F0 = _mm_lddqu_si128((const __m128i *)(freq));
   F1 = _mm_lddqu_si128((const __m128i *)(freq + 4));
+#else 
+  F0 = _mm_loadu_si128((const __m128i *)(freq));
+  F1 = _mm_loadu_si128((const __m128i *)(freq + 4));
 #endif
 
   goto ADVANCE_RARE;
--- gmap.orig/src/intersect-uint2.c
+++ gmap/src/intersect-uint2.c
@@ -100,7 +100,7 @@
 
 
 #ifdef HAVE_SSE2
-#if !defined(HAVE_STTNI)
+#if !defined(HAVE_STTNI) || !defined(HAVE_SSE4_2)
 
 static int
 intersect_vector16 (uint16_t *A, uint16_t deltaA, uint16_t **Bptr, uint16_t *B,
@@ -478,7 +478,11 @@
   if ((i_a < st_a) && (i_b < st_b)) {
     v_delta_a = _mm_set1_epi16(deltaA);
     v_a = _mm_add_epi16(_mm_loadu_si128((const __m128i *) &A[i_a]), v_delta_a);
+#if defined(HAVE_SSSE3)  // Actually SSE3
     v_b = _mm_lddqu_si128((const __m128i *) &B[i_b]);
+#else
+    v_b = _mm_loadu_si128((const __m128i *) &B[i_b]);
+#endif
 #ifdef DEBUG
     print_vector("v_a",v_a);
     print_vector("v_b",v_b);
@@ -510,7 +514,11 @@
         if (i_b == st_b) {
           break;
 	}
+#if defined(HAVE_SSSE3)  // Actually SSE3
         v_b = _mm_lddqu_si128((const __m128i *) &B[i_b]);
+#else
+        v_b = _mm_loadu_si128((const __m128i *) &B[i_b]);
+#endif
       }
 
 #ifdef DEBUG
@@ -544,7 +552,11 @@
           if (i_b == st_b) {
             break;
 	  }
+#if defined(HAVE_SSSE3)  // Actually SSE3
           v_b = _mm_lddqu_si128((const __m128i *) &B[i_b]);
+#else
+          v_b = _mm_loadu_si128((const __m128i *) &B[i_b]);
+#endif
         }
 #ifdef DEBUG
 	print_vector("v_a",v_a);
--- gmap.orig/src/genomebits.h
+++ gmap/src/genomebits.h
@@ -144,7 +144,7 @@
 #define clear_lowbit_64(diff,relpos) (diff & (diff - 1))
 
 
-#if defined(HAVE_LZCNT)
+#if 0
 #define count_leading_zeroes_32(diff) _lzcnt_u32(diff)
 #define count_leading_zeroes_64(diff) _lzcnt_u64(diff)
 #elif defined(HAVE_BUILTIN_CLZ)
@@ -155,7 +155,7 @@
 #define count_leading_zeroes_64(diff) ((diff >> 48) ? clz_table[diff >> 48] : ((diff >> 32) ? 16 + clz_table[diff >> 32] : ((diff >> 16) ? 32 + clz_table[diff >> 16] : 48 + clz_table[diff])))
 #endif
 
-#if defined(HAVE_TZCNT)
+#if 0
 #define count_trailing_zeroes_32(diff) _tzcnt_u32(diff)
 #define count_trailing_zeroes_64(diff) _tzcnt_u64(diff)
 #elif defined(HAVE_BUILTIN_CTZ)
@@ -320,7 +320,7 @@
   debugx(printf("Entered count_leading_zeroes_128 with "));
   debugx(print_vector_hex(_diff));
 
-#if defined(HAVE_LZCNT) && defined(HAVE_MM_EXTRACT_EPI64)
+#if 0 && defined(HAVE_MM_EXTRACT_EPI64)
   UINT8 x;
 
   if ((x = _mm_extract_epi64(_diff,1)) != 0) {
@@ -329,7 +329,7 @@
     return 64 + (int) _lzcnt_u64(_mm_extract_epi64(_diff,0));
   }
 
-#elif defined(HAVE_MM_EXTRACT_EPI64)
+#elif defined(HAVE_MM_EXTRACT_EPI64) && defined(HAVE_SSE4_1)
   UINT8 x;
 
   if ((x = _mm_extract_epi64(_diff,1)) != 0) {
@@ -366,7 +366,7 @@
   debugx(printf("Entered count_trailing_zeroes_128 with "));
   debugx(print_vector_hex(_diff));
 
-#if defined(HAVE_TZCNT) && defined(HAVE_MM_EXTRACT_EPI64)
+#if 0 && defined(HAVE_MM_EXTRACT_EPI64)
   UINT8 x;
 
   if ((x = _mm_extract_epi64(_diff,0)) != 0) {
@@ -375,7 +375,7 @@
     return 64 + (int) _tzcnt_u64(_mm_extract_epi64(_diff,1));
   }
 
-#elif defined(HAVE_MM_EXTRACT_EPI64)
+#elif defined(HAVE_MM_EXTRACT_EPI64) && defined(HAVE_SSE4_1)
   UINT8 x;
 
   if ((x = _mm_extract_epi64(_diff,0)) != 0) {
--- gmap.orig/src/intersect-approx-simd.c
+++ gmap/src/intersect-approx-simd.c
@@ -404,9 +404,17 @@
   Rare_low = _mm_set1_epi32(valRare - slop_plus_1 - EPI32_MAX);
   Rare_high = _mm_set1_epi32(valRare + slop_plus_1 - EPI32_MAX);
 
+#if defined(HAVE_SSSE3)  // Actually SSE3
   F0 = _mm_lddqu_si128((const __m128i *)(freq));
+#else
+  F0 = _mm_loadu_si128((const __m128i *)(freq));
+#endif
   F0 = _mm_sub_epi32(F0, _epi32_offset);
+#if defined(HAVE_SSSE3)  // Actually SSE3
   F1 = _mm_lddqu_si128((const __m128i *)(freq + 4));
+#else
+  F1 = _mm_loadu_si128((const __m128i *)(freq + 4));
+#endif
   F1 = _mm_sub_epi32(F1, _epi32_offset);
 #endif
 
@@ -504,9 +512,17 @@
 
       idx = _mm256_movemask_ps((__m256) M);
 #else
+#if defined(HAVE_SSSE3)  // Actually SSE3
       F0 = _mm_lddqu_si128((const __m128i *)(freq));
+#else
+      F0 = _mm_loadu_si128((const __m128i *)(freq));
+#endif
       F0 = _mm_sub_epi32(F0, _epi32_offset);
+#if defined(HAVE_SSSE3)  // Actually SSE3
       F1 = _mm_lddqu_si128((const __m128i *)(freq + 4));
+#else
+      F1 = _mm_loadu_si128((const __m128i *)(freq + 4));
+#endif
       F1 = _mm_sub_epi32(F1, _epi32_offset);
 
       M_above = _mm_cmpgt_epi32(F1, Rare_low);
@@ -610,9 +626,17 @@
 	F = _mm256_loadu_si256((const __m256i *)(freq));
 	F = _mm256_sub_epi32(F, _epi32_offset);
 #else
+#if defined(HAVE_SSSE3)  // Actually SSE3
 	F0 = _mm_lddqu_si128((const __m128i *)(freq));
+#else
+	F0 = _mm_loadu_si128((const __m128i *)(freq));
+#endif
 	F0 = _mm_sub_epi32(F0, _epi32_offset);
+#if defined(HAVE_SSSE3)  // Actually SSE3
 	F1 = _mm_lddqu_si128((const __m128i *)(freq + 4));
+#else
+	F1 = _mm_loadu_si128((const __m128i *)(freq + 4));
+#endif
 	F1 = _mm_sub_epi32(F1, _epi32_offset);
 #endif
       }
--- gmap.orig/src/bitvector.c
+++ gmap/src/bitvector.c
@@ -15,7 +15,7 @@
 
 #if !defined(HAVE_SSE4_2)
 #define count_leading_zeroes_32(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
-#elif defined(HAVE_LZCNT)
+#elif 0
 #define count_leading_zeroes_32(diff) _lzcnt_u32(diff)
 #elif defined(HAVE_BUILTIN_CLZ)
 #define count_leading_zeroes_32(diff) __builtin_clz(diff)
@@ -25,7 +25,7 @@
 
 #if !defined(HAVE_SSE4_2)
 #define count_trailing_zeroes_32(elt) mod_37_bit_position[(-elt & elt) % 37]
-#elif defined(HAVE_TZCNT)
+#elif 0
 #define count_trailing_zeroes_32(elt) _tzcnt_u32(elt)
 #elif defined(HAVE_BUILTIN_CTZ)
 #define count_trailing_zeroes_32(elt) __builtin_ctz(elt)
--- gmap.orig/src/gmap.c
+++ gmap/src/gmap.c
@@ -860,13 +860,13 @@
 #ifdef HAVE_SSE4_2
   fprintf(stderr,"Checking compiler options for SSE4.2: ");
   fprintf(stderr,"%08X ",x);
-#ifdef HAVE_LZCNT
+#if 0
   fprintf(stderr,"_lzcnt_u32=%d ",_lzcnt_u32(x));
 #endif
 #ifdef HAVE_BUILTIN_CLZ
   fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
 #endif
-#ifdef HAVE_TZCNT
+#if 0
   fprintf(stderr,"_tzcnt_u32=%d ",_tzcnt_u32(x));
 #endif
 #ifdef HAVE_BUILTIN_CTZ
--- gmap.orig/src/gmapindex.c
+++ gmap/src/gmapindex.c
@@ -148,13 +148,13 @@
 
   fprintf(stderr,"Checking compiler assumptions for popcnt: ");
   fprintf(stderr,"%08X ",x);
-#ifdef HAVE_LZCNT
+#if 0
   fprintf(stderr,"_lzcnt_u32=%d ",_lzcnt_u32(x));
 #endif
 #ifdef HAVE_BUILTIN_CLZ
   fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
 #endif
-#ifdef HAVE_TZCNT
+#if 0
   fprintf(stderr,"_tzcnt_u32=%d ",_tzcnt_u32(x));
 #endif
 #ifdef HAVE_BUILTIN_CTZ
--- gmap.orig/src/gsnap.c
+++ gmap/src/gsnap.c
@@ -931,13 +931,13 @@
 #ifdef HAVE_SSE4_2
   fprintf(stderr,"Checking compiler assumptions for SSE4.2 options: ");
   fprintf(stderr,"%08X ",x);
-#ifdef HAVE_LZCNT
+#if 0
   fprintf(stderr,"_lzcnt_u32=%d ",_lzcnt_u32(x));
 #endif
 #ifdef HAVE_BUILTIN_CLZ
   fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
 #endif
-#ifdef HAVE_TZCNT
+#if 0
   fprintf(stderr,"_tzcnt_u32=%d ",_tzcnt_u32(x));
 #endif
 #ifdef HAVE_BUILTIN_CTZ
--- gmap.orig/src/oligoindex_hr.c
+++ gmap/src/oligoindex_hr.c
@@ -9904,7 +9904,7 @@
 
 #if !defined(HAVE_SSE4_2)
 #define count_trailing_zeroes_32(diff) mod_37_bit_position[(-diff & diff) % 37]
-#elif defined(HAVE_TZCNT)
+#elif 0
 #define count_trailing_zeroes_32(diff) _tzcnt_u32(diff)
 #elif defined(HAVE_BUILTIN_CTZ)
 #define count_trailing_zeroes_32(diff) __builtin_ctz(diff)
--- gmap.orig/src/popcount.c
+++ gmap/src/popcount.c
@@ -4,7 +4,7 @@
 #endif
 
 
-#if !defined(HAVE_SSE4_2) || (!defined(HAVE_TZCNT) && !defined(HAVE_BUILTIN_CTZ))
+#if !defined(HAVE_SSE4_2) || !defined(HAVE_BUILTIN_CTZ)
 const int mod_37_bit_position[] = 
   {
     32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
@@ -2066,7 +2066,7 @@
  };
 #endif
 
-#if !defined(HAVE_SSE4_2) || (!defined(HAVE_LZCNT) && !defined(HAVE_BUILTIN_CLZ))
+#if !defined(HAVE_SSE4_2) || !defined(HAVE_BUILTIN_CLZ)
 const int clz_table[] =
 {16,15,14,14,13,13,13,13,12,12,12,12,12,12,12,12,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
  10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,
--- gmap.orig/src/popcount.h
+++ gmap/src/popcount.h
@@ -5,7 +5,7 @@
 #include "config.h"		/* For HAVE_BUILTIN_CTZ, HAVE_BUILTIN_POPCOUNT, HAVE_BUILTIN_CLZ */
 #endif
 
-#if !defined(HAVE_SSE4_2) || (!defined(HAVE_TZCNT) && !defined(HAVE_BUILTIN_CTZ))
+#if !defined(HAVE_SSE4_2) || !defined(HAVE_BUILTIN_CTZ)
 extern const int mod_37_bit_position[];
 #endif
 
@@ -13,7 +13,7 @@
 extern const int count_bits[];
 #endif
 
-#if !defined(HAVE_SSE4_2) || (!defined(HAVE_LZCNT) && !defined(HAVE_BUILTIN_CLZ))
+#if !defined(HAVE_SSE4_2) || !defined(HAVE_BUILTIN_CLZ)
 extern const int clz_table[];
 #endif
 
