Author: Michael R. Crusoe <michael.crusoe@gmail.com>
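Description: Use the SIMD Everywhere (SIMDe) library instead of native x86 intrinsics
 Replaces the <xmmintrin.h>/<immintrin.h> includes with the SIMDe headers
 shipped under debian/include/simde and renames the AVX/AVX2/SSE types and
 intrinsics to their simde_ equivalents, so the code no longer depends on the
 compiler's x86-only intrinsic headers. When SIMDE_SSE_NATIVE is not defined,
 _mm_malloc/_mm_free are mapped onto aligned_alloc/free.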
--- kalign.orig/src/alignment.c
+++ kalign/src/alignment.c
@@ -20,7 +20,7 @@
 
 */
 
-#include <xmmintrin.h>
+#include "../debian/include/simde/x86/sse.h"
 #include "alignment.h"
 
 #include "pick_anchor.h"
--- kalign.orig/src/bisectingKmeans.c
+++ kalign/src/bisectingKmeans.c
@@ -20,7 +20,12 @@
 
 */
 
-#include <xmmintrin.h>
+#include "../debian/include/simde/x86/sse.h"
+#if !defined(SIMDE_SSE_NATIVE) /* SIMDe is not using native SSE: provide _mm_malloc/_mm_free via C11 aligned_alloc()/free() */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc((align), (((size) + (align) - 1) / (align)) * (align)) /* C11 requires size to be a multiple of align, so round it up */
+  #define _mm_free free
+#endif
 
 #include "msa.h"
 
--- kalign.orig/src/bpm.c
+++ kalign/src/bpm.c
@@ -27,16 +27,16 @@
 
 
 
+#include "../debian/include/simde/x86/avx2.h"
 #ifdef HAVE_AVX2
-#include <immintrin.h>
 
-__m256i BROADCAST_MASK[16];
+simde__m256i BROADCAST_MASK[16];
 
- void bitShiftLeft256ymm (__m256i *data, int count);
-__m256i bitShiftRight256ymm (__m256i *data, int count);
+ void bitShiftLeft256ymm (simde__m256i *data, int count);
+simde__m256i bitShiftRight256ymm (simde__m256i *data, int count);
 
 /* taken from Alexander Yee: http://www.numberworld.org/y-cruncher/internals/addition.html#ks_add */
- __m256i add256(uint32_t carry, __m256i A, __m256i B);
+ simde__m256i add256(uint32_t carry, simde__m256i A, simde__m256i B);
 #endif
 
 /* Below are test functions  */
@@ -53,8 +53,8 @@
 
 #ifdef HAVE_AVX2
 /* For debugging */
-void print_256(__m256i X);
-void print_256_all(__m256i X);
+void print_256(simde__m256i X);
+void print_256_all(simde__m256i X);
 #endif
 
 /* The actual test.  */
@@ -373,18 +373,18 @@
 
 }
 
-void print_256(__m256i X)
+void print_256(simde__m256i X)
 {
         alignas(32) uint64_t debug[4];
-        _mm256_store_si256( (__m256i*)& debug,X);
+        simde_mm256_store_si256( (simde__m256i*)& debug,X);
         fprintf(stdout,"%lu ", debug[0]);
 }
 
 
-void print_256_all(__m256i X)
+void print_256_all(simde__m256i X)
 {
         alignas(32) uint64_t debug[4];
-        _mm256_store_si256( (__m256i*)& debug,X);
+        simde_mm256_store_si256( (simde__m256i*)& debug,X);
         int i;
         for(i = 0; i < 4;i++){
                 fprintf(stdout,"%lu ", debug[i]);
@@ -462,10 +462,10 @@
 #ifdef HAVE_AVX2
 uint8_t bpm_256(const uint8_t* t,const uint8_t* p,int n,int m)
 {
-        __m256i VP,VN,D0,HN,HP,X,NOTONE;
-        __m256i xmm1,xmm2;
-        __m256i MASK;
-        __m256i B[13];
+        simde__m256i VP,VN,D0,HN,HP,X,NOTONE;
+        simde__m256i xmm1,xmm2;
+        simde__m256i MASK;
+        simde__m256i B[13];
 
         int i,j, k,diff;
 
@@ -486,16 +486,16 @@
         }
 
         for(i = 0; i < 13;i++){
-                B[i] = _mm256_load_si256((__m256i const*) &f[i]);
+                B[i] = simde_mm256_load_si256((simde__m256i const*) &f[i]);
         }
 
         diff = m;
         k = m;
 
-        VP     = _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFFul);
-        VN     = _mm256_setzero_si256();
-        NOTONE = _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFFul);
-        MASK   = _mm256_set_epi64x (0ul,0ul,0ul,1);
+        VP     = simde_mm256_set1_epi64x(0xFFFFFFFFFFFFFFFFul);
+        VN     = simde_mm256_setzero_si256();
+        NOTONE = simde_mm256_set1_epi64x(0xFFFFFFFFFFFFFFFFul);
+        MASK   = simde_mm256_set_epi64x (0ul,0ul,0ul,1);
         m--;
 
         i = m / 64;
@@ -508,26 +508,26 @@
         for(i = 0; i < n ;i++){
                 //X = (B[(int) *t] | VN);
 
-                X = _mm256_or_si256(B[t[i]], VN);
+                X = simde_mm256_or_si256(B[t[i]], VN);
                                 //fprintf(stdout,"%lu ", X);
                 //D0 = ((VP+(X&VP)) ^ VP) | X ;
 
                 //print_256(X);
-                xmm1 = _mm256_and_si256(X, VP);
+                xmm1 = simde_mm256_and_si256(X, VP);
 
                 xmm2 = add256(0, VP, xmm1);
-                //xmm2 = _mm256_add_epi64(VP, xmm1);
-                xmm1 = _mm256_xor_si256(xmm2, VP);
-                D0 = _mm256_or_si256(xmm1, X);
+                //xmm2 = simde_mm256_add_epi64(VP, xmm1);
+                xmm1 = simde_mm256_xor_si256(xmm2, VP);
+                D0 = simde_mm256_or_si256(xmm1, X);
                 //print_256(D0);
                 //HN = VP & D0;
-                HN =_mm256_and_si256(VP, D0);
+                HN =simde_mm256_and_si256(VP, D0);
                 //print_256(HN);
                 //HP = VN | ~(VP | D0);
 
-                xmm1 = _mm256_or_si256(VP, D0);
-                xmm2 = _mm256_andnot_si256(xmm1, NOTONE);
-                HP = _mm256_or_si256(VN, xmm2);
+                xmm1 = simde_mm256_or_si256(VP, D0);
+                xmm2 = simde_mm256_andnot_si256(xmm1, NOTONE);
+                HP = simde_mm256_or_si256(VN, xmm2);
                 //print_256(HP);
 
                 //X = HP << 1ul;
@@ -536,28 +536,28 @@
 
                 //print_256(X);
                 //VN = X & D0;
-                VN= _mm256_and_si256(X, D0);
+                VN= simde_mm256_and_si256(X, D0);
                 //print_256(VN);
                 //VP = (HN << 1ul) | ~(X | D0);
                 xmm1 = HN;
                 bitShiftLeft256ymm(&xmm1, 1);
 
-                xmm2 = _mm256_or_si256(X, D0);
+                xmm2 = simde_mm256_or_si256(X, D0);
 
-                xmm2 = _mm256_andnot_si256(xmm2, NOTONE);
-                //xmm2 = _mm_andnot_si128 (xmm2,NOTONE);
-                VP = _mm256_or_si256(xmm1, xmm2);
+                xmm2 = simde_mm256_andnot_si256(xmm2, NOTONE);
+                //xmm2 = simde_mm_andnot_si128 (xmm2,NOTONE);
+                VP = simde_mm256_or_si256(xmm1, xmm2);
                 //print_256(VP);
 
 
                 //diff += (HP & MASK) >> m;
-                diff += 1- _mm256_testz_si256(HP, MASK);
+                diff += 1- simde_mm256_testz_si256(HP, MASK);
 
                 ///diff -= (HN & MASK) >> m;
-                diff -= 1- _mm256_testz_si256(HN,MASK);
+                diff -= 1- simde_mm256_testz_si256(HN,MASK);
 
                 //fprintf(stdout,"%d ",diff);
-                //xmm1 = _mm256_cmpgt_epi64(K, diff);
+                //xmm1 = simde_mm256_cmpgt_epi64(K, diff);
                 k = MACRO_MIN(k, diff);
         }
         return k;
@@ -569,33 +569,33 @@
 /* Must be called before BPM_256 is!!!  */
 void set_broadcast_mask(void)
 {
-        BROADCAST_MASK[0] =  _mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-        BROADCAST_MASK[1] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000001);
-        BROADCAST_MASK[2] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000001, 0x8000000000000000);
-        BROADCAST_MASK[3] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000001, 0x8000000000000001);
-        BROADCAST_MASK[4] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000000, 0x8000000000000000);
-        BROADCAST_MASK[5] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000000, 0x8000000000000001);
-        BROADCAST_MASK[6] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000001, 0x8000000000000000);
-        BROADCAST_MASK[7] = _mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000001, 0x8000000000000001);
-        BROADCAST_MASK[8] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-        BROADCAST_MASK[9] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000000, 0x8000000000000001);
-        BROADCAST_MASK[10] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000001, 0x8000000000000000);
-        BROADCAST_MASK[11] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000001, 0x8000000000000001);
-        BROADCAST_MASK[12] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000000, 0x8000000000000000);
-        BROADCAST_MASK[13] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000000, 0x8000000000000001);
-        BROADCAST_MASK[14] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000001, 0x8000000000000000);
-        BROADCAST_MASK[15] = _mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000001, 0x8000000000000001);
+        BROADCAST_MASK[0] =  simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
+        BROADCAST_MASK[1] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000001);
+        BROADCAST_MASK[2] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000001, 0x8000000000000000);
+        BROADCAST_MASK[3] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000001, 0x8000000000000001);
+        BROADCAST_MASK[4] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000000, 0x8000000000000000);
+        BROADCAST_MASK[5] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000000, 0x8000000000000001);
+        BROADCAST_MASK[6] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000001, 0x8000000000000000);
+        BROADCAST_MASK[7] = simde_mm256_set_epi64x(0x8000000000000000, 0x8000000000000001, 0x8000000000000001, 0x8000000000000001);
+        BROADCAST_MASK[8] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
+        BROADCAST_MASK[9] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000000, 0x8000000000000001);
+        BROADCAST_MASK[10] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000001, 0x8000000000000000);
+        BROADCAST_MASK[11] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000000, 0x8000000000000001, 0x8000000000000001);
+        BROADCAST_MASK[12] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000000, 0x8000000000000000);
+        BROADCAST_MASK[13] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000000, 0x8000000000000001);
+        BROADCAST_MASK[14] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000001, 0x8000000000000000);
+        BROADCAST_MASK[15] = simde_mm256_set_epi64x(0x8000000000000001, 0x8000000000000001, 0x8000000000000001, 0x8000000000000001);
 }
 
 
-__m256i add256(uint32_t carry, __m256i A, __m256i B)
-{
-        A = _mm256_xor_si256(A, _mm256_set1_epi64x(0x8000000000000000));
-        __m256i s = _mm256_add_epi64(A, B);
-        __m256i cv = _mm256_cmpgt_epi64(A, s);
-        __m256i mv = _mm256_cmpeq_epi64(s, _mm256_set1_epi64x(0x7fffffffffffffff));
-        uint32_t c = _mm256_movemask_pd(_mm256_castsi256_pd(cv));
-        uint32_t m = _mm256_movemask_pd(_mm256_castsi256_pd(mv));
+simde__m256i add256(uint32_t carry, simde__m256i A, simde__m256i B)
+{
+        A = simde_mm256_xor_si256(A, simde_mm256_set1_epi64x(0x8000000000000000));
+        simde__m256i s = simde_mm256_add_epi64(A, B);
+        simde__m256i cv = simde_mm256_cmpgt_epi64(A, s);
+        simde__m256i mv = simde_mm256_cmpeq_epi64(s, simde_mm256_set1_epi64x(0x7fffffffffffffff));
+        uint32_t c = simde_mm256_movemask_pd(simde_mm256_castsi256_pd(cv));
+        uint32_t m = simde_mm256_movemask_pd(simde_mm256_castsi256_pd(mv));
 
         {
                 c = m + 2*c; //  lea
@@ -604,41 +604,41 @@
                 carry >>= 4;
                 m &= 0x0f;
         }
-        return _mm256_add_epi64(s, BROADCAST_MASK[m]);
+        return simde_mm256_add_epi64(s, BROADCAST_MASK[m]);
 }
 
 
 //----------------------------------------------------------------------------
 // bit shift left a 256-bit value using ymm registers
-//          __m256i *data - data to shift
+//          simde__m256i *data - data to shift
 //          int count     - number of bits to shift
-// return:  __m256i       - carry out bit(s)
+// return:  simde__m256i       - carry out bit(s)
 
-void bitShiftLeft256ymm (__m256i *data, int count)
+void bitShiftLeft256ymm (simde__m256i *data, int count)
 {
-        __m256i innerCarry, rotate;
+        simde__m256i innerCarry, rotate;
 
-        innerCarry = _mm256_srli_epi64 (*data, 64 - count);                        // carry outs in bit 0 of each qword
-        rotate     = _mm256_permute4x64_epi64 (innerCarry, 0x93);                  // rotate ymm left 64 bits
-        innerCarry = _mm256_blend_epi32 (_mm256_setzero_si256 (), rotate, 0xFC);   // clear lower qword
-        *data    = _mm256_slli_epi64 (*data, count);                               // shift all qwords left
-        *data    = _mm256_or_si256 (*data, innerCarry);                            // propagate carrys from low qwords
-        //carryOut   = _mm256_xor_si256 (innerCarry, rotate);                        // clear all except lower qword
+        innerCarry = simde_mm256_srli_epi64 (*data, 64 - count);                        // carry outs in bit 0 of each qword
+        rotate     = simde_mm256_permute4x64_epi64 (innerCarry, 0x93);                  // rotate ymm left 64 bits
+        innerCarry = simde_mm256_blend_epi32 (simde_mm256_setzero_si256 (), rotate, 0xFC);   // clear lower qword
+        *data    = simde_mm256_slli_epi64 (*data, count);                               // shift all qwords left
+        *data    = simde_mm256_or_si256 (*data, innerCarry);                            // propagate carrys from low qwords
+        //carryOut   = simde_mm256_xor_si256 (innerCarry, rotate);                        // clear all except lower qword
         //return carryOut;
 }
 
-__m256i bitShiftRight256ymm (__m256i *data, int count)
+simde__m256i bitShiftRight256ymm (simde__m256i *data, int count)
 {
-        __m256i innerCarry, carryOut, rotate;
+        simde__m256i innerCarry, carryOut, rotate;
 
 
-        innerCarry = _mm256_slli_epi64(*data, 64 - count);
-        rotate =  _mm256_permute4x64_epi64 (innerCarry, 0x39);
-        innerCarry = _mm256_blend_epi32 (_mm256_setzero_si256 (), rotate, 0x3F);
-        *data = _mm256_srli_epi64(*data, count);
-        *data = _mm256_or_si256(*data,  innerCarry);
+        innerCarry = simde_mm256_slli_epi64(*data, 64 - count);
+        rotate =  simde_mm256_permute4x64_epi64 (innerCarry, 0x39);
+        innerCarry = simde_mm256_blend_epi32 (simde_mm256_setzero_si256 (), rotate, 0x3F);
+        *data = simde_mm256_srli_epi64(*data, count);
+        *data = simde_mm256_or_si256(*data,  innerCarry);
 
-        carryOut   = _mm256_xor_si256 (innerCarry, rotate);                        //FIXME: not sure if this is correct!!!
+        carryOut   = simde_mm256_xor_si256 (innerCarry, rotate);                        //FIXME: not sure if this is correct!!!
         return carryOut;
 }
 #endif
--- kalign.orig/src/euclidean_dist.c
+++ kalign/src/euclidean_dist.c
@@ -22,15 +22,20 @@
 
 #include "euclidean_dist.h"
 #include "rng.h"
-#include <xmmintrin.h>
-#include <immintrin.h>
+#include "../debian/include/simde/x86/sse.h"
+#include "../debian/include/simde/x86/avx.h"
+#if !defined(SIMDE_SSE_NATIVE) /* SIMDe is not using native SSE: provide _mm_malloc/_mm_free via C11 aligned_alloc()/free() */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc((align), (((size) + (align) - 1) / (align)) * (align)) /* C11 requires size to be a multiple of align, so round it up */
+  #define _mm_free free
+#endif
 #include "float.h"
 
 #include "esl_stopwatch.h"
 /* These functions were taken from:  */
 /* https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 */
-float hsum256_ps_avx(__m256 v);
-float hsum_ps_sse3(__m128 v);
+float hsum256_ps_avx(simde__m256 v);
+float hsum_ps_sse3(simde__m128 v);
 
 
 #ifdef ITEST_EDIST
@@ -163,18 +168,18 @@
 
         float d = 0.0f;
         register int i;
-        __m256 xmm1;// = _mm256_load_ps(a);
-        __m256 xmm2;// = _mm256_load_ps(b);
-        __m256 r = _mm256_set1_ps(0.0f);
+        simde__m256 xmm1;// = simde_mm256_load_ps(a);
+        simde__m256 xmm2;// = simde_mm256_load_ps(b);
+        simde__m256 r = simde_mm256_set1_ps(0.0f);
         for(i = 0;i < len;i+=8){
-                xmm1 = _mm256_load_ps(a);
-                xmm2 = _mm256_load_ps(b);
+                xmm1 = simde_mm256_load_ps(a);
+                xmm2 = simde_mm256_load_ps(b);
 
-                xmm1 =  _mm256_sub_ps(xmm1, xmm2);
+                xmm1 =  simde_mm256_sub_ps(xmm1, xmm2);
 
-                xmm1 = _mm256_mul_ps(xmm1, xmm1);
+                xmm1 = simde_mm256_mul_ps(xmm1, xmm1);
 
-                r = _mm256_add_ps(r, xmm1);
+                r = simde_mm256_add_ps(r, xmm1);
                 a+=8;
                 b+=8;
         }
@@ -187,22 +192,22 @@
 
 
 
-float hsum256_ps_avx(__m256 v)
+float hsum256_ps_avx(simde__m256 v)
 {
-        __m128 vlow  = _mm256_castps256_ps128(v);
-        __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128
-        vlow  = _mm_add_ps(vlow, vhigh);     // add the low 128
+        simde__m128 vlow  = simde_mm256_castps256_ps128(v);
+        simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128
+        vlow  = simde_mm_add_ps(vlow, vhigh);     // add the low 128
         return hsum_ps_sse3(vlow);         // and inline the sse3 version, which is optimal for AVX
         // (no wasted instructions, and all of them are the 4B minimum)
 }
 
-float hsum_ps_sse3(__m128 v)
+float hsum_ps_sse3(simde__m128 v)
 {
-        __m128 shuf = _mm_movehdup_ps(v);        // broadcast elements 3,1 to 2,0
-        __m128 sums = _mm_add_ps(v, shuf);
-        shuf        = _mm_movehl_ps(shuf, sums); // high half -> low half
-        sums        = _mm_add_ss(sums, shuf);
-        return        _mm_cvtss_f32(sums);
+        simde__m128 shuf = simde_mm_movehdup_ps(v);        // broadcast elements 3,1 to 2,0
+        simde__m128 sums = simde_mm_add_ps(v, shuf);
+        shuf        = simde_mm_movehl_ps(shuf, sums); // high half -> low half
+        sums        = simde_mm_add_ss(sums, shuf);
+        return        simde_mm_cvtss_f32(sums);
 }
 
 #endif
--- kalign.orig/src/misc.c
+++ kalign/src/misc.c
@@ -20,7 +20,7 @@
 
 */
 
-#include <immintrin.h>
+#include "../debian/include/simde/x86/avx.h"
 
 #include "misc.h"
 #include  <stdalign.h>
--- kalign.orig/src/sequence_distance.c
+++ kalign/src/sequence_distance.c
@@ -21,7 +21,12 @@
 
 */
 
-#include <xmmintrin.h>
+#include "../debian/include/simde/x86/sse.h"
+#if !defined(SIMDE_SSE_NATIVE) /* SIMDe is not using native SSE: provide _mm_malloc/_mm_free via C11 aligned_alloc()/free() */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc((align), (((size) + (align) - 1) / (align)) * (align)) /* C11 requires size to be a multiple of align, so round it up */
+  #define _mm_free free
+#endif
 #include "sequence_distance.h"
 
 #include "alphabet.h"
