From: Michael R. Crusoe <michael.crusoe@gmail.com>
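Subject: Enable building on non-x86 architectures and on x86 CPUs without SSE4.1
 Include the SIMD Everywhere (simde) headers with SIMDE_ENABLE_NATIVE_ALIASES
 in place of the x86 intrinsic headers, so the existing _mm_* calls compile on
 any architecture. Remove the CheckSSEFeatures fallback branch and its
 FATAL_ERROR, the -m64 compiler flag and the runtime checks in checkCpu().
 Also add an EXE_SUFFIX cache option that is appended to the name of the
 mmseqs executable target.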
--- mmseqs2.orig/src/CMakeLists.txt
+++ mmseqs2/src/CMakeLists.txt
@@ -133,21 +133,6 @@
     append_target_property(mmseqs-framework LINK_FLAGS -msse4.1)
 elseif (HAVE_NEON)
     target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1 -DNEON=1)
-else ()
-    include(CheckSSEFeatures)
-    append_target_property(mmseqs-framework COMPILE_FLAGS ${SSE_FLAGS})
-    append_target_property(mmseqs-framework LINK_FLAGS ${SSE_FLAGS})
-    if (HAVE_AVX2_EXTENSIONS)
-        target_compile_definitions(mmseqs-framework PUBLIC -DAVX2=1)
-        # debugging
-        #   list(APPEND MMSEQS_DEFINITIONS -DSSE=1)
-    else ()
-        if (HAVE_SSE4_1_EXTENSIONS)
-            target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1)
-        else ()
-            message(FATAL_ERROR "At least SSE4.2 is needed to compile!")
-        endif (HAVE_SSE4_1_EXTENSIONS)
-    endif (HAVE_AVX2_EXTENSIONS)
 endif ()
 
 target_link_libraries(mmseqs-framework tinyexpr zstd microtar)
@@ -261,10 +246,10 @@
     add_subdirectory(version)
     set(mmseqs_source_files mmseqs.cpp)
 
-    add_executable(mmseqs ${mmseqs_source_files})
-    mmseqs_setup_derived_target(mmseqs)
-    target_link_libraries(mmseqs version)
-    install(TARGETS mmseqs DESTINATION bin)
+    add_executable(mmseqs${EXE_SUFFIX} ${mmseqs_source_files})
+    mmseqs_setup_derived_target(mmseqs${EXE_SUFFIX})
+    target_link_libraries(mmseqs${EXE_SUFFIX} version)
+    install(TARGETS mmseqs${EXE_SUFFIX} DESTINATION bin)
 
     if (HAVE_TESTS)
         add_subdirectory(test)
--- mmseqs2.orig/lib/simd/simd.h
+++ mmseqs2/lib/simd/simd.h
@@ -50,11 +50,8 @@
 #define SSE
 #endif
 
-#ifdef NEON
-#include "sse2neon.h"
-#else
-#include <xmmintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
 
 #ifdef AVX512
 #include <zmmintrin.h.h> // AVX512
@@ -284,11 +281,9 @@
 #endif //AVX_SUPPORT
 
 
-#ifdef SSE
+#include <simde/x86/sse4.1.h>
 uint16_t simd_hmax16(const __m128i buffer);
 uint8_t simd_hmax8(const __m128i buffer);
-#ifndef NEON
-#include <smmintrin.h>  //SSE4.1
 // double support
 #ifndef SIMD_DOUBLE
 #define SIMD_DOUBLE
@@ -311,7 +306,6 @@
 #define simdf64_andnot(x,y) _mm_andnot_pd(x,y)
 #define simdf64_xor(x,y)    _mm_xor_pd(x,y)
 #endif //SIMD_DOUBLE
-#endif
 
 // float support
 #ifndef SIMD_FLOAT
@@ -395,7 +389,6 @@
 #define simdi32_i2f(x) 	    _mm_cvtepi32_ps(x)  // convert integer to s.p. float
 #define simdi_i2fcast(x)    _mm_castsi128_ps(x)
 #endif //SIMD_INT
-#endif //SSE
 
 #ifdef NEON
 inline uint16_t simd_hmax16(const __m128i buffer) {
@@ -488,7 +481,6 @@
     return 0;
 }
 #else
-#ifdef SSE
 inline unsigned short extract_epi16(__m128i v, int pos) {
     switch(pos){
         case 0: return _mm_extract_epi16(v, 0);
@@ -503,7 +495,6 @@
     return 0;
 }
 #endif
-#endif
 
 
 /* horizontal max */
@@ -608,7 +599,6 @@
 //
 //
 //TODO fix this
-#ifdef SSE
     float __attribute__((aligned(16))) res;
     __m128 P; // query 128bit SSE2 register holding 4 floats
     __m128 R;// result
@@ -637,7 +627,6 @@
     R = _mm_add_ps(R,P);
     _mm_store_ss(&res, R);
     return res;
-#endif
 //#endif
     return tj[0] * qi[0] + tj[1] * qi[1] + tj[2] * qi[2] + tj[3] * qi[3]
             + tj[4] * qi[4] + tj[5] * qi[5] + tj[6] * qi[6] + tj[7] * qi[7]
--- mmseqs2.orig/src/commons/itoa.h
+++ mmseqs2/src/commons/itoa.h
@@ -25,7 +25,8 @@
 #ifdef NEON
 #include "sse2neon.h"
 #else
-#include <emmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse2.h>
 #endif
 
 #include <stdint.h>
--- mmseqs2.orig/CMakeLists.txt
+++ mmseqs2/CMakeLists.txt
@@ -8,6 +8,7 @@
 set(HAVE_SANITIZER 0 CACHE BOOL "Have Sanitizers")
 set(INSTALL_UTIL 1 CACHE BOOL "Install util scripts")
 set(VERSION_OVERRIDE "" CACHE STRING "Override version string in help and usage messages")
+set(EXE_SUFFIX "" CACHE STRING "Suffix to add to executable names")
 
 #Sanitizers
 if (${HAVE_SANITIZER})
@@ -40,9 +41,6 @@
 
 # set flags
 set(MMSEQS_CXX_FLAGS "-std=c++0x")
-if (NOT ${HAVE_NEON})
-    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -m64")
-endif ()
 
 # Compiler-specific features
 if (CMAKE_COMPILER_IS_CLANG)
--- mmseqs2.orig/src/prefiltering/UngappedAlignment.cpp
+++ mmseqs2/src/prefiltering/UngappedAlignment.cpp
@@ -71,12 +71,8 @@
     simd_int vscore        = simdi_setzero();
     simd_int vMaxScore     = simdi_setzero();
     const simd_int vBias   = simdi8_set(bias);
-#ifndef AVX2
-    #ifdef SSE
     const simd_int sixten  = simdi8_set(16);
     const simd_int fiveten = simdi8_set(15);
-#endif
-#endif
     for(unsigned int pos = 0; pos < seqLen; pos++){
         simd_int template01 = simdi_load((simd_int *)&dbSeq[pos*VECSIZE_INT*4]);
 #ifdef AVX2
@@ -85,7 +81,7 @@
         //        __m256i score_vec_8bit = _mm256_shuffle_epi8(score_matrix_vec01, template01);
         //        __m256i lookup_mask01  = _mm256_cmpgt_epi8(sixten, template01); // 16 > t
         //        score_vec_8bit = _mm256_and_si256(score_vec_8bit, lookup_mask01);
-#elif defined(SSE)
+#else
         // each position has 32 byte
         // 20 scores and 12 zeros
         // load score 0 - 15
@@ -96,16 +92,8 @@
         // _mm_shuffle_epi8
         // for i ... 16
         //   score01[i] = score_matrix_vec01[template01[i]%16]
-#ifdef NEON
-        __m128i score01 =vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(score_matrix_vec01),vreinterpretq_u8_m128i(template01)));
-#else
         __m128i score01 =_mm_shuffle_epi8(score_matrix_vec01,template01);
-#endif
-#ifdef NEON
-        __m128i score16 =vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(score_matrix_vec16),vreinterpretq_u8_m128i(template01)));
-#else
         __m128i score16 =_mm_shuffle_epi8(score_matrix_vec16,template01);
-#endif
         // t[i] < 16 => 0 - 15
         // example: template01: 02 15 12 18 < 16 16 16 16 => FF FF FF 00
         __m128i lookup_mask01 = _mm_cmplt_epi8(template01, sixten);
@@ -292,7 +280,7 @@
     EXTRACT_AVX(24);  EXTRACT_AVX(25);  EXTRACT_AVX(26);  EXTRACT_AVX(27);
     EXTRACT_AVX(28);  EXTRACT_AVX(29);  EXTRACT_AVX(30);  EXTRACT_AVX(31);
 #undef EXTRACT_AVX
-#elif defined(SSE)
+#else
     #define EXTRACT_SSE(i) score_arr[i] = _mm_extract_epi8(score, i)
     EXTRACT_SSE(0);  EXTRACT_SSE(1);   EXTRACT_SSE(2);  EXTRACT_SSE(3);
     EXTRACT_SSE(4);  EXTRACT_SSE(5);   EXTRACT_SSE(6);  EXTRACT_SSE(7);
--- mmseqs2.orig/src/commons/Application.cpp
+++ mmseqs2/src/commons/Application.cpp
@@ -4,10 +4,6 @@
 #include "DistanceCalculator.h"
 #include "Timer.h"
 
-#ifndef NEON
-#include <CpuInfo.h>
-#endif
-
 #include <iomanip>
 
 extern const char *binary_name;
@@ -24,30 +20,6 @@
 extern std::vector<Categories> categories;
 
 void checkCpu() {
-#ifndef NEON
-    CpuInfo info;
-    if (info.HW_x64 == false) {
-        Debug(Debug::ERROR) << "64-bit system is required to run MMseqs2.\n";
-        EXIT(EXIT_FAILURE);
-    }
-#ifdef SEE
-    if(info.HW_SSE41 == false) {
-        Debug(Debug::ERROR) << "SSE4.1 is required to run MMseqs2.\n";
-        EXIT(EXIT_FAILURE);
-    }
-#endif
-#ifdef AVX2
-    if (info.HW_AVX2 == false) {
-        Debug(Debug::ERROR) << "Your machine does not support AVX2.\n";
-        if (info.HW_SSE41 == true) {
-            Debug(Debug::ERROR) << "Please recompile with SSE4.1: cmake -DHAVE_SSE4_1=1 \n";
-        } else {
-            Debug(Debug::ERROR) << "SSE4.1 is the minimum requirement to run MMseqs2.\n";
-        }
-        EXIT(EXIT_FAILURE);
-    }
-#endif
-#endif
 }
 
 Command *getCommandByName(const char *s) {
--- mmseqs2.orig/lib/ksw2/ksw2_extz2_sse.cpp
+++ mmseqs2/lib/ksw2/ksw2_extz2_sse.cpp
@@ -31,24 +31,8 @@
 #include <assert.h>
 #include "ksw2.h"
 
-#ifdef NEON
-#include "sse2neon.h"
-#define __SSE2__
-#define KSW_SSE2_ONLY
-#endif
-
-#ifdef __SSE2__
-#ifndef NEON
-#include <emmintrin.h>
-#endif
-
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse4.1.h>
 
 #ifdef KSW_CPU_DISPATCH
 #ifdef __SSE4_1__
@@ -165,11 +149,7 @@
 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
 				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
-#endif
 				tmp = _mm_andnot_si128(mask, tmp);
 				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
 			}
@@ -186,22 +166,10 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-#endif
 				__dp_code_block2;
-#ifdef __SSE4_1__
 				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
 				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
-#else
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
-#endif
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
 			__m128i *pr = p + r * n_col_ - st_;
@@ -210,16 +178,9 @@
 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				tmp = _mm_cmpgt_epi8(b, z);
 				d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(b, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
 				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
@@ -236,16 +197,9 @@
 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				tmp = _mm_cmpgt_epi8(z, b);
 				d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(zero_, a);
 				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
@@ -276,13 +230,8 @@
 					_mm_storeu_si128((__m128i*)&H[t], H1);
 					t_ = _mm_set1_epi32(t);
 					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
 				}
 				_mm_storeu_si128((__m128i*)HH, max_H_);
 				_mm_storeu_si128((__m128i*)tt, max_t_);
@@ -334,4 +283,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
