// Computes A - B, treating each __m128i as a single 128-bit integer whose
// low 64 bits are lane 0 and whose high 64 bits are lane 1.
//
// SSE2 only subtracts per 64-bit lane, so the borrow out of the low lane is
// worked out separately and then subtracted from the high lane.
__m128i Sub128128( __m128i A, __m128i B )
{
    // Low 64-bit halves only; _mm_move_epi64 zeroes the upper lane.
    const __m128i lowA = _mm_move_epi64( A );
    const __m128i lowB = _mm_move_epi64( B );

    // Top bit of each byte is set where the operands' top bits differ. Those
    // are exactly the bytes where a signed compare disagrees with an
    // unsigned one.
    const __m128i signDiff = _mm_xor_si128( lowA, lowB );

    // _mm_cmpgt_epi8 is a signed byte compare. XOR-ing with signDiff flips
    // the top bit of each result byte where needed, so the top bit ends up
    // holding the unsigned "greater than" answer for that byte.
    const __m128i aAbove = _mm_xor_si128( _mm_cmpgt_epi8( lowA, lowB ), signDiff );
    const __m128i bAbove = _mm_xor_si128( _mm_cmpgt_epi8( lowB, lowA ), signDiff );

    // _mm_movemask_epi8 packs the top bit of every byte into an integer with
    // byte 0 in bit 0. Comparing the two masks as integers is therefore
    // decided by the most significant byte at which the halves differ:
    // the B mask is larger exactly when low(B) > low(A) as unsigned values.
    const int maskA = _mm_movemask_epi8( aAbove );
    const int maskB = _mm_movemask_epi8( bAbove );
    const int borrow = ( maskB > maskA ) ? 1 : 0;

    // 16-bit element 4 is the lowest word of the high lane, so this builds a
    // vector whose high lane equals the borrow and whose low lane is zero.
    const __m128i borrowVec = _mm_insert_epi16( _mm_setzero_si128(), borrow, 4 );

    // Lane-wise difference, then take the borrow out of the high lane.
    return _mm_sub_epi64( _mm_sub_epi64( A, B ), borrowVec );
}
// Prints pszPrefix, then the 16 bytes at p as two-digit uppercase hex values
// from p[15] down to p[0], then the legend "(msb...lsb)" and a newline.
//
// p         - must point to at least 16 readable bytes.
// pszPrefix - wide string written before the bytes; defaults to empty.
void Prn128( const unsigned char* p, const wchar_t* pszPrefix = L"" )
{
    // %ls always denotes a wide string. A bare %s in a wide format string
    // denotes a narrow string under ISO C rules, and only means "wide" in
    // MSVC's legacy specifier mode, so it would misread pszPrefix elsewhere.
    wprintf_s( L"%ls", pszPrefix );

    // Highest index first, so the output matches the "(msb...lsb)" legend.
    for ( int i = 15; i >= 0; --i )
    {
        wprintf_s( L"%02X ", (int)p[i] );
    }

    wprintf_s( L"(msb...lsb)\n" );
}
// Demo entry point: subtracts one fixed 128-bit value from another with
// Sub128128 and prints both operands and the difference via Prn128.
int _tmain(int argc, _TCHAR* argv[])
{
    // 16-byte aligned buffers, as required by the aligned load/store
    // intrinsics used below.
    __declspec( align(16) ) unsigned char aA[16] = { 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33 };
    __declspec( align(16) ) unsigned char aB[16] = { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 };
    __declspec( align(16) ) unsigned char aR[16];

    // Load both operands, subtract, and write the difference into aR.
    const __m128i minuend = _mm_load_si128( reinterpret_cast< const __m128i* >( aA ) );
    const __m128i subtrahend = _mm_load_si128( reinterpret_cast< const __m128i* >( aB ) );
    const __m128i difference = Sub128128( minuend, subtrahend );
    _mm_store_si128( reinterpret_cast< __m128i* >( aR ), difference );

    Prn128( aA, L"A = " );
    Prn128( aB, L"B = " );
    Prn128( aR, L"A-B = " );

    return 0;
}