SIMD

Check binary: run the awk filters below over a disassembly listing (e.g. objdump -d output) to see which SIMD instruction sets a binary actually uses.

# SSE4 (SSE4.1 / SSE4.2 / SSE4a)
awk '/[ \t](mpsadbw|phminposuw|pmulld|pmuldq|dpps|dppd|blendps|blendpd|blendvps|blendvpd|pblendvb|pblenddw|pminsb|pmaxsb|pminuw|pmaxuw|pminud|pmaxud|pminsd|pmaxsd|roundps|roundss|roundpd|roundsd|insertps|pinsrb|pinsrd|pinsrq|extractps|pextrb|pextrd|pextrw|pextrq|pmovsxbw|pmovzxbw|pmovsxbd|pmovzxbd|pmovsxbq|pmovzxbq|pmovsxwd|pmovzxwd|pmovsxwq|pmovzxwq|pmovsxdq|pmovzxdq|ptest|pcmpeqq|pcmpgtq|packusdw|pcmpestri|pcmpestrm|pcmpistri|pcmpistrm|crc32|popcnt|movntdqa|extrq|insertq|movntsd|movntss|lzcnt)[ \t]/'

# AVX / AVX2 / FMA / AVX-512
awk '/[ \t](vmovapd|vmulpd|vaddpd|vsubpd|vfmadd213pd|vfmadd231pd|vfmadd132pd|vmulsd|vaddsd|vmosd|vsubsd|vbroadcastss|vbroadcastsd|vblendpd|vshufpd|vroundpd|vroundsd|vxorpd|vfnmadd231pd|vfnmadd213pd|vfnmadd132pd|vandpd|vmaxpd|vmovmskpd|vcmppd|vpaddd|vbroadcastf128|vinsertf128|vextractf128|vfmsub231pd|vfmsub132pd|vfmsub213pd|vmaskmovps|vmaskmovpd|vpermilps|vpermilpd|vperm2f128|vzeroall|vzeroupper|vpbroadcastb|vpbroadcastw|vpbroadcastd|vpbroadcastq|vbroadcasti128|vinserti128|vextracti128|vpminud|vpmuludq|vgatherdpd|vgatherqpd|vgatherdps|vgatherqps|vpgatherdd|vpgatherdq|vpgatherqd|vpgatherqq|vpmaskmovd|vpmaskmovq|vpermps|vpermd|vpermpd|vpermq|vperm2i128|vpblendd|vpsllvd|vpsllvq|vpsrlvd|vpsrlvq|vpsravd|vblendmpd|vblendmps|vpblendmd|vpblendmq|vpblendmb|vpblendmw|vpcmpd|vpcmpud|vpcmpq|vpcmpuq|vpcmpb|vpcmpub|vpcmpw|vpcmpuw|vptestmd|vptestmq|vptestnmd|vptestnmq|vptestmb|vptestmw|vptestnmb|vptestnmw|vcompresspd|vcompressps|vpcompressd|vpcompressq|vexpandpd|vexpandps|vpexpandd|vpexpandq|vpermb|vpermw|vpermt2b|vpermt2w|vpermi2pd|vpermi2ps|vpermi2d|vpermi2q|vpermi2b|vpermi2w|vpermt2ps|vpermt2pd|vpermt2d|vpermt2q|vshuff32x4|vshuff64x2|vshuffi32x4|vshuffi64x2|vpmultishiftqb|vpternlogd|vpternlogq|vpmovqd|vpmovsqd|vpmovusqd|vpmovqw|vpmovsqw|vpmovusqw|vpmovqb|vpmovsqb|vpmovusqb|vpmovdw|vpmovsdw|vpmovusdw|vpmovdb|vpmovsdb|vpmovusdb|vpmovwb|vpmovswb|vpmovuswb|vcvtps2udq|vcvtpd2udq|vcvttps2udq|vcvttpd2udq|vcvtss2usi|vcvtsd2usi|vcvttss2usi|vcvttsd2usi|vcvtps2qq|vcvtpd2qq|vcvtps2uqq|vcvtpd2uqq|vcvttps2qq|vcvttpd2qq|vcvttps2uqq|vcvttpd2uqq|vcvtudq2ps|vcvtudq2pd|vcvtusi2ps|vcvtusi2pd|vcvtusi2sd|vcvtusi2ss|vcvtuqq2ps|vcvtuqq2pd|vcvtqq2pd|vcvtqq2ps|vgetexppd|vgetexpps|vgetexpsd|vgetexpss|vgetmantpd|vgetmantps|vgetmantsd|vgetmantss|vfixupimmpd|vfixupimmps|vfixupimmsd|vfixupimmss|vrcp14pd|vrcp14ps|vrcp14sd|vrcp14ss|vrndscaleps|vrndscalepd|vrndscaless|vrndscalesd|vrsqrt14pd|vrsqrt14ps|vrsqrt14sd|vrsqrt14ss|vscalefps|vscalefpd|vscalefss|vscalefsd|valignd|valignq|vdbpsadbw|vpabsq|vpmaxsq|vpmaxuq|vpminsq|vpminuq|vprold|vprolvd|vprolq|vprolvq|vprord|vprorvd|vprorq|vprorvq|vpscatterdd|vpscatterdq|vpscatterqd|vpscatterqq|vscatterdps|vscatterdpd|vscatterqps|vscatterqpd|vpconflictd|vpconflictq|vplzcntd|vplzcntq|vpbroadcastmb2q|vpbroadcastmw2d|vexp2pd|vexp2ps|vrcp28pd|vrcp28ps|vrcp28sd|vrcp28ss|vrsqrt28pd|vrsqrt28ps|vrsqrt28sd|vrsqrt28ss|vgatherpf0dps|vgatherpf0qps|vgatherpf0dpd|vgatherpf0qpd|vgatherpf1dps|vgatherpf1qps|vgatherpf1dpd|vgatherpf1qpd|vscatterpf0dps|vscatterpf0qps|vscatterpf0dpd|vscatterpf0qpd|vscatterpf1dps|vscatterpf1qps|vscatterpf1dpd|vscatterpf1qpd|vfpclassps|vfpclasspd|vfpclassss|vfpclasssd|vrangeps|vrangepd|vrangess|vrangesd|vreduceps|vreducepd|vreducess|vreducesd|vpmovm2d|vpmovm2q|vpmovm2b|vpmovm2w|vpmovd2m|vpmovq2m|vpmovb2m|vpmovw2m|vpmullq|vpmadd52luq|vpmadd52huq|v4fmaddps|v4fmaddss|v4fnmaddps|v4fnmaddss|vp4dpwssd|vp4dpwssds|vpdpbusd|vpdpbusds|vpdpwssd|vpdpwssds|vpcompressb|vpcompressw|vpexpandb|vpexpandw|vpshld|vpshldv|vpshrd|vpshrdv|vpopcntd|vpopcntq|vpopcntb|vpopcntw|vpshufbitqmb|gf2p8affineinvqb|gf2p8affineqb|gf2p8mulb|vpclmulqdq|vaesdec|vaesdeclast|vaesenc|vaesenclast)[ \t]/'

Tutorials

intrinsics

Headers

<x86intrin.h>  everything below (GCC/Clang umbrella header)

<mmintrin.h>  MMX

<xmmintrin.h> SSE

<emmintrin.h> SSE2

<pmmintrin.h> SSE3

<tmmintrin.h> SSSE3

<smmintrin.h> SSE4.1

<nmmintrin.h> SSE4.2

<ammintrin.h> SSE4A

<wmmintrin.h> AES

<immintrin.h> AVX, AVX2, FMA, AVX-512 (also pulls in all of the SSE headers above)

<zmmintrin.h> AVX512 (internal header; include <immintrin.h> instead)

SSE

Converting to Streaming SIMD Extensions Chart


compiler options (e.g. gcc/clang -msse4.1, -mavx, -mavx2, -mfma, -march=native)

Memory access

__m128

// Extract individual lanes of an __m128 (lane 0 is the lowest element).
// _MM_SHUFFLE takes four lane indices; _MM_SHUFFLE2 is for 2-element (pd) shuffles.
inline float get_first( const __m128 vec){return _mm_cvtss_f32(vec);}
inline float get_second(const __m128 vec){return _mm_cvtss_f32(_mm_shuffle_ps(vec,vec, _MM_SHUFFLE(1,1,1,1)));}
inline float get_third( const __m128 vec){return _mm_cvtss_f32(_mm_shuffle_ps(vec,vec, _MM_SHUFFLE(2,2,2,2)));}
inline float get_fourth(const __m128 vec){return _mm_cvtss_f32(_mm_shuffle_ps(vec,vec, _MM_SHUFFLE(3,3,3,3)));}
__m128 sseval;
float a, b, c, d;

sseval = _mm_set_ps(a, b, c, d);  // lanes from low to high: d, c, b, a
sseval = _mm_setr_ps(a, b, c, d); // lanes from low to high: a, b, c, d
sseval = _mm_load_ps(&a);         // ill-specified here - "a" is not a float[4];
                                  // with a real 16-byte-aligned array it is the
                                  // same as _mm_setr_ps(a[0], a[1], a[2], a[3])

sseval = _mm_set1_ps(a);          // make vector [ a, a, a, a ]
sseval = _mm_load1_ps(&a);        // load from &a and replicate - same as previous

sseval = _mm_set_ss(a);           // make vector [ a, 0, 0, 0 ] (a in the low lane)
sseval = _mm_load_ss(&a);         // load from &a, zero the other lanes - same as previous

Check zero

// SSE4.1: _mm_testz_si128(xmm, xmm) returns 1 iff all 128 bits of xmm are zero
// SSE2-compatible:
inline bool isAllZeros(__m128i xmm) {
    return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == 0xFFFF;
}
// AVX: test a __m256d for all-zero bits
int res = _mm256_testz_si256(_mm256_castpd_si256(a), _mm256_castpd_si256(a));
// AVX: compare 8 floats to 0.0f and inspect the lane mask
__m256 vcmp = _mm256_cmp_ps(*pSrc1, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
int mask = _mm256_movemask_ps(vcmp);
bool any_nz = mask != 0xff; // some lane is non-zero

Shuffle

Matrix Multiplication

__m128 m4x4v_colSSE(const __m128 cols[4], const __m128 v) {
  __m128 u1 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(0,0,0,0));
  __m128 u2 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(1,1,1,1));
  __m128 u3 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(2,2,2,2));
  __m128 u4 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(3,3,3,3));

  __m128 prod1 = _mm_mul_ps(u1, cols[0]);
  __m128 prod2 = _mm_mul_ps(u2, cols[1]);
  __m128 prod3 = _mm_mul_ps(u3, cols[2]);
  __m128 prod4 = _mm_mul_ps(u4, cols[3]);

  return _mm_add_ps(_mm_add_ps(prod1, prod2), _mm_add_ps(prod3, prod4));
}

__m128 m4x4v_rowSSE3(const __m128 rows[4], const __m128 v) {
  __m128 prod1 = _mm_mul_ps(rows[0], v);
  __m128 prod2 = _mm_mul_ps(rows[1], v);
  __m128 prod3 = _mm_mul_ps(rows[2], v);
  __m128 prod4 = _mm_mul_ps(rows[3], v);

  return _mm_hadd_ps(_mm_hadd_ps(prod1, prod2), _mm_hadd_ps(prod3, prod4));
}

__m128 m4x4v_rowSSE4(const __m128 rows[4], const __m128 v) {
  __m128 prod1 = _mm_dp_ps (rows[0], v, 0xFF);
  __m128 prod2 = _mm_dp_ps (rows[1], v, 0xFF);
  __m128 prod3 = _mm_dp_ps (rows[2], v, 0xFF);
  __m128 prod4 = _mm_dp_ps (rows[3], v, 0xFF);

  return _mm_shuffle_ps(_mm_movelh_ps(prod1, prod2), _mm_movelh_ps(prod3, prod4),  _MM_SHUFFLE(2, 0, 2, 0));
}  
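
A minimal usage sketch for the column-major variant above (the identity matrix and the call site are illustrative, not from the original notes):

__attribute__((aligned(16))) float M[16] = {  // column-major 4x4 (identity here)
    1, 0, 0, 0,
    0, 1, 0, 0,
    0, 0, 1, 0,
    0, 0, 0, 1
};
__m128 cols[4] = {
    _mm_load_ps(M + 0), _mm_load_ps(M + 4), _mm_load_ps(M + 8), _mm_load_ps(M + 12)
};
__m128 v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
__m128 r = m4x4v_colSSE(cols, v);  // r = M * v (= v for the identity matrix)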

matrix transpose

_MM_TRANSPOSE4_PS
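
The standard macro from <xmmintrin.h> transposes four rows in place; a minimal usage sketch:

__m128 r0 = _mm_setr_ps( 0.f,  1.f,  2.f,  3.f);
__m128 r1 = _mm_setr_ps( 4.f,  5.f,  6.f,  7.f);
__m128 r2 = _mm_setr_ps( 8.f,  9.f, 10.f, 11.f);
__m128 r3 = _mm_setr_ps(12.f, 13.f, 14.f, 15.f);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // now r0 = { 0, 4, 8, 12 }, r1 = { 1, 5, 9, 13 }, ...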


static void transpose_4x4_ver0(__m128 & v0, __m128 & v1, __m128 & v2, __m128 & v3)
{
    __m128 a0 = _mm_unpacklo_ps(v0, v1); /* a0 = { x0, x1, y0, y1 } */
    __m128 a1 = _mm_unpackhi_ps(v0, v1); /* a1 = { z0, z1, w0, w1 } */

    __m128 a2 = _mm_unpacklo_ps(v2, v3); /* a2 = { x2, x3, y2, y3 } */
    __m128 a3 = _mm_unpackhi_ps(v2, v3); /* a3 = { z2, z3, w2, w3 } */

    v0 = _mm_movelh_ps(a0, a2); /* v0 = { x0, x1, x2, x3 } */
    v1 = _mm_movehl_ps(a2, a0); /* v1 = { y0, y1, y2, y3 } */

    v2 = _mm_movelh_ps(a1, a3); /* v2 = { z0, z1, z2, z3 } */
    v3 = _mm_movehl_ps(a3, a1); /* v3 = { w0, w1, w2, w3 } */
}

static void transpose_4x4_ver1(__m128 & v0, __m128 & v1, __m128 & v2, __m128 & v3)
{
    __m128 a0 = _mm_unpacklo_ps(v0, v2); /* a0 = { x0, x2, y0, y2 } */
    __m128 a1 = _mm_unpacklo_ps(v1, v3); /* a1 = { x1, x3, y1, y3 } */

    __m128 a2 = _mm_unpackhi_ps(v0, v2); /* a2 = { z0, z2, w0, w2 } */
    __m128 a3 = _mm_unpackhi_ps(v1, v3); /* a3 = { z1, z3, w1, w3 } */

    v0 = _mm_unpacklo_ps(a0, a1); /* v0 = { x0, x1, x2, x3 } */
    v1 = _mm_unpackhi_ps(a0, a1); /* v1 = { y0, y1, y2, y3 } */

    v2 = _mm_unpacklo_ps(a2, a3); /* v2 = { z0, z1, z2, z3 } */
    v3 = _mm_unpackhi_ps(a2, a3); /* v3 = { w0, w1, w2, w3 } */
}

dot product

// 4 dot products at one time (x and y are arrays of four __m256d, one pair per dot product).
__m256d xy0 = _mm256_mul_pd( x[0], y[0] );
__m256d xy1 = _mm256_mul_pd( x[1], y[1] );
__m256d xy2 = _mm256_mul_pd( x[2], y[2] );
__m256d xy3 = _mm256_mul_pd( x[3], y[3] );

// low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
__m256d temp01 = _mm256_hadd_pd( xy0, xy1 );   

// low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
__m256d temp23 = _mm256_hadd_pd( xy2, xy3 );

// low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
__m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 );

// low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
__m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);

__m256d dotproduct = _mm256_add_pd( swapped, blended );
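
To get the four scalar results out (lane i of dotproduct holds dot product i):

double out[4];
_mm256_storeu_pd(out, dotproduct);  // out[0..3] = the four dot products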

matrix inverse

abs

inline __m128 abs_ps(__m128 x) {
    static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f has only the sign bit (bit 31) set
    return _mm_andnot_ps(sign_mask, x);                // ~sign_mask & x: clear the sign bit
}

inline __m128d abs_pd(__m128d x) {
    static const __m128d sign_mask = _mm_set1_pd(-0.); // -0. has only the sign bit (bit 63) set
    return _mm_andnot_pd(sign_mask, x);                // ~sign_mask & x
}

#if defined (__AVX__)
inline __m256d abs_pd(const __m256d& x)
{
  static const __m256d sign_mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
  return _mm256_and_pd(sign_mask, x); // keep everything except the sign bit
}
#endif

change sign

_mm_xor_ps(vec, _mm_set1_ps(-0.f)) // XOR with the sign bit flips the sign of every lane

extract
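
A minimal sketch of the SSE4.1 extract intrinsics (the helper names are illustrative; the lane index must be a compile-time constant):

#include <smmintrin.h>  // SSE4.1
#include <string.h>

inline int extract_lane2_epi32(__m128i v) { return _mm_extract_epi32(v, 2); }

inline float extract_lane1_ps(__m128 v) {
    int bits = _mm_extract_ps(v, 1);   // returns the raw bit pattern of lane 1
    float f;
    memcpy(&f, &bits, sizeof f);       // reinterpret the bits as float
    return f;
}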

rsqrt
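
A common pattern, sketched here: refine the roughly 12-bit _mm_rsqrt_ps estimate with one Newton-Raphson step (the helper name is illustrative):

#include <xmmintrin.h>

// y ~= 1/sqrt(x); one Newton-Raphson step: y' = 0.5 * y * (3 - x * y * y)
inline __m128 rsqrt_nr_ps(__m128 x) {
    __m128 e   = _mm_rsqrt_ps(x);                    // ~12-bit estimate
    __m128 xe2 = _mm_mul_ps(_mm_mul_ps(e, e), x);    // x * e^2
    return _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), e),
                      _mm_sub_ps(_mm_set1_ps(3.0f), xe2));
}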

FMA
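
A minimal sketch of the FMA intrinsic family (compile with -mfma; the wrapper is illustrative):

#include <immintrin.h>

inline __m256d fma_demo(__m256d a, __m256d b, __m256d c) {
    __m256d d = _mm256_fmadd_pd(a, b, c);   //  (a * b) + c, single rounding
    __m256d e = _mm256_fnmadd_pd(a, b, c);  // -(a * b) + c
    __m256d f = _mm256_fmsub_pd(a, b, c);   //  (a * b) - c
    return _mm256_add_pd(d, _mm256_add_pd(e, f));
}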

Macro in C / C++

$ gcc -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX" | sort
#define __AVX__ 1
#define __AVX2__ 1
#define __SSE__ 1
#define __SSE2__ 1
#define __SSE2_MATH__ 1
#define __SSE3__ 1
#define __SSE4_1__ 1
#define __SSE4_2__ 1
#define __SSE_MATH__ 1
#define __SSSE3__ 1

// Some compilers (e.g. MSVC) define __AVX2__ but not __FMA__, so it can be derived:
#if !defined(__FMA__) && defined(__AVX2__)
    #define __FMA__ 1
#endif
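
A minimal compile-time dispatch sketch using these macros (hsum4 is an illustrative helper, not part of the wiki):

#include <immintrin.h>

// Horizontal sum of 4 floats, picking the implementation from the predefined macros.
static inline float hsum4(__m128 v) {
#if defined(__SSE3__)
    v = _mm_hadd_ps(v, v);   // pairwise adds
    v = _mm_hadd_ps(v, v);
    return _mm_cvtss_f32(v);
#else
    __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // swap pairs
    __m128 sums = _mm_add_ps(v, shuf);                           // partial sums
    shuf = _mm_movehl_ps(shuf, sums);                            // bring high pair down
    return _mm_cvtss_f32(_mm_add_ss(sums, shuf));
#endif
}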

AVX

load

__attribute__((aligned(16))) float vec[4] = { 1.0f, 1.1f, 1.2f, 1.3f };
__m128 v = _mm_load_ps(vec); // vec decays to a pointer; no & needed

combine

__m128 a = _mm_set_ps(1,2,3,4);
__m128 b = _mm_set_ps(5,6,7,8);

__m256 c = _mm256_castps128_ps256(a); // a in the low 128 bits, upper half undefined
c = _mm256_insertf128_ps(c,b,1);      // put b into the high 128 bits

set

test

comparison

#include <immintrin.h>
#include <stdbool.h>

bool vec_equal(__m256i a, __m256i b) {
    __m256i pcmp = _mm256_cmpeq_epi32(a, b);  // epi8 is fine too
    unsigned bitmask = _mm256_movemask_epi8(pcmp);
    return (bitmask == 0xffffffffU);
}
/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

shuffle / permute


Snippets

Filtering Data

int FilterFloats_Reference(const float input[], float output[], int count, float limit)
{
  float *outputp = output;
  for (int i = 0; i < count; ++i) {
    if (input[i] >= limit)
      *outputp++ = input[i];
  }
  return (int) (outputp - output);
}
for (int i = 0; i < count; i += 4) {
  __m128 val = _mm_load_ps(input + i);
  __m128 mask = _mm_cmpge_ps(val, _mm_set1_ps(limit));
  __m128 result = LeftPack_SSSE3(mask, val);        // or PackLeft_SSE2, see below
  _mm_storeu_ps(output, result);

  output += _mm_popcnt_u32(_mm_movemask_ps(mask));  // advance by the number of kept elements
}

__m128 LeftPack_SSSE3(__m128 mask, __m128 val)
{
  // Move the 4 sign bits of mask into a 4-bit integer index.
  int idx = _mm_movemask_ps(mask);
  // Select shuffle control data from a precomputed 16-entry table.
  __m128i shuf_ctrl = _mm_load_si128(&shufmasks[idx]);
  // Permute to move the selected values to the front of the SIMD register.
  __m128i packed = _mm_shuffle_epi8(_mm_castps_si128(val), shuf_ctrl);
  return _mm_castsi128_ps(packed);
}

__m128 PackLeft_SSE2(__m128 mask, __m128 val)
{
  int valid = _mm_movemask_ps(mask);
  __m128 mask0 = _mm_load_ps((float *)(&g_Masks[valid][0]));
  __m128 mask1 = _mm_load_ps((float *)(&g_Masks[valid][4]));
  __m128 s0 = _mm_shuffle_ps(val, val, _MM_SHUFFLE(0, 3, 2, 1));
  __m128 r0 = _mm_or_ps(_mm_and_ps(mask0, s0), _mm_andnot_ps(mask0, val));
  __m128 s1 = _mm_shuffle_ps(r0, r0, _MM_SHUFFLE(1, 0, 3, 2));
  __m128 r1 = _mm_or_ps(_mm_and_ps(mask1, s1), _mm_andnot_ps(mask1, r0));
  return r1;
}

Troubleshooting

_mm_shuffle_ps: __builtin_shufflevector requires a constant integer (the third argument must be a compile-time constant, not a runtime variable)
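
One workaround, sketched here: pass the immediate as a template parameter so it is a compile-time constant at every instantiation (the wrapper name is illustrative):

template <int imm>
inline __m128 shuffle_ps(__m128 a, __m128 b) {
    return _mm_shuffle_ps(a, b, imm);   // imm is a constant expression here
}
// usage: shuffle_ps<_MM_SHUFFLE(3, 2, 1, 0)>(a, b);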


"Intel® 64 and IA-32 Architectures Optimization Reference Manual"

Chap.4

4.2 CONSIDERATIONS FOR CODE CONVERSION TO SIMD PROGRAMMING

To use any of the SIMD technologies optimally, you must evaluate the following situations in your code:

  • Fragments that are computationally intensive
  • Fragments that are executed often enough to have an impact on performance
  • Fragments with little data-dependent control flow
  • Fragments that require floating-point computations
  • Fragments that can benefit from moving data 16 bytes at a time
  • Fragments of computation that can be coded using fewer instructions
  • Fragments that require help in using the cache hierarchy efficiently

4.2.1 Identifying Hot Spots

4.2.2 Determine If Code Benefits by Conversion to SIMD Execution

4.4 STACK AND DATA ALIGNMENT

4.4.1 Alignment and Contiguity of Data Access Patterns

4.4.1.1 Using Padding to Align Data
4.4.1.2 Using Arrays to Make Data Contiguous

4.4.2 Stack Alignment For 128-bit SIMD Technologies

  • Functions that use Streaming SIMD Extensions or Streaming SIMD Extensions 2 data need to provide a 16-byte aligned stack frame.
  • __m128* parameters need to be aligned to 16-byte boundaries, possibly creating “holes” (due to padding) in the argument block.

4.4.3 Data Alignment for MMX Technology

/* Make newp a pointer to a 64-bit (8-byte) aligned array of NUM_ELEMENTS doubles. */
double *p, *newp;
p = (double *)malloc(sizeof(double) * (NUM_ELEMENTS + 1));
newp = (double *)(((uintptr_t)p + 7) & ~(uintptr_t)7);

4.4.4 Data Alignment for 128-bit data

4.4.4.1 Compiler-Supported Alignment
Alignment by F32vec4 or __m128 Data Types
__declspec(align(16)) specifications

Alignment by Using a UNION Structure

union {
  float f[400];
  __m128 m[100];
} buffer;

struct __declspec(align(16)) my_m128 {
  float f[4];
};

class my_m128 {
  union {
    __m128 m;
    float f[4];
  };
};
Alignment by Using __m64 or DOUBLE Data

4.5 IMPROVING MEMORY UTILIZATION

4.5.1 Data Structure Layout

The recommended way for computing data in AoS format is to swizzle each set of elements to SoA format before processing it using SIMD technologies. Swizzling can either be done dynamically during program execution or statically when the data structures are generated.

Note that SoA can have the disadvantage of requiring more independent memory stream references. This can require the use of more prefetches, additional address generation calculations, as well as having a greater impact on DRAM page access efficiency.

// NumOfGroups = NumOfVertices / SIMDwidth
typedef struct{
  float x[SIMDwidth];
  float y[SIMDwidth];
  float z[SIMDwidth];
} VerticesCoordList;
typedef struct{
  int a[SIMDwidth];
  int b[SIMDwidth];
  int c[SIMDwidth];
} VerticesColorList;
VerticesCoordList VerticesCoord[NumOfGroups];
VerticesColorList VerticesColor[NumOfGroups];

The hybrid SoA approach ensures:

  • Data is organized to enable more efficient vertical SIMD computation
  • Simpler/less address generation than AoS
  • Fewer streams, which reduces DRAM page misses
  • Use of fewer prefetches, due to fewer streams
  • Efficient cache line packing of data elements that are used concurrently.
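
A minimal sketch of a vertical SIMD pass over the hybrid-SoA layout above, assuming SIMDwidth is 4 (the function name is illustrative; unaligned loads are used since the structs are not declared aligned):

#include <xmmintrin.h>

// Per-vertex x + y + z, processing one group (4 vertices) per iteration.
void sum_xyz(const VerticesCoordList *coord, float *out, int numGroups)
{
    for (int g = 0; g < numGroups; ++g) {
        __m128 x = _mm_loadu_ps(coord[g].x);
        __m128 y = _mm_loadu_ps(coord[g].y);
        __m128 z = _mm_loadu_ps(coord[g].z);
        _mm_storeu_ps(out + 4 * g, _mm_add_ps(_mm_add_ps(x, y), z));
    }
}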

4.5.2 Strip-Mining

First introduced for vectorizers, this technique generates code in which each vector operation is done on a size less than or equal to the maximum vector length of the target machine. By fragmenting a large loop into smaller segments or strips, it transforms the loop structure in two ways (a sketch follows the list below):

  • Increasing the temporal and spatial locality in the data cache if the data are reusable in different passes of an algorithm.
  • Reducing the number of iterations of the loop by a factor of the length of each “vector,” or number of operations being performed per SIMD operation.
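
A strip-mining sketch (STRIP_SIZE and two_passes are illustrative names; tail handling for n not a multiple of 4 is omitted):

#include <xmmintrin.h>

#define STRIP_SIZE 1024  // chosen so one strip stays resident in cache

void two_passes(float *data, int n)
{
    for (int base = 0; base < n; base += STRIP_SIZE) {
        int end = (base + STRIP_SIZE < n) ? (base + STRIP_SIZE) : n;
        // Pass 1 over the strip.
        for (int i = base; i + 4 <= end; i += 4) {
            __m128 v = _mm_loadu_ps(data + i);
            _mm_storeu_ps(data + i, _mm_mul_ps(v, _mm_set1_ps(2.0f)));
        }
        // Pass 2 re-uses the same strip while it is still in cache.
        for (int i = base; i + 4 <= end; i += 4) {
            __m128 v = _mm_loadu_ps(data + i);
            _mm_storeu_ps(data + i, _mm_add_ps(v, _mm_set1_ps(1.0f)));
        }
    }
}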

4.5.3 Loop Blocking

// A. Original Loop
float A[MAX][MAX], B[MAX][MAX];
for (i = 0; i < MAX; i++) {
  for (j = 0; j < MAX; j++) {
    A[i][j] = A[i][j] + B[j][i];
  }
}
// B. Transformed Loop after Blocking
float A[MAX][MAX], B[MAX][MAX];
for (i = 0; i < MAX; i += block_size) {
  for (j = 0; j < MAX; j += block_size) {
    for (ii = i; ii < i + block_size; ii++) {
      for (jj = j; jj < j + block_size; jj++) {
        A[ii][jj] = A[ii][jj] + B[jj][ii];
      }
    }
  }
}

If MAX is huge, loop blocking can also help reduce the penalty from DTLB (data translation look-aside buffer) misses. In addition to improving the cache/memory performance, this optimization technique also saves external bus bandwidth.

4.6 INSTRUCTION SELECTION

One barrier to SIMD computation can be the existence of data-dependent branches. Conditional moves can be used to eliminate data-dependent branches.

// High-level code:
__declspec(align(16)) short A[MAX_ELEMENT], B[MAX_ELEMENT], C[MAX_ELEMENT],
  D[MAX_ELEMENT], E[MAX_ELEMENT];
for (i=0; i<MAX_ELEMENT; i++) {
  if (A[i] > B[i]) {
    C[i] = D[i];
  } else {
    C[i] = E[i];
  }
}
; MMX assembly code processes 4 short values per iteration:
xor       eax, eax
top_of_loop:
movq      mm0, [A + eax]
pcmpgtw   mm0, [B + eax]    ; create compare mask (A > B)
movq      mm1, [D + eax]
pand      mm1, mm0          ; keep D elements where A > B
pandn     mm0, [E + eax]    ; keep E elements where A <= B
por       mm0, mm1          ; combine into a single result
movq      [C + eax], mm0
add       eax, 8
cmp       eax, MAX_ELEMENT*2
jle       top_of_loop
; SSE4.1 assembly processes 8 short values per iteration:
xor       eax, eax
top_of_loop:
movdqa    xmm0, [A + eax]
pcmpgtw   xmm0, [B + eax]   ; create compare mask (A > B)
movdqa    xmm1, [E + eax]
pblendvb  xmm1, [D + eax], xmm0
movdqa    [C + eax], xmm1
add       eax, 16
cmp       eax, MAX_ELEMENT*2
jle       top_of_loop
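
The same select can also be written with intrinsics; a minimal sketch (SSE4.1, assuming the arrays declared in the high-level code above; select_sse41 is an illustrative name):

#include <smmintrin.h>  // SSE4.1

// C[i] = (A[i] > B[i]) ? D[i] : E[i], 8 shorts per iteration.
void select_sse41(void)
{
    for (int i = 0; i < MAX_ELEMENT; i += 8) {
        __m128i a = _mm_load_si128((const __m128i *)&A[i]);
        __m128i b = _mm_load_si128((const __m128i *)&B[i]);
        __m128i d = _mm_load_si128((const __m128i *)&D[i]);
        __m128i e = _mm_load_si128((const __m128i *)&E[i]);
        __m128i m = _mm_cmpgt_epi16(a, b);     // 0xFFFF where A[i] > B[i]
        __m128i c = _mm_blendv_epi8(e, d, m);  // pick D where the mask is set, else E
        _mm_store_si128((__m128i *)&C[i], c);
    }
}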

4.6.1 SIMD Optimizations and Microarchitectures

  • Recommendation: When targeting code generation for Intel Core Solo and Intel Core Duo processors, favor instructions consisting of two μops over those with more than two μops.
  • Recommendation: With the proliferation of 128-bit SIMD hardware in Intel Core microarchitecture and Enhanced Intel Core microarchitecture, integer SIMD code written using MMX instructions should consider more efficient implementations using 128-bit SIMD instructions.


RISC-V
