# SIMD
- How to check if a binary requires SSE4 or AVX on Linux
- How to check if compiled code uses sse and avx instructions?
- How to remove “noise” from GCC/clang assembly output?
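# Usage: objdump -d ./a.out | awk '<one of the patterns below>'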
# SSE4
awk '/[ \t](mpsadbw|phminposuw|pmulld|pmuldq|dpps|dppd|blendps|blendpd|blendvps|blendvpd|pblendvb|pblenddw|pminsb|pmaxsb|pminuw|pmaxuw|pminud|pmaxud|pminsd|pmaxsd|roundps|roundss|roundpd|roundsd|insertps|pinsrb|pinsrd|pinsrq|extractps|pextrb|pextrd|pextrw|pextrq|pmovsxbw|pmovzxbw|pmovsxbd|pmovzxbd|pmovsxbq|pmovzxbq|pmovsxwd|pmovzxwd|pmovsxwq|pmovzxwq|pmovsxdq|pmovzxdq|ptest|pcmpeqq|pcmpgtq|packusdw|pcmpestri|pcmpestrm|pcmpistri|pcmpistrm|crc32|popcnt|movntdqa|extrq|insertq|movntsd|movntss|lzcnt)[ \t]/'
# AVX
awk '/[ \t](vmovapd|vmulpd|vaddpd|vsubpd|vfmadd213pd|vfmadd231pd|vfmadd132pd|vmulsd|vaddsd|vmosd|vsubsd|vbroadcastss|vbroadcastsd|vblendpd|vshufpd|vroundpd|vroundsd|vxorpd|vfnmadd231pd|vfnmadd213pd|vfnmadd132pd|vandpd|vmaxpd|vmovmskpd|vcmppd|vpaddd|vbroadcastf128|vinsertf128|vextractf128|vfmsub231pd|vfmsub132pd|vfmsub213pd|vmaskmovps|vmaskmovpd|vpermilps|vpermilpd|vperm2f128|vzeroall|vzeroupper|vpbroadcastb|vpbroadcastw|vpbroadcastd|vpbroadcastq|vbroadcasti128|vinserti128|vextracti128|vpminud|vpmuludq|vgatherdpd|vgatherqpd|vgatherdps|vgatherqps|vpgatherdd|vpgatherdq|vpgatherqd|vpgatherqq|vpmaskmovd|vpmaskmovq|vpermps|vpermd|vpermpd|vpermq|vperm2i128|vpblendd|vpsllvd|vpsllvq|vpsrlvd|vpsrlvq|vpsravd|vblendmpd|vblendmps|vpblendmd|vpblendmq|vpblendmb|vpblendmw|vpcmpd|vpcmpud|vpcmpq|vpcmpuq|vpcmpb|vpcmpub|vpcmpw|vpcmpuw|vptestmd|vptestmq|vptestnmd|vptestnmq|vptestmb|vptestmw|vptestnmb|vptestnmw|vcompresspd|vcompressps|vpcompressd|vpcompressq|vexpandpd|vexpandps|vpexpandd|vpexpandq|vpermb|vpermw|vpermt2b|vpermt2w|vpermi2pd|vpermi2ps|vpermi2d|vpermi2q|vpermi2b|vpermi2w|vpermt2ps|vpermt2pd|vpermt2d|vpermt2q|vshuff32x4|vshuff64x2|vshuffi32x4|vshuffi64x2|vpmultishiftqb|vpternlogd|vpternlogq|vpmovqd|vpmovsqd|vpmovusqd|vpmovqw|vpmovsqw|vpmovusqw|vpmovqb|vpmovsqb|vpmovusqb|vpmovdw|vpmovsdw|vpmovusdw|vpmovdb|vpmovsdb|vpmovusdb|vpmovwb|vpmovswb|vpmovuswb|vcvtps2udq|vcvtpd2udq|vcvttps2udq|vcvttpd2udq|vcvtss2usi|vcvtsd2usi|vcvttss2usi|vcvttsd2usi|vcvtps2qq|vcvtpd2qq|vcvtps2uqq|vcvtpd2uqq|vcvttps2qq|vcvttpd2qq|vcvttps2uqq|vcvttpd2uqq|vcvtudq2ps|vcvtudq2pd|vcvtusi2ps|vcvtusi2pd|vcvtusi2sd|vcvtusi2ss|vcvtuqq2ps|vcvtuqq2pd|vcvtqq2pd|vcvtqq2ps|vgetexppd|vgetexpps|vgetexpsd|vgetexpss|vgetmantpd|vgetmantps|vgetmantsd|vgetmantss|vfixupimmpd|vfixupimmps|vfixupimmsd|vfixupimmss|vrcp14pd|vrcp14ps|vrcp14sd|vrcp14ss|vrndscaleps|vrndscalepd|vrndscaless|vrndscalesd|vrsqrt14pd|vrsqrt14ps|vrsqrt14sd|vrsqrt14ss|vscalefps|vscalefpd|vscalefss|vscalefsd|valignd|valignq|vdbpsadbw|vpabsq|vpmaxsq|vpmaxuq|vpminsq|vpminuq|vprold|vprolvd|vprolq|vprolvq|vprord|vprorvd|vprorq|vprorvq|vpscatterdd|vpscatterdq|vpscatterqd|vpscatterqq|vscatterdps|vscatterdpd|vscatterqps|vscatterqpd|vpconflictd|vpconflictq|vplzcntd|vplzcntq|vpbroadcastmb2q|vpbroadcastmw2d|vexp2pd|vexp2ps|vrcp28pd|vrcp28ps|vrcp28sd|vrcp28ss|vrsqrt28pd|vrsqrt28ps|vrsqrt28sd|vrsqrt28ss|vgatherpf0dps|vgatherpf0qps|vgatherpf0dpd|vgatherpf0qpd|vgatherpf1dps|vgatherpf1qps|vgatherpf1dpd|vgatherpf1qpd|vscatterpf0dps|vscatterpf0qps|vscatterpf0dpd|vscatterpf0qpd|vscatterpf1dps|vscatterpf1qps|vscatterpf1dpd|vscatterpf1qpd|vfpclassps|vfpclasspd|vfpclassss|vfpclasssd|vrangeps|vrangepd|vrangess|vrangesd|vreduceps|vreducepd|vreducess|vreducesd|vpmovm2d|vpmovm2q|vpmovm2b|vpmovm2w|vpmovd2m|vpmovq2m|vpmovb2m|vpmovw2m|vpmullq|vpmadd52luq|vpmadd52huq|v4fmaddps|v4fmaddss|v4fnmaddps|v4fnmaddss|vp4dpwssd|vp4dpwssds|vpdpbusd|vpdpbusds|vpdpwssd|vpdpwssds|vpcompressb|vpcompressw|vpexpandb|vpexpandw|vpshld|vpshldv|vpshrd|vpshrdv|vpopcntd|vpopcntq|vpopcntb|vpopcntw|vpshufbitqmb|gf2p8affineinvqb|gf2p8affineqb|gf2p8mulb|vpclmulqdq|vaesdec|vaesdeclast|vaesenc|vaesenclast)[ \t]/'
- How to Write Fast Numerical Code 263-2300 (ETH, CS)
- Software Grundlagen HPC
- GEMM: From Pure C to SSE Optimized Micro Kernels
| Header | Instruction set |
| --- | --- |
| `<x86intrin.h>` | everything below (GCC/Clang umbrella header) |
| `<mmintrin.h>` | MMX |
| `<xmmintrin.h>` | SSE |
| `<emmintrin.h>` | SSE2 |
| `<pmmintrin.h>` | SSE3 |
| `<tmmintrin.h>` | SSSE3 |
| `<smmintrin.h>` | SSE4.1 |
| `<nmmintrin.h>` | SSE4.2 |
| `<ammintrin.h>` | SSE4A |
| `<wmmintrin.h>` | AES |
| `<immintrin.h>` | AVX |
| `<zmmintrin.h>` | AVX-512 (include `<immintrin.h>` instead of this one directly) |
- Will enabling -msse, -msse2 and -mfpmath=sse always make my program run faster?
- x87 and SSE Floating Point Assists in IA-32: Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ)
// Extract individual lanes of a __m128 (lane 0 is the lowest element).
// _mm_cvtss_f32 reads lane 0, so shuffle the wanted lane down first.
inline float get_first( const __m128 vec){return _mm_cvtss_f32(vec);}
inline float get_second(const __m128 vec){return _mm_cvtss_f32(_mm_shuffle_ps(vec,vec, _MM_SHUFFLE(1,1,1,1)));}
inline float get_third( const __m128 vec){return _mm_cvtss_f32(_mm_shuffle_ps(vec,vec, _MM_SHUFFLE(2,2,2,2)));}
inline float get_fourth(const __m128 vec){return _mm_cvtss_f32(_mm_shuffle_ps(vec,vec, _MM_SHUFFLE(3,3,3,3)));}
- https://stackoverflow.com/questions/14367553/sse-convert-m128-to-float
- Is it possible to cast floats directly to __m128 if they are 16 byte aligned?
__m128 sseval;
float a, b, c, d;
sseval = _mm_set_ps(a, b, c, d);  // arguments listed high lane to low: d lands in lane 0
sseval = _mm_setr_ps(a, b, c, d); // reversed order: a lands in lane 0
sseval = _mm_load_ps(&a);         // ill-specified here - "a" not float[] ...
                                  // with an actual array this is the same as
                                  // _mm_setr_ps(a[0], a[1], a[2], a[3])
sseval = _mm_set1_ps(a); // make vector from [ a, a, a, a ]
sseval = _mm_load1_ps(&a); // load from &a, replicate - same as previous
sseval = _mm_set_ss(a); // make vector from [ a, 0, 0, 0 ]
sseval = _mm_load_ss(&a); // load from &a, zero others - same as prev
// SSE4.1: _mm_testz_si128(x, x) returns nonzero iff x is all zeros
_mm_testz_si128
// SSE2-compatible
inline bool isAllZeros(__m128i xmm) {
return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == 0xFFFF;
}
res = _mm256_testz_si256(_mm256_castpd_si256(a), _mm256_castpd_si256(a)); // nonzero iff every bit of a is zero
- Fastest way to test if any element in __m256 is non zero [duplicate]
- Intel SIMD - How can I check if an __m256* contains any non-zero values
__m256 vcmp = _mm256_cmp_ps(*pSrc1, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
int mask = _mm256_movemask_ps(vcmp); // bit i set iff element i == 0.0f
bool any_nz = mask != 0xff;          // 0xff would mean all eight elements are zero
// Matrix given as four column vectors: broadcast each element of v, then multiply-add.
__m128 m4x4v_colSSE(const __m128 cols[4], const __m128 v) {
__m128 u1 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(0,0,0,0));
__m128 u2 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(1,1,1,1));
__m128 u3 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(2,2,2,2));
__m128 u4 = _mm_shuffle_ps(v,v, _MM_SHUFFLE(3,3,3,3));
__m128 prod1 = _mm_mul_ps(u1, cols[0]);
__m128 prod2 = _mm_mul_ps(u2, cols[1]);
__m128 prod3 = _mm_mul_ps(u3, cols[2]);
__m128 prod4 = _mm_mul_ps(u4, cols[3]);
return _mm_add_ps(_mm_add_ps(prod1, prod2), _mm_add_ps(prod3, prod4));
}
// Matrix given as four row vectors: multiply, then horizontal adds (SSE3 haddps).
__m128 m4x4v_rowSSE3(const __m128 rows[4], const __m128 v) {
__m128 prod1 = _mm_mul_ps(rows[0], v);
__m128 prod2 = _mm_mul_ps(rows[1], v);
__m128 prod3 = _mm_mul_ps(rows[2], v);
__m128 prod4 = _mm_mul_ps(rows[3], v);
return _mm_hadd_ps(_mm_hadd_ps(prod1, prod2), _mm_hadd_ps(prod3, prod4));
}
// Row vectors with the SSE4.1 dot-product instruction (dpps).
__m128 m4x4v_rowSSE4(const __m128 rows[4], const __m128 v) {
__m128 prod1 = _mm_dp_ps (rows[0], v, 0xFF);
__m128 prod2 = _mm_dp_ps (rows[1], v, 0xFF);
__m128 prod3 = _mm_dp_ps (rows[2], v, 0xFF);
__m128 prod4 = _mm_dp_ps (rows[3], v, 0xFF);
return _mm_shuffle_ps(_mm_movelh_ps(prod1, prod2), _mm_movelh_ps(prod3, prod4), _MM_SHUFFLE(2, 0, 2, 0));
}
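A usage sketch for the routines above (my own example, not from the linked answer):

#include <xmmintrin.h>

// Hypothetical caller: m is a column-major 4x4 matrix; m, v4 and out must be
// 16-byte aligned for the aligned load/store intrinsics used here.
void mul_example(const float m[16], const float v4[4], float out[4]) {
    const __m128 cols[4] = { _mm_load_ps(m + 0), _mm_load_ps(m + 4),
                             _mm_load_ps(m + 8), _mm_load_ps(m + 12) };
    _mm_store_ps(out, m4x4v_colSSE(cols, _mm_load_ps(v4)));
}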
- Matrix intersection macro
- _MM_TRANSPOSE4_PS causes compiler errors in GCC?
- https://gist.github.com/reinsteam/d12f65c6b02613ae7218
static void transpose_4x4_ver0(__m128 & v0, __m128 & v1, __m128 & v2, __m128 & v3)
{
__m128 a0 = _mm_unpacklo_ps(v0, v1); /* a0 = { x0, x1, y0, y1 } */
__m128 a1 = _mm_unpackhi_ps(v0, v1); /* a1 = { z0, z1, w0, w1 } */
__m128 a2 = _mm_unpacklo_ps(v2, v3); /* a2 = { x2, x3, y2, y3 } */
__m128 a3 = _mm_unpackhi_ps(v2, v3); /* a3 = { z2, z3, w2, w3 } */
v0 = _mm_movelh_ps(a0, a2); /* v0 = { x0, x1, x2, x3 } */
v1 = _mm_movehl_ps(a2, a0); /* v1 = { y0, y1, y2, y3 } */
v2 = _mm_movelh_ps(a1, a3); /* v2 = { z0, z1, z2, z3 } */
v3 = _mm_movehl_ps(a3, a1); /* v3 = { w0, w1, w2, w3 } */
}
static void transpose_4x4_ver1(__m128 & v0, __m128 & v1, __m128 & v2, __m128 & v3)
{
__m128 a0 = _mm_unpacklo_ps(v0, v2); /* a0 = { x0, x2, y0, y2 } */
__m128 a1 = _mm_unpacklo_ps(v1, v3); /* a1 = { x1, x3, y1, y3 } */
__m128 a2 = _mm_unpackhi_ps(v0, v2); /* a2 = { z0, z2, w0, w2 } */
__m128 a3 = _mm_unpackhi_ps(v1, v3); /* a3 = { z1, z3, w1, w3 } */
v0 = _mm_unpacklo_ps(a0, a1); /* v0 = { x0, x1, x2, x3 } */
v1 = _mm_unpackhi_ps(a0, a1); /* v1 = { y0, y1, y2, y3 } */
v2 = _mm_unpacklo_ps(a2, a3); /* v2 = { z0, z1, z2, z3 } */
v3 = _mm_unpackhi_ps(a2, a3); /* v3 = { w0, w1, w2, w3 } */
}
- How to use the SSE 4.1 DPPS instruction (dot product packed single)
- Intel AVX: 256-bits version of dot product for double precision floating point variables
// 4 dot products at one time.
__m256d xy0 = _mm256_mul_pd( x[0], y[0] );
__m256d xy1 = _mm256_mul_pd( x[1], y[1] );
__m256d xy2 = _mm256_mul_pd( x[2], y[2] );
__m256d xy3 = _mm256_mul_pd( x[3], y[3] );
// low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
__m256d temp01 = _mm256_hadd_pd( xy0, xy1 );
// low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
__m256d temp23 = _mm256_hadd_pd( xy2, xy3 );
// low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
__m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 );
// low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
__m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
__m256d dotproduct = _mm256_add_pd( swapped, blended );
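To pull the four scalars out of dotproduct (a minimal follow-up sketch; an unaligned store is assumed acceptable):

double results[4]; // results[i] = dot(x_i, y_i)
_mm256_storeu_pd(results, dotproduct);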
- Fast 4x4 Matrix Inverse with SSE SIMD, Explained
- https://github.com/niswegmann/small-matrix-inverse
- SSE - Matrix inverse with cramer 4x4, How do extends NxN?
inline __m128 abs_ps(__m128 x) {
static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
return _mm_andnot_ps(sign_mask, x);
}
inline __m128d abs_pd(__m128d x) {
static const __m128d sign_mask = _mm_set1_pd(-0.); // -0. = 1 << 63
return _mm_andnot_pd(sign_mask, x); // !sign_mask & x
}
#if defined (__AVX__)
inline __m256d abs_pd(const __m256d& x)
{
static const __m256d sign_mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
return _mm256_and_pd(sign_mask, x);
}
#endif
- Flipping sign on packed SSE floats
- How to negate (change sign) of the floating point elements in a __m128 type variable?
_mm_xor_ps(vec, _mm_set1_ps(-0.f))
- How to use Fused Multiply-Add (FMA) instructions with SSE/AVX
$ gcc -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX" | sort
#define __AVX__ 1
#define __AVX2__ 1
#define __SSE__ 1
#define __SSE2__ 1
#define __SSE2_MATH__ 1
#define __SSE3__ 1
#define __SSE4_1__ 1
#define __SSE4_2__ 1
#define __SSE_MATH__ 1
#define __SSSE3__ 1
// MSVC defines __AVX2__ but not __FMA__; AVX2-capable CPUs all support FMA:
#if !defined(__FMA__) && defined(__AVX2__)
#define __FMA__ 1
#endif
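A minimal sketch of using the FMA intrinsics under that guard (my example): one fused multiply-add per double lane.

#include <immintrin.h>

#ifdef __FMA__
// r = a*b + c with a single rounding per lane (vfmadd213pd or similar)
static inline __m256d fma_example(__m256d a, __m256d b, __m256d c) {
    return _mm256_fmadd_pd(a, b, c);
}
#endif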
__attribute__((aligned(16))) float vec[4] = { 1.0f, 1.1f, 1.2f, 1.3f };
__m128 v = _mm_load_ps(vec); // vec is an array and decays to a pointer, so no "&" is needed
// Combine two __m128 into one __m256: low half = a, high half = b.
__m128 a = _mm_set_ps(1,2,3,4);
__m128 b = _mm_set_ps(5,6,7,8);
__m256 c = _mm256_castps128_ps256(a);
c = _mm256_insertf128_ps(c,b,1);
- Packing and de-interleaving two __m256 registers
- AVX2, How to Efficiently Load Four Integers to Even Indices of a 256 Bit Register and Copy to Odd Indices?
- Checking If A Vector Contains Any Element Greater Than Zero
- Could I compare to zero register in avx correctly?
#include <immintrin.h>
#include <stdbool.h>
bool vec_equal(__m256i a, __m256i b) {
__m256i pcmp = _mm256_cmpeq_epi32(a, b); // epi8 is fine too
unsigned bitmask = _mm256_movemask_epi8(pcmp);
return (bitmask == 0xffffffffU);
}
/* Compare */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
- Shuffle AVX 256 Vector elements by 1 position left/right - C intrinsics
- How can I exchange the low 128 bits and high 128 bits in a 256 bit AVX (YMM) register (see the sketch after this list)
- What's the difference between vextracti128 and vextractf128?
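For the half-exchange question, a minimal sketch (assuming AVX; imm8 = 1 picks the source's high lane for the low half and its low lane for the high half):

#include <immintrin.h>

// Swap the low and high 128-bit halves of a __m256
static inline __m256 swap_halves(__m256 v) {
    return _mm256_permute2f128_ps(v, v, 1);
}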
int FilterFloats_Reference(const float input[], float output[], int count, float limit)
{
float *outputp = output;
for (int i = 0; i < count; ++i) {
if (input[i] >= limit)
*outputp++ = input[i];
}
return (int) (outputp - output);
}
for (int i = 0; i < count; i += 4) {
__m128 val = _mm_load_ps(input + i);
__m128 mask = _mm_cmpge_ps(val, _mm_set1_ps(limit));
__m128 result = LeftPack(mask, val);
_mm_storeu_ps(output, result);
output += _mm_popcnt_u32(_mm_movemask_ps(mask)); // advance by the number of elements kept
}
__m128 LeftPack_SSSE3(__m128 mask, __m128 val)
{
// Move 4 sign bits of mask to 4-bit integer value.
int idx = _mm_movemask_ps(mask);
// Select shuffle control data
__m128i shuf_ctrl = _mm_load_si128(&shufmasks[idx]);
// Permute to move valid values to front of SIMD register
__m128i packed = _mm_shuffle_epi8(_mm_castps_si128(val), shuf_ctrl);
return _mm_castsi128_ps(packed);
}
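The shufmasks table itself is not shown in the snippet; a plausible construction (my assumption, not the original author's code) that moves the kept lanes to the front and zeroes the rest:

#include <stdint.h>
#include <emmintrin.h>

static __m128i shufmasks[16];

static void init_shufmasks(void) {
    for (int m = 0; m < 16; ++m) {
        uint8_t ctrl[16];
        int out = 0;
        for (int lane = 0; lane < 4; ++lane)   // kept lanes, in order
            if (m & (1 << lane))
                for (int b = 0; b < 4; ++b)
                    ctrl[out++] = (uint8_t)(4 * lane + b);
        while (out < 16)                       // 0x80 makes pshufb write zero
            ctrl[out++] = 0x80;
        shufmasks[m] = _mm_loadu_si128((const __m128i *)ctrl);
    }
}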
// g_Masks: precomputed table of 16 entries x 8 floats (two masks per movemask
// value); its contents are not shown in the original answer.
__m128 PackLeft_SSE2(__m128 mask, __m128 val)
{
int valid = _mm_movemask_ps(mask);
__m128 mask0 = _mm_load_ps((float *)(&g_Masks[valid][0]));
__m128 mask1 = _mm_load_ps((float *)(&g_Masks[valid][4]));
__m128 s0 = _mm_shuffle_ps(val, val, _MM_SHUFFLE(0, 3, 2, 1));
__m128 r0 = _mm_or_ps(_mm_and_ps(mask0, s0), _mm_andnot_ps(mask0, val));
__m128 s1 = _mm_shuffle_ps(r0, r0, _MM_SHUFFLE(1, 0, 3, 2));
__m128 r1 = _mm_or_ps(_mm_and_ps(mask1, s1), _mm_andnot_ps(mask1, r0));
return r1;
}
- http://lists.llvm.org/pipermail/llvm-bugs/2011-April/017680.html
- https://bugs.llvm.org/show_bug.cgi?id=9665
- Can the compiler (gcc) make guarantees about C++ loops?
- https://software.intel.com/en-us/forums/intel-c-compiler/topic/287217
To use any of the SIMD technologies optimally, you must evaluate the following situations in your code:
- Fragments that are computationally intensive
- Fragments that are executed often enough to have an impact on performance
- Fragments with little data-dependent control flow
- Fragments that require floating-point computations
- Fragments that can benefit from moving data 16 bytes at a time
- Fragments of computation that can be coded using fewer instructions
- Fragments that require help in using the cache hierarchy efficiently
- Functions that use Streaming SIMD Extensions or Streaming SIMD Extensions 2 data need to provide a 16-byte aligned stack frame.
- __m128* parameters need to be aligned to 16-byte boundaries, possibly creating “holes” (due to padding) in the argument block.
/* Make newp a pointer to a 64-bit aligned array of NUM_ELEMENTS 64-bit elements. */
double *p, *newp;
p = (double *)malloc(sizeof(double) * (NUM_ELEMENTS + 1));
newp = (double *)(((uintptr_t)p + 7) & ~(uintptr_t)7); /* round up to 8 bytes */
union {
float f[400];
__m128 m[100];
} buffer;
struct __declspec(align(16)) my_m128
{
float f[4];
};
class my_m128 {
union {
__m128 m;
float f[4];
};
};
The recommended way for computing data in AoS format is to swizzle each set of elements to SoA format before processing it using SIMD technologies. Swizzling can either be done dynamically during program execution or statically when the data structures are generated.
Note that SoA can have the disadvantage of requiring more independent memory stream references. This can require the use of more prefetches, additional address generation calculations, as well as having a greater impact on DRAM page access efficiency.
NumOfGroups = NumOfVertices/SIMDwidth
typedef struct{
float x[SIMDwidth];
float y[SIMDwidth];
float z[SIMDwidth];
} VerticesCoordList;
typedef struct{
int a[SIMDwidth];
int b[SIMDwidth];
int c[SIMDwidth];
} VerticesColorList;
VerticesCoordList VerticesCoord[NumOfGroups];
VerticesColorList VerticesColor[NumOfGroups];
The hybrid SoA approach ensures:
- Data is organized to enable more efficient vertical SIMD computation
- Simpler/less address generation than AoS
- Fewer streams, which reduces DRAM page misses
- Use of fewer prefetches, due to fewer streams
- Efficient cache line packing of data elements that are used concurrently.
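A minimal sketch of consuming one hybrid-SoA group with SSE (my example; assumes SIMDwidth == 4 and 16-byte aligned storage):

#include <xmmintrin.h>

// Squared lengths of four vertices at once, straight from the SoA layout.
__m128 length_squared(const VerticesCoordList *g) {
    __m128 x = _mm_load_ps(g->x);
    __m128 y = _mm_load_ps(g->y);
    __m128 z = _mm_load_ps(g->z);
    return _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)),
                      _mm_mul_ps(z, z));
}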
First introduced for vectorizers, this technique consists of the generation of code when each vector operation is done for a size less than or equal to the maximum vector length on a given vector machine. By fragmenting a large loop into smaller segments or strips, this technique transforms the loop structure by:
- Increasing the temporal and spatial locality in the data cache if the data are reusable in different passes of an algorithm.
- Reducing the number of iterations of the loop by a factor of the length of each “vector,” or number of operations being performed per SIMD operation.
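A minimal strip-mining sketch (my example, not from the source; assumes a 4-float SIMD width and n a multiple of 4). The manual's blocking example follows.

#include <xmmintrin.h>

void scale(float *a, int n, float s) {
    const __m128 vs = _mm_set1_ps(s);
    for (int i = 0; i < n; i += 4)   /* each strip = one SIMD operation */
        _mm_storeu_ps(a + i, _mm_mul_ps(_mm_loadu_ps(a + i), vs));
}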
// A. Original Loop
float A[MAX][MAX], B[MAX][MAX];
for (i = 0; i < MAX; i++) {
    for (j = 0; j < MAX; j++) {
        A[i][j] = A[i][j] + B[j][i];
    }
}
// B. Transformed Loop after Blocking
float A[MAX][MAX], B[MAX][MAX];
for (i = 0; i < MAX; i += block_size) {
    for (j = 0; j < MAX; j += block_size) {
        for (ii = i; ii < i + block_size; ii++) {
            for (jj = j; jj < j + block_size; jj++) {
                A[ii][jj] = A[ii][jj] + B[jj][ii];
            }
        }
    }
}
If MAX is huge, loop blocking can also help reduce the penalty from DTLB (data translation look-aside buffer) misses. In addition to improving the cache/memory performance, this optimization technique also saves external bus bandwidth.
One barrier to SIMD computation can be the existence of data-dependent branches. Conditional moves can be used to eliminate data-dependent branches.
// High-level code:
__declspec(align(16)) short A[MAX_ELEMENT], B[MAX_ELEMENT], C[MAX_ELEMENT],
D[MAX_ELEMENT], E[MAX_ELEMENT];
for (i=0; i<MAX_ELEMENT; i++) {
if (A[i] > B[i]) {
C[i] = D[i];
} else {
C[i] = E[i];
}
}
; MMX assembly code processes 4 short values per iteration:
xor eax, eax
top_of_loop:
movq mm0, [A + eax]
pcmpgtw mm0, [B + eax] ; Create compare mask
movq mm1, [D + eax]
pand mm1, mm0          ; Drop elements where A<=B
pandn mm0, [E + eax]   ; Drop elements where A>B
por mm0, mm1           ; Combine into a single result
movq [C + eax], mm0
add eax, 8
cmp eax, MAX_ELEMENT*2
jl top_of_loop
; SSE4.1 assembly processes 8 short values per iteration:
xor eax, eax
top_of_loop:
movdqa xmm0, [A + eax]
pcmpgtw xmm0, [B + eax]        ; Create compare mask
movdqa xmm1, [E + eax]
pblendvb xmm1, [D + eax], xmm0 ; Select D where A>B, keep E elsewhere
movdqa [C + eax], xmm1
add eax, 16
cmp eax, MAX_ELEMENT*2
jl top_of_loop
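The same selection expressed with intrinsics (my sketch, not from the manual; requires SSE4.1 and 16-byte aligned arrays):

#include <smmintrin.h>

void select_blendv(const short *A, const short *B, const short *D,
                   const short *E, short *C, int n /* multiple of 8 */) {
    for (int i = 0; i < n; i += 8) {
        __m128i a    = _mm_load_si128((const __m128i *)&A[i]);
        __m128i mask = _mm_cmpgt_epi16(a, _mm_load_si128((const __m128i *)&B[i]));
        __m128i e    = _mm_load_si128((const __m128i *)&E[i]);
        __m128i d    = _mm_load_si128((const __m128i *)&D[i]);
        // blendv takes d where the mask's byte high bits are set (A > B)
        _mm_store_si128((__m128i *)&C[i], _mm_blendv_epi8(e, d, mask));
    }
}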
Recommendation: When targeting code generation for Intel Core Solo and Intel Core Duo processors, favor instructions consisting of two μops over those with more than two μops.

Recommendation: With the proliferation of 128-bit SIMD hardware in Intel Core microarchitecture and Enhanced Intel Core microarchitecture, integer SIMD code written using MMX instructions should consider more efficient implementations using 128-bit SIMD instructions.