一次元配列同士の加算を行う。C++とAVX-512の例を示す。
- C++で記述
- AVX-512 intrinsicで記述、intの例。
- AVX-512 intrinsicで記述、floatの例。
- AVX-512 intrinsicで記述、aligned intの例。
- AVX-512 intrinsicで記述、aligned floatの例。
- 共通関数
C++で記述
ごく普通にC++で記述した例を示す。
#include <immintrin.h> #include "../common.h" //main int main(void) { __m512i a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); __m512i b = _mm512_set_epi32(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26); __m512i c = _mm512_add_epi32(a, b); printData((int*)&a); printData((int*)&b); printData((int*)&c); return 0; }
実行結果
C:\>cl /EHsc add.cpp
C:\>add
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42
AVX-512 intrinsicで記述、intの例。
alignmentの揃っていない、32ビット整数の一次元配列同士の加算をAVX-512で処理する。
#include <immintrin.h> #include "../common.h" // main int main(void) { const int ArrLen = 256; const int AlignSize = 64; int* a = new int[ArrLen]; int* b = new int[ArrLen]; int* c = new int[ArrLen]; int* r = new int[ArrLen]; initData(ArrLen, a, b); // by simd const int units = sizeof(__m512i) / sizeof(int); for (size_t i = 0; i < ArrLen / units; i++) { __m512i ia = _mm512_loadu_epi32(&a[i * units]); __m512i ib = _mm512_loadu_epi32(&b[i * units]); __m512i y = _mm512_add_epi32(ia, ib); _mm512_storeu_epi32(&c[i * units], y); } // by C++ for (size_t i = 0; i < ArrLen; i++) { r[i] = a[i] + b[i]; } verifyVVec(ArrLen, c, r); delete[] a, b, c, r; return 0; }
実行結果
C:\>cl /EHsc vAdd512i.cpp
C:\>vAdd512i
AVX-512 intrinsicで記述、floatの例。
alignmentの揃っていない、単精度浮動小数点の一次元配列同士の加算をAVX-512で処理する。
include <immintrin.h> #include "../common.h" // main int main(void) { const int ArrLen = 256; const int AlignSize = 64; float* a = new float[ArrLen]; float* b = new float[ArrLen]; float* c = new float[ArrLen]; float* r = new float[ArrLen]; initData(ArrLen, a, b); // by simd const int units = sizeof(__m512i) / sizeof(int); for (size_t i = 0; i < ArrLen / units; i++) { __m512 ia = _mm512_loadu_ps(&a[i * units]); __m512 ib = _mm512_loadu_ps(&b[i * units]); __m512 y = _mm512_add_ps(ia, ib); _mm512_storeu_ps(&c[i * units], y); } // by C++ for (size_t i = 0; i < ArrLen; i++) { r[i] = a[i] + b[i]; } verifyVVec(ArrLen, c, r); delete[] a, b, c, r; return 0; }
AVX-512 intrinsicで記述、aligned intの例。
alignmentの揃っている、32ビット整数の一次元配列同士の加算をAVX-512で処理する。
#include <immintrin.h> #include "../common.h" // main int main(void) { const int ArrLen = 256; const int AlignSize = 64; int* a = (int*)_mm_malloc(sizeof(int) * ArrLen, AlignSize); int* b = (int*)_mm_malloc(sizeof(int) * ArrLen, AlignSize); int* c = (int*)_mm_malloc(sizeof(int) * ArrLen, AlignSize); int* r = new int[ArrLen]; initData(ArrLen, a, b); // by simd const int units = sizeof(__m512i) / sizeof(int); __m512i* pa = (__m512i*)a; __m512i* pb = (__m512i*)b; for (size_t i = 0; i < ArrLen / units; i++, pa++, pb++) { __m512i y = _mm512_add_epi32(*pa, *pb); _mm512_store_epi32(&c[i * units], y); } // by C++ for (size_t i = 0; i < ArrLen; i++) { r[i] = a[i] + b[i]; } verifyVVec(ArrLen, c, r); _mm_free(a); _mm_free(b); _mm_free(c); delete[] r; return 0; }
AVX-512 intrinsicで記述、aligned floatの例。
alignmentの揃っている、単精度浮動小数点の一次元配列同士の加算をAVX-512で処理する。
#include <immintrin.h> #include "../common.h" // main int main(void) { const int ArrLen = 256; const int AlignSize = 64; float* a = (float*)_mm_malloc(sizeof(float) * ArrLen, AlignSize); float* b = (float*)_mm_malloc(sizeof(float) * ArrLen, AlignSize); float* c = (float*)_mm_malloc(sizeof(float) * ArrLen, AlignSize); float* r = new float[ArrLen]; initData(ArrLen, a, b); // by simd const int units = sizeof(__m512) / sizeof(float); __m512* pa = (__m512*)a; __m512* pb = (__m512*)b; for (size_t i = 0; i < ArrLen / units; i++, pa++, pb++) { __m512 y = _mm512_add_ps(*pa, *pb); _mm512_store_ps(&c[i * units], y); } // by C++ for (size_t i = 0; i < ArrLen; i++) { r[i] = a[i] + b[i]; } verifyVVec(ArrLen, c, r); _mm_free(a); _mm_free(b); _mm_free(c); delete[] r; return 0; }
共通関数
#ifndef AVX512_SAMPLES_COMMON_H_
#define AVX512_SAMPLES_COMMON_H_

// Shared helpers for the AVX-512 array-addition samples: printing the lanes of
// one 512-bit vector, comparing a result array against a reference array, and
// filling arrays with random test data.

#include <immintrin.h>   // __m512i, used by printData to size its loop

#include <cstddef>
#include <iomanip>
#include <iostream>
#include <random>

using namespace std;   // kept so existing includers that rely on it still compile

// print
// Prints the elements of a[] from the highest index down to index 0, comma
// separated, each right-aligned in a field of width 3, followed by a newline.
// The element count is sizeof(__m512i) / sizeof(float) (i.e. 16), independent
// of T, so a[] must hold at least that many elements.
template <typename T>
void printData(T a[])
{
    const int lanes = static_cast<int>(sizeof(__m512i) / sizeof(float));

    cout.setf(ios::right);
    cout << fixed << setprecision(0);
    for (int i = lanes - 1; i >= 0; i--)
    {
        // Emit the separator between elements instead of erasing a trailing
        // comma with backspaces, which leaves control bytes in redirected output.
        if (i != lanes - 1)
            cout << ",";
        cout << setw(3) << a[i];
    }
    cout << endl;
}

// verify
// Compares c[i] with r[i] for i in [0, length) and prints one line per
// mismatching index showing both values. Prints nothing when all elements
// compare equal. Neither array is modified.
template <typename T>
void verifyVVec(const size_t length, const T* c, const T* r)
{
    cout.setf(ios::right);
    for (size_t i = 0; i < length; i++)
    {
        if (c[i] != r[i])
        {
            cout << fixed << setprecision(2)
                 << "c[" << i << "] =" << setw(10) << c[i] << ", "
                 << "r[" << i << "] =" << setw(10) << r[i] << endl;
        }
    }
}

// random
// Returns a uniformly distributed integer in the closed range [low, high].
int inline genRandom(int low, int high)
{
    // Seed one engine per thread on first use; constructing a random_device
    // and a fresh engine on every call is needlessly expensive.
    thread_local default_random_engine eng{random_device{}()};
    uniform_int_distribution<int> distr(low, high);
    return distr(eng);
}

// init
// Fills a[0..length) and b[0..length) with random integers in [-100, 100]
// converted to T.
template <typename T>
void initData(const size_t length, T* a, T* b)
{
    for (size_t i = 0; i < length; i++)
    {
        a[i] = static_cast<T>(genRandom(-100, 100));
        b[i] = static_cast<T>(genRandom(-100, 100));
    }
}

// init
// Fills a[0..length) with random integers in [-100, 100] converted to T.
template <typename T>
void initData(const size_t length, T* a)
{
    for (size_t i = 0; i < length; i++)
    {
        a[i] = static_cast<T>(genRandom(-100, 100));
    }
}

#endif // AVX512_SAMPLES_COMMON_H_