一次元配列同士の加算、C++、SIMD(AVX-512)

一次元配列同士を加算する。C++とAVX-512を用いた例を示す。

C++で記述

まず、AVX-512 intrinsicを使い、32ビット整数16要素のベクトル同士を1回の加算命令で処理する、ごく単純な例をC++で示す。

#include <immintrin.h>
#include "../common.h"

// main
// Adds two 512-bit vectors, each holding sixteen 32-bit integers, with a single
// AVX-512 add and prints both operands followed by their sum.
int main(void)
{
    const int Lanes = sizeof(__m512i) / sizeof(int);

    // _mm512_set_epi32 takes its arguments from the highest lane (15) down to lane 0.
    const __m512i a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    const __m512i b = _mm512_set_epi32(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26);

    const __m512i c = _mm512_add_epi32(a, b);

    // Copy every vector into a plain int array before printing. Reading a
    // __m512i object through a casted int* is type punning that standard C++
    // does not guarantee; an explicit store is well defined.
    int lanesA[Lanes];
    int lanesB[Lanes];
    int lanesC[Lanes];
    _mm512_storeu_si512(lanesA, a);
    _mm512_storeu_si512(lanesB, b);
    _mm512_storeu_si512(lanesC, c);

    printData(lanesA);
    printData(lanesB);
    printData(lanesC);

    return 0;
}


 実行結果

C:\>cl /EHsc add.cpp
C:\>add
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42


AVX-512 intrinsicで記述、intの例。

alignmentの揃っていない、32ビット整数の一次元配列同士の加算をAVX-512で処理する。

#include <immintrin.h>
#include "../common.h"

// main
// Adds two 32-bit integer arrays with AVX-512 using unaligned loads/stores and
// checks the SIMD result against a plain scalar loop.
int main(void)
{
    const int ArrLen = 256;

    int* a = new int[ArrLen];
    int* b = new int[ArrLen];
    int* c = new int[ArrLen];   // SIMD result
    int* r = new int[ArrLen];   // scalar reference result

    initData(ArrLen, a, b);

    // by simd: every iteration processes one 512-bit vector (units elements).
    const int units = sizeof(__m512i) / sizeof(int);
    static_assert(ArrLen % units == 0, "ArrLen must be a multiple of the vector width");
    for (size_t i = 0; i < ArrLen / units; i++)
    {
        // loadu/storeu: memory from new[] is not guaranteed to be 64-byte aligned.
        __m512i ia = _mm512_loadu_epi32(&a[i * units]);
        __m512i ib = _mm512_loadu_epi32(&b[i * units]);
        __m512i y = _mm512_add_epi32(ia, ib);
        _mm512_storeu_epi32(&c[i * units], y);
    }

    // by C++
    for (size_t i = 0; i < ArrLen; i++)
    {
        r[i] = a[i] + b[i];
    }

    verifyVVec(ArrLen, c, r);

    // Each array needs its own delete[]; "delete[] a, b, c, r;" is a comma
    // expression that frees only a.
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] r;

    return 0;
}


 実行結果(SIMDで求めた結果とC++で求めた結果が一致していれば、何も表示されない)

C:\>cl /EHsc vAdd512i.cpp
C:\>vAdd512i


AVX-512 intrinsicで記述、floatの例。

alignmentの揃っていない、単精度浮動小数点の一次元配列同士の加算をAVX-512で処理する。

#include <immintrin.h>
#include "../common.h"

// main
// Adds two single-precision float arrays with AVX-512 using unaligned
// loads/stores and checks the SIMD result against a plain scalar loop.
int main(void)
{
    const int ArrLen = 256;

    float* a = new float[ArrLen];
    float* b = new float[ArrLen];
    float* c = new float[ArrLen];   // SIMD result
    float* r = new float[ArrLen];   // scalar reference result

    initData(ArrLen, a, b);

    // by simd: every iteration processes one 512-bit vector (units elements).
    // The width is derived from the float vector type that is actually used.
    const int units = sizeof(__m512) / sizeof(float);
    static_assert(ArrLen % units == 0, "ArrLen must be a multiple of the vector width");
    for (size_t i = 0; i < ArrLen / units; i++)
    {
        // loadu/storeu: memory from new[] is not guaranteed to be 64-byte aligned.
        __m512 ia = _mm512_loadu_ps(&a[i * units]);
        __m512 ib = _mm512_loadu_ps(&b[i * units]);
        __m512 y = _mm512_add_ps(ia, ib);
        _mm512_storeu_ps(&c[i * units], y);
    }

    // by C++
    for (size_t i = 0; i < ArrLen; i++)
    {
        r[i] = a[i] + b[i];
    }

    verifyVVec(ArrLen, c, r);

    // Each array needs its own delete[]; "delete[] a, b, c, r;" is a comma
    // expression that frees only a.
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] r;

    return 0;
}


AVX-512 intrinsicで記述、aligned intの例。

alignmentされている、32ビット整数の一次元配列同士の加算をAVX-512で処理する。

#include <immintrin.h>
#include "../common.h"

// main
// Aligned variant: the inputs and the SIMD output live in 64-byte aligned
// buffers, so whole __m512i values are read and written directly. The result
// is compared with a scalar loop.
int main(void)
{
    const int AlignSize = 64;
    const int ArrLen = 256;
    const size_t bufBytes = sizeof(int) * ArrLen;

    int* a = static_cast<int*>(_mm_malloc(bufBytes, AlignSize));
    int* b = static_cast<int*>(_mm_malloc(bufBytes, AlignSize));
    int* c = static_cast<int*>(_mm_malloc(bufBytes, AlignSize));
    int* r = new int[ArrLen];

    initData(ArrLen, a, b);

    // by simd: view the aligned inputs as arrays of 512-bit vectors.
    const int units = sizeof(__m512i) / sizeof(int);
    const size_t vecCount = ArrLen / units;
    const __m512i* va = reinterpret_cast<const __m512i*>(a);
    const __m512i* vb = reinterpret_cast<const __m512i*>(b);
    for (size_t blk = 0; blk < vecCount; ++blk)
    {
        const __m512i sum = _mm512_add_epi32(va[blk], vb[blk]);
        _mm512_store_epi32(c + blk * units, sum);
    }

    // by C++
    for (size_t k = 0; k < ArrLen; ++k)
        r[k] = a[k] + b[k];

    verifyVVec(ArrLen, c, r);

    // Buffers from _mm_malloc are released with _mm_free; r came from new[].
    _mm_free(a);
    _mm_free(b);
    _mm_free(c);
    delete[] r;

    return 0;
}


AVX-512 intrinsicで記述、aligned floatの例。

alignmentされている、単精度浮動小数点の一次元配列同士の加算をAVX-512で処理する。

#include <immintrin.h>
#include "../common.h"

// main
// Aligned variant for single-precision floats: the inputs and the SIMD output
// live in 64-byte aligned buffers, so whole __m512 values are read and written
// directly. The result is compared with a scalar loop.
int main(void)
{
    const int AlignSize = 64;
    const int ArrLen = 256;
    const size_t bufBytes = sizeof(float) * ArrLen;

    float* a = static_cast<float*>(_mm_malloc(bufBytes, AlignSize));
    float* b = static_cast<float*>(_mm_malloc(bufBytes, AlignSize));
    float* c = static_cast<float*>(_mm_malloc(bufBytes, AlignSize));
    float* r = new float[ArrLen];

    initData(ArrLen, a, b);

    // by simd: view the aligned inputs as arrays of 512-bit vectors.
    const int units = sizeof(__m512) / sizeof(float);
    const size_t vecCount = ArrLen / units;
    const __m512* va = reinterpret_cast<const __m512*>(a);
    const __m512* vb = reinterpret_cast<const __m512*>(b);
    for (size_t blk = 0; blk < vecCount; ++blk)
    {
        const __m512 sum = _mm512_add_ps(va[blk], vb[blk]);
        _mm512_store_ps(c + blk * units, sum);
    }

    // by C++
    for (size_t k = 0; k < ArrLen; ++k)
        r[k] = a[k] + b[k];

    verifyVVec(ArrLen, c, r);

    // Buffers from _mm_malloc are released with _mm_free; r came from new[].
    _mm_free(a);
    _mm_free(b);
    _mm_free(c);
    delete[] r;

    return 0;
}


共通関数

#include <iostream>
#include <iomanip>
#include <random>

using namespace std;

// print
// Prints the lanes of one 512-bit vector that has been laid out in memory as
// an array of T, from the highest index down to index 0, separated by commas
// and terminated by a newline.
//
// a: points to at least sizeof(__m512i) / sizeof(T) readable elements.
//
// The element count is derived from T, so 8-byte types print 8 lanes and
// 4-byte types print 16. Values are right-aligned in a width of 3 and printed
// in fixed notation with no decimals. __m512i must already be declared where
// this template is defined (the callers include <immintrin.h> first).
template <typename T>
void printData(T a[])
{
    const std::size_t count = sizeof(__m512i) / sizeof(T);

    std::cout << std::right << std::fixed << std::setprecision(0);
    for (std::size_t i = count; i-- > 0;)
    {
        // The separator goes between elements only, so no trailing comma has to
        // be erased with backspace characters (which would end up as raw bytes
        // in redirected output).
        if (i + 1 != count)
            std::cout << ",";
        std::cout << std::setw(3) << a[i];
    }
    std::cout << std::endl;
}

// verify
// Compares c and r element by element over the first `length` entries and
// writes one line to std::cout for every index at which they differ. Nothing
// is printed when the two arrays agree.
template <typename T>
void verifyVVec(const size_t length, const T* c, T* r)
{
    std::cout.setf(std::ios::right);
    for (size_t idx = 0; idx != length; ++idx)
    {
        const bool mismatch = c[idx] != r[idx];
        if (!mismatch)
            continue;

        // Report both values with two decimals, each right-aligned in width 10.
        std::cout << std::fixed << std::setprecision(2);
        std::cout << "c[" << idx << "] =" << std::setw(10) << c[idx] << ", ";
        std::cout << "r[" << idx << "] =" << std::setw(10) << r[idx] << std::endl;
    }
}

// random
// Returns a uniformly distributed random integer in the closed range
// [low, high].
//
// The engine is created and seeded from std::random_device once per thread and
// then reused. Constructing a random_device and a fresh engine on every call
// is needlessly expensive when the function is called once per array element.
inline int genRandom(int low, int high)
{
    thread_local std::default_random_engine eng{std::random_device{}()};
    std::uniform_int_distribution<int> distr(low, high);

    return distr(eng);
}

// init
// Fills the first `length` elements of both a and b with values obtained from
// genRandom(-100, 100), converted to T. Elements are written in the order
// a[0], b[0], a[1], b[1], ...
template <typename T>
void initData(const size_t length, T* a, T* b)
{
    size_t pos = 0;
    while (pos < length)
    {
        a[pos] = static_cast<T>(genRandom(-100, 100));
        b[pos] = static_cast<T>(genRandom(-100, 100));
        ++pos;
    }
}

// init
// Single-array overload: fills the first `length` elements of a with values
// obtained from genRandom(-100, 100), converted to T.
template <typename T>
void initData(const size_t length, T* a)
{
    size_t pos = 0;
    while (pos < length)
    {
        a[pos] = static_cast<T>(genRandom(-100, 100));
        ++pos;
    }
}