It requires an efficient data layout, which must be done manually.

Glossary

  • Scalar Operation: a computation that operates on one data element at a time.
  • Vector Operation: a computation that operates on multiple data elements simultaneously.

Example

// compile with -c Release
using System.Numerics;
 
namespace test
{
    /// <summary>
    /// Minimal <see cref="Vector{T}"/> example: element-wise addition of two int arrays.
    /// </summary>
    class P
    {
        static void Main(string[] args)
        {
            int[] a = { 1, 2, 3, 4, 5, 6, 7, 8 };
            int[] b = { 1, 1, 1, 1, 1, 1, 1, 1 };
            int[] c = new int[a.Length];

            Add(a, b, c);
        }

        /// <summary>
        /// Writes <c>a[i] + b[i]</c> into <c>c[i]</c> for every index of <paramref name="a"/>.
        /// </summary>
        /// <param name="a">First operand; its length determines how many elements are processed.</param>
        /// <param name="b">Second operand; must hold at least <c>a.Length</c> elements.</param>
        /// <param name="c">Destination; must hold at least <c>a.Length</c> elements.</param>
        internal static void Add(int[] a, int[] b, int[] c)
        {
            // The lane count depends on the hardware, so it must not be
            // assumed to divide the array length evenly.
            int lanes = Vector<int>.Count;
            int i = 0;

            // Full vectors only: stop while at least `lanes` elements remain,
            // otherwise the Vector<int>(array, index) constructor would read
            // past the end of the array and throw.
            for (; i <= a.Length - lanes; i += lanes)
            {
                var va = new Vector<int>(a, i);
                var vb = new Vector<int>(b, i);
                (va + vb).CopyTo(c, i);
            }

            // Scalar remainder for the elements that do not fill a whole vector.
            for (; i < a.Length; i++)
            {
                // unchecked: wrap on overflow, same as the vector addition above.
                c[i] = unchecked(a[i] + b[i]);
            }
        }
    }
}
#include <stdio.h>
#include <xmmintrin.h>  // SSE header
#include <immintrin.h>  // AVX and AVX-512 header
 
#define SIZE 16
 
int main() {
    // Define two arrays of 16 floats each
    float a[SIZE] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
    float b[SIZE] = {16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
    float result[SIZE];
 
    // Load 16 floats from each array into SIMD registers
    __m512 a16 = _mm512_loadu_ps(&a[0]);
    __m512 b16 = _mm512_loadu_ps(&b[0]);
 
	// Load 4 floats from each array into SIMD registers
	// __m128 a4 = _mm_loadu_ps(&a[i]);
	// __m128 b4 = _mm_loadu_ps(&b[i]);
 
    // Perform vectorized addition
    __m512 result16 = _mm512_add_ps(a16, b16);
	// __m128 result4 = _mm_add_ps(a4, b4);
 
 
    // Store the result back into the result array
    _mm512_storeu_ps(&result[0], result16);
    // _mm_storeu_ps(&result[i], result4);
 
    // Print the result
    printf("Result: ");
    for (int i = 0; i < SIZE; i++) {
        printf("%f ", result[i]);
    }
    printf("\n");
 
    return 0;
}