- A kind of parallelism at the level of a single instruction: one instruction processes several data elements at once (data-level parallelism).
It requires an efficient data layout, which the programmer must arrange manually.
Glossary
- Scalar Operation: a computation that operates on one data element at a time.
- Vector Operation: a computation that operates on multiple data elements simultaneously.
Example
// Compile with `dotnet build -c Release` so that the optimized (Release) configuration is used.
using System.Numerics;
namespace test
{
    /// <summary>
    /// Minimal demonstration of SIMD addition using <see cref="Vector{T}"/>.
    /// </summary>
    class P
    {
        /// <summary>
        /// Adds two small sample arrays element-wise and prints the sums.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            int[] a = { 1, 2, 3, 4, 5, 6, 7, 8 };
            int[] b = { 1, 1, 1, 1, 1, 1, 1, 1 };

            int[] c = Add(a, b);

            // Show the result so the example has an observable outcome.
            System.Console.WriteLine(string.Join(", ", c));
        }

        /// <summary>
        /// Computes the element-wise sum of two equally sized arrays, processing
        /// <c>Vector&lt;int&gt;.Count</c> elements per step and finishing any
        /// leftover elements one at a time.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand; must have the same length as <paramref name="left"/>.</param>
        /// <returns>A new array where element <c>i</c> is <c>left[i] + right[i]</c> (wrapping on overflow).</returns>
        /// <exception cref="System.ArgumentNullException">Either argument is <c>null</c>.</exception>
        /// <exception cref="System.ArgumentException">The arrays differ in length.</exception>
        internal static int[] Add(int[] left, int[] right)
        {
            if (left == null)
            {
                throw new System.ArgumentNullException(nameof(left));
            }

            if (right == null)
            {
                throw new System.ArgumentNullException(nameof(right));
            }

            if (left.Length != right.Length)
            {
                throw new System.ArgumentException("Both arrays must have the same length.", nameof(right));
            }

            // The lane count depends on the hardware and runtime configuration,
            // so it must never be assumed to divide the array length evenly.
            int lanes = Vector<int>.Count;
            int[] sum = new int[left.Length];
            int i = 0;

            // Vectorized part: only while a full vector still fits in the array.
            // Constructing a Vector<int> from fewer than `lanes` remaining
            // elements would throw.
            for (; i <= left.Length - lanes; i += lanes)
            {
                var x = new Vector<int>(left, i);
                var y = new Vector<int>(right, i);
                (x + y).CopyTo(sum, i);
            }

            // Scalar tail: the remaining (length % lanes) elements.
            // `unchecked` keeps the same wrap-around behavior as the vector add.
            for (; i < left.Length; i++)
            {
                sum[i] = unchecked(left[i] + right[i]);
            }

            return sum;
        }
    }
}
#include <stdio.h>
#include <xmmintrin.h> // SSE header
#include <immintrin.h> // AVX and AVX-512 header
#define SIZE 16
/*
 * Adds two 16-element float arrays with one 512-bit vector addition and
 * prints the 16 sums on a single line.
 *
 * The same computation can be done with 128-bit SSE registers instead:
 * loop over the arrays four floats at a time using __m128 together with
 * _mm_loadu_ps, _mm_add_ps and _mm_storeu_ps.
 */
int main() {
    float lhs[SIZE] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
    float rhs[SIZE] = {16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
    float sums[SIZE];

    /* Unaligned loads: each register receives all 16 floats of one array. */
    const __m512 lhs_vec = _mm512_loadu_ps(lhs);
    const __m512 rhs_vec = _mm512_loadu_ps(rhs);

    /* One instruction adds all 16 lanes; the result goes straight back to memory. */
    _mm512_storeu_ps(sums, _mm512_add_ps(lhs_vec, rhs_vec));

    /* Print every sum, separated by spaces. */
    printf("Result: ");
    for (const float *p = sums; p != sums + SIZE; ++p) {
        printf("%f ", *p);
    }
    printf("\n");

    return 0;
}