A kind of parallelism: a single instruction operates on multiple data elements simultaneously (data-level parallelism, as opposed to instruction-level parallelism). It requires an efficient data layout, which must be arranged manually.

Glossary

Scalar operation: a computation that operates on individual data elements.
Vector operation: a computation that operates on multiple data elements simultaneously.

Example

```csharp
// build/run in Release mode, e.g. dotnet run -c Release
using System.Numerics;

namespace test
{
    class P
    {
        static void Main(string[] args)
        {
            var lanes = Vector<int>.Count; // number of int lanes per SIMD register
            int[] a = { 1, 2, 3, 4, 5, 6, 7, 8 };
            int[] b = { 1, 1, 1, 1, 1, 1, 1, 1 };
            int[] c = new int[a.Length];

            // Assumes a.Length is a multiple of lanes.
            for (int i = 0; i < a.Length; i += lanes)
            {
                var a8 = new Vector<int>(a, i);
                var b8 = new Vector<int>(b, i);
                (a8 + b8).CopyTo(c, i);
            }
        }
    }
}
```

```c
#include <stdio.h>
#include <xmmintrin.h>  // SSE header
#include <immintrin.h>  // AVX and AVX-512 header

#define SIZE 16

int main()
{
    // Define two arrays of 16 floats each
    float a[SIZE] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                     9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
    float b[SIZE] = {16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f,
                     8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
    float result[SIZE];

    // Load 16 floats from each array into 512-bit SIMD registers
    __m512 a16 = _mm512_loadu_ps(&a[0]);
    __m512 b16 = _mm512_loadu_ps(&b[0]);

    // SSE alternative: load 4 floats at a time inside a loop
    // __m128 a4 = _mm_loadu_ps(&a[i]);
    // __m128 b4 = _mm_loadu_ps(&b[i]);

    // Perform vectorized addition
    __m512 result16 = _mm512_add_ps(a16, b16);
    // __m128 result4 = _mm_add_ps(a4, b4);

    // Store the result back into the result array
    _mm512_storeu_ps(&result[0], result16);
    // _mm_storeu_ps(&result[i], result4);

    // Print the result
    printf("Result: ");
    for (int i = 0; i < SIZE; i++)
    {
        printf("%f ", result[i]);
    }
    printf("\n");

    return 0;
}
```

The commented-out `_mm_*_ps` lines correspond to an SSE version that processes 4 floats per loop iteration; a sketch of that variant follows the links below.

Links

- Practical SIMD Programming
- assembly - Why doesn't gcc resolve _mm256_loadu_pd as single vmovupd? - Stack Overflow
- AVX512 SSE SIMD Example Performance
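The SSE variant hinted at by the comments in the C example would walk the arrays in a loop, 4 floats per iteration, instead of one 16-float AVX-512 operation. A minimal sketch, assuming SIZE is a multiple of 4 (array names and contents mirror the example above):

```c
#include <stdio.h>
#include <xmmintrin.h> // SSE intrinsics: __m128, _mm_loadu_ps, _mm_add_ps, _mm_storeu_ps

#define SIZE 16

int main()
{
    float a[SIZE], b[SIZE], result[SIZE];
    // Fill a with 1..16 and b with 16..1, matching the AVX-512 example
    for (int i = 0; i < SIZE; i++) {
        a[i] = (float)(i + 1);
        b[i] = (float)(SIZE - i);
    }

    // Process 4 floats per iteration; assumes SIZE is a multiple of 4
    for (int i = 0; i < SIZE; i += 4)
    {
        __m128 a4 = _mm_loadu_ps(&a[i]);  // load 4 floats from a
        __m128 b4 = _mm_loadu_ps(&b[i]);  // load 4 floats from b
        __m128 r4 = _mm_add_ps(a4, b4);   // lane-wise addition
        _mm_storeu_ps(&result[i], r4);    // store 4 results
    }

    printf("Result: ");
    for (int i = 0; i < SIZE; i++)
    {
        printf("%f ", result[i]);
    }
    printf("\n");

    return 0;
}
```

Unlike the AVX-512 version, this only needs SSE support; with gcc, the AVX-512 example has to be built with an option such as -mavx512f, whereas SSE is available by default on x86-64.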