How do I achieve the theoretical maximum of 4 FLOPs per cycle?

84,576

Solution 1

I've done this exact task before. But it was mainly to measure power consumption and CPU temperatures. The following code (which is fairly long) achieves close to optimal on my Core i7 2600K.

The key thing to note here is the massive amount of manual loop-unrolling as well as interleaving of multiplies and adds...

The full project can be found on my GitHub: https://github.com/Mysticial/Flops

Warning:

If you decide to compile and run this, pay attention to your CPU temperatures!!!
Make sure you don't overheat it. And make sure CPU-throttling doesn't affect your results!

Furthermore, I take no responsibility for whatever damage that may result from running this code.

Notes:

  • This code is optimized for x64. x86 doesn't have enough registers for this to compile well.
  • This code has been tested to work well on Visual Studio 2010/2012 and GCC 4.6.
    ICC 11 (Intel Compiler 11) surprisingly has trouble compiling it well.
  • These are for pre-FMA processors. In order to achieve peak FLOPS on Intel Haswell and AMD Bulldozer processors (and later), FMA (Fused Multiply Add) instructions will be needed. These are beyond the scope of this benchmark.

#include <emmintrin.h>
#include <omp.h>
#include <iostream>
using namespace std;

typedef unsigned long long uint64;

double test_dp_mac_SSE(double x,double y,uint64 iterations){
    //  Single-threaded SSE2 throughput kernel.  Executes iterations * 1000
    //  passes of a 48-instruction sequence of packed-double mul/add/sub
    //  operations over 12 independent accumulators (r0-rB) and 4 constant
    //  operands (rC-rF).  The accumulators are mutually independent so
    //  instruction latency is hidden and the FP units stay saturated.
    //
    //  x, y       - seed values used to derive the 12 starting accumulators
    //  iterations - outer-loop count (each outer pass = 1000 inner passes)
    //  returns    - horizontal sum of all accumulators; the value is
    //               meaningless and exists only to defeat dead-code elimination
    //
    //  NOTE(review): 'register' is deprecated in C++11 and removed in C++17;
    //  modern compilers ignore it.
    register __m128d r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;

    //  Generate starting data.
    r0 = _mm_set1_pd(x);
    r1 = _mm_set1_pd(y);

    r8 = _mm_set1_pd(-0.0);

    //  Derive 12 distinct non-trivial starting values so the execution units
    //  cannot take special-case shortcuts (e.g. for zeros) that would skew
    //  power/heat measurements.
    r2 = _mm_xor_pd(r0,r8);
    r3 = _mm_or_pd(r0,r8);
    r4 = _mm_andnot_pd(r8,r0);
    r5 = _mm_mul_pd(r1,_mm_set1_pd(0.37796447300922722721));
    r6 = _mm_mul_pd(r1,_mm_set1_pd(0.24253562503633297352));
    r7 = _mm_mul_pd(r1,_mm_set1_pd(4.1231056256176605498));
    r8 = _mm_add_pd(r0,_mm_set1_pd(0.37796447300922722721));
    r9 = _mm_add_pd(r1,_mm_set1_pd(0.24253562503633297352));
    rA = _mm_sub_pd(r0,_mm_set1_pd(4.1231056256176605498));
    rB = _mm_sub_pd(r1,_mm_set1_pd(4.1231056256176605498));

    //  Constant multipliers/addends (sqrt(2), sqrt(3), 1/sqrt(3), 1/sqrt(2)) -
    //  chosen so repeated mul/add neither converges nor diverges too fast.
    rC = _mm_set1_pd(1.4142135623730950488);
    rD = _mm_set1_pd(1.7320508075688772935);
    rE = _mm_set1_pd(0.57735026918962576451);
    rF = _mm_set1_pd(0.70710678118654752440);

    //  Mask keeps the sign and mantissa bits and clears most of the exponent;
    //  OR-ing with 1.0 afterwards forces each value back near [1.0, 2.0).
    //  NOTE(review): *(double*)&iMASK type-puns through a pointer cast, which
    //  is UB under strict aliasing; memcpy or _mm_castsi128_pd would be safer,
    //  though this compiles correctly on the compilers tested here.
    uint64 iMASK = 0x800fffffffffffffull;
    __m128d MASK = _mm_set1_pd(*(double*)&iMASK);
    __m128d vONE = _mm_set1_pd(1.0);

    uint64 c = 0;
    while (c < iterations){
        size_t i = 0;
        while (i < 1000){
            //  Here's the meat - the part that really matters.
            //  4 blocks x 12 instructions; each block is independent so the
            //  multiplies (5-cycle latency) never stall the pipeline.

            r0 = _mm_mul_pd(r0,rC);
            r1 = _mm_add_pd(r1,rD);
            r2 = _mm_mul_pd(r2,rE);
            r3 = _mm_sub_pd(r3,rF);
            r4 = _mm_mul_pd(r4,rC);
            r5 = _mm_add_pd(r5,rD);
            r6 = _mm_mul_pd(r6,rE);
            r7 = _mm_sub_pd(r7,rF);
            r8 = _mm_mul_pd(r8,rC);
            r9 = _mm_add_pd(r9,rD);
            rA = _mm_mul_pd(rA,rE);
            rB = _mm_sub_pd(rB,rF);

            r0 = _mm_add_pd(r0,rF);
            r1 = _mm_mul_pd(r1,rE);
            r2 = _mm_sub_pd(r2,rD);
            r3 = _mm_mul_pd(r3,rC);
            r4 = _mm_add_pd(r4,rF);
            r5 = _mm_mul_pd(r5,rE);
            r6 = _mm_sub_pd(r6,rD);
            r7 = _mm_mul_pd(r7,rC);
            r8 = _mm_add_pd(r8,rF);
            r9 = _mm_mul_pd(r9,rE);
            rA = _mm_sub_pd(rA,rD);
            rB = _mm_mul_pd(rB,rC);

            r0 = _mm_mul_pd(r0,rC);
            r1 = _mm_add_pd(r1,rD);
            r2 = _mm_mul_pd(r2,rE);
            r3 = _mm_sub_pd(r3,rF);
            r4 = _mm_mul_pd(r4,rC);
            r5 = _mm_add_pd(r5,rD);
            r6 = _mm_mul_pd(r6,rE);
            r7 = _mm_sub_pd(r7,rF);
            r8 = _mm_mul_pd(r8,rC);
            r9 = _mm_add_pd(r9,rD);
            rA = _mm_mul_pd(rA,rE);
            rB = _mm_sub_pd(rB,rF);

            r0 = _mm_add_pd(r0,rF);
            r1 = _mm_mul_pd(r1,rE);
            r2 = _mm_sub_pd(r2,rD);
            r3 = _mm_mul_pd(r3,rC);
            r4 = _mm_add_pd(r4,rF);
            r5 = _mm_mul_pd(r5,rE);
            r6 = _mm_sub_pd(r6,rD);
            r7 = _mm_mul_pd(r7,rC);
            r8 = _mm_add_pd(r8,rF);
            r9 = _mm_mul_pd(r9,rE);
            rA = _mm_sub_pd(rA,rD);
            rB = _mm_mul_pd(rB,rC);

            i++;
        }

        //  Need to renormalize to prevent denormal/overflow.
        //  (The do-nothing ops slowly grow/shrink magnitudes; clamping the
        //  exponent every 1000 passes keeps the data "real" without denormals.)
        r0 = _mm_and_pd(r0,MASK);
        r1 = _mm_and_pd(r1,MASK);
        r2 = _mm_and_pd(r2,MASK);
        r3 = _mm_and_pd(r3,MASK);
        r4 = _mm_and_pd(r4,MASK);
        r5 = _mm_and_pd(r5,MASK);
        r6 = _mm_and_pd(r6,MASK);
        r7 = _mm_and_pd(r7,MASK);
        r8 = _mm_and_pd(r8,MASK);
        r9 = _mm_and_pd(r9,MASK);
        rA = _mm_and_pd(rA,MASK);
        rB = _mm_and_pd(rB,MASK);
        r0 = _mm_or_pd(r0,vONE);
        r1 = _mm_or_pd(r1,vONE);
        r2 = _mm_or_pd(r2,vONE);
        r3 = _mm_or_pd(r3,vONE);
        r4 = _mm_or_pd(r4,vONE);
        r5 = _mm_or_pd(r5,vONE);
        r6 = _mm_or_pd(r6,vONE);
        r7 = _mm_or_pd(r7,vONE);
        r8 = _mm_or_pd(r8,vONE);
        r9 = _mm_or_pd(r9,vONE);
        rA = _mm_or_pd(rA,vONE);
        rB = _mm_or_pd(rB,vONE);

        c++;
    }

    //  Tree-reduce the 12 accumulators into r0.
    r0 = _mm_add_pd(r0,r1);
    r2 = _mm_add_pd(r2,r3);
    r4 = _mm_add_pd(r4,r5);
    r6 = _mm_add_pd(r6,r7);
    r8 = _mm_add_pd(r8,r9);
    rA = _mm_add_pd(rA,rB);

    r0 = _mm_add_pd(r0,r2);
    r4 = _mm_add_pd(r4,r6);
    r8 = _mm_add_pd(r8,rA);

    r0 = _mm_add_pd(r0,r4);
    r0 = _mm_add_pd(r0,r8);


    //  Prevent Dead Code Elimination
    double out = 0;
    __m128d temp = r0;
    out += ((double*)&temp)[0];
    out += ((double*)&temp)[1];

    return out;
}

//  Multi-threaded driver for the SSE kernel.
//
//  tds        - number of OpenMP threads to launch
//  iterations - outer-loop count handed to every worker thread
//
//  Spawns 'tds' threads each running test_dp_mac_SSE(), then prints elapsed
//  wall time, total FP operation count, achieved FLOP/s, and the (meaningless)
//  sum that keeps the compiler from eliminating the benchmark work.
void test_dp_mac_SSE(int tds,uint64 iterations){

    //  Fix: the original used malloc()/free() even though no program in this
    //  snippet includes <cstdlib>, and never checked the result.  new[] is
    //  declared implicitly in C++ and reports failure by throwing.
    double *sum = new double[tds];
    double start = omp_get_wtime();

#pragma omp parallel num_threads(tds)
    {
        //  Each thread stores its result in its own slot - no synchronization
        //  needed beyond the implicit barrier at the end of the parallel region.
        double ret = test_dp_mac_SSE(1.1,2.1,iterations);
        sum[omp_get_thread_num()] = ret;
    }

    double secs = omp_get_wtime() - start;

    //  48 FP instructions per inner pass * 1000 passes * iterations * threads;
    //  each 128-bit SSE instruction operates on 2 doubles.
    uint64 ops = 48 * 1000 * iterations * tds * 2;
    cout << "Seconds = " << secs << endl;
    cout << "FP Ops  = " << ops << endl;
    cout << "FLOPs   = " << ops / secs << endl;

    //  Accumulate the per-thread results so the kernel calls cannot be
    //  optimized away.
    double out = 0;
    for (int c = 0; c < tds; c++){
        out += sum[c];
    }

    cout << "sum = " << out << endl;
    cout << endl;

    delete[] sum;
}

int main(){
    //  Run the SSE benchmark: (threads, iterations).
    test_dp_mac_SSE(8,10000000);

    //  Windows-only convenience: keep the console window open.
    system("pause");
    return 0;
}

Output (1 thread, 10000000 iterations) - Compiled with Visual Studio 2010 SP1 - x64 Release:

Seconds = 55.5104
FP Ops  = 960000000000
FLOPs   = 1.7294e+010
sum = 2.22652

The machine is a Core i7 2600K @ 4.4 GHz. Theoretical SSE peak is 4 flops * 4.4 GHz = 17.6 GFlops. This code achieves 17.3 GFlops - not bad.

Output (8 threads, 10000000 iterations) - Compiled with Visual Studio 2010 SP1 - x64 Release:

Seconds = 117.202
FP Ops  = 7680000000000
FLOPs   = 6.55279e+010
sum = 17.8122

Theoretical SSE peak is 4 flops * 4 cores * 4.4 GHz = 70.4 GFlops. Actual is 65.5 GFlops.


Let's take this one step further. AVX...

#include <immintrin.h>
#include <omp.h>
#include <iostream>
using namespace std;

typedef unsigned long long uint64;

double test_dp_mac_AVX(double x,double y,uint64 iterations){
    //  Single-threaded AVX throughput kernel - identical structure to
    //  test_dp_mac_SSE() but on 256-bit registers (4 doubles per instruction).
    //  Executes iterations * 1000 passes of a 48-instruction mul/add/sub
    //  sequence over 12 independent accumulators (r0-rB) and 4 constants
    //  (rC-rF).
    //
    //  x, y       - seed values used to derive the 12 starting accumulators
    //  iterations - outer-loop count (each outer pass = 1000 inner passes)
    //  returns    - horizontal sum of all accumulators (dead-code guard only)
    //
    //  NOTE(review): 'register' is deprecated in C++11 and removed in C++17.
    register __m256d r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;

    //  Generate starting data.
    r0 = _mm256_set1_pd(x);
    r1 = _mm256_set1_pd(y);

    r8 = _mm256_set1_pd(-0.0);

    //  Derive 12 distinct non-trivial starting values so the execution units
    //  cannot take special-case shortcuts on trivial operands.
    r2 = _mm256_xor_pd(r0,r8);
    r3 = _mm256_or_pd(r0,r8);
    r4 = _mm256_andnot_pd(r8,r0);
    r5 = _mm256_mul_pd(r1,_mm256_set1_pd(0.37796447300922722721));
    r6 = _mm256_mul_pd(r1,_mm256_set1_pd(0.24253562503633297352));
    r7 = _mm256_mul_pd(r1,_mm256_set1_pd(4.1231056256176605498));
    r8 = _mm256_add_pd(r0,_mm256_set1_pd(0.37796447300922722721));
    r9 = _mm256_add_pd(r1,_mm256_set1_pd(0.24253562503633297352));
    rA = _mm256_sub_pd(r0,_mm256_set1_pd(4.1231056256176605498));
    rB = _mm256_sub_pd(r1,_mm256_set1_pd(4.1231056256176605498));

    //  Constant operands (sqrt(2), sqrt(3), 1/sqrt(3), 1/sqrt(2)).
    rC = _mm256_set1_pd(1.4142135623730950488);
    rD = _mm256_set1_pd(1.7320508075688772935);
    rE = _mm256_set1_pd(0.57735026918962576451);
    rF = _mm256_set1_pd(0.70710678118654752440);

    //  Exponent-clamping mask; see the SSE version for details.
    //  NOTE(review): *(double*)&iMASK is a strict-aliasing type pun (UB) -
    //  works on the compilers tested, but memcpy would be safer.
    uint64 iMASK = 0x800fffffffffffffull;
    __m256d MASK = _mm256_set1_pd(*(double*)&iMASK);
    __m256d vONE = _mm256_set1_pd(1.0);

    uint64 c = 0;
    while (c < iterations){
        size_t i = 0;
        while (i < 1000){
            //  Here's the meat - the part that really matters.
            //  4 blocks x 12 independent instructions to hide mul latency.

            r0 = _mm256_mul_pd(r0,rC);
            r1 = _mm256_add_pd(r1,rD);
            r2 = _mm256_mul_pd(r2,rE);
            r3 = _mm256_sub_pd(r3,rF);
            r4 = _mm256_mul_pd(r4,rC);
            r5 = _mm256_add_pd(r5,rD);
            r6 = _mm256_mul_pd(r6,rE);
            r7 = _mm256_sub_pd(r7,rF);
            r8 = _mm256_mul_pd(r8,rC);
            r9 = _mm256_add_pd(r9,rD);
            rA = _mm256_mul_pd(rA,rE);
            rB = _mm256_sub_pd(rB,rF);

            r0 = _mm256_add_pd(r0,rF);
            r1 = _mm256_mul_pd(r1,rE);
            r2 = _mm256_sub_pd(r2,rD);
            r3 = _mm256_mul_pd(r3,rC);
            r4 = _mm256_add_pd(r4,rF);
            r5 = _mm256_mul_pd(r5,rE);
            r6 = _mm256_sub_pd(r6,rD);
            r7 = _mm256_mul_pd(r7,rC);
            r8 = _mm256_add_pd(r8,rF);
            r9 = _mm256_mul_pd(r9,rE);
            rA = _mm256_sub_pd(rA,rD);
            rB = _mm256_mul_pd(rB,rC);

            r0 = _mm256_mul_pd(r0,rC);
            r1 = _mm256_add_pd(r1,rD);
            r2 = _mm256_mul_pd(r2,rE);
            r3 = _mm256_sub_pd(r3,rF);
            r4 = _mm256_mul_pd(r4,rC);
            r5 = _mm256_add_pd(r5,rD);
            r6 = _mm256_mul_pd(r6,rE);
            r7 = _mm256_sub_pd(r7,rF);
            r8 = _mm256_mul_pd(r8,rC);
            r9 = _mm256_add_pd(r9,rD);
            rA = _mm256_mul_pd(rA,rE);
            rB = _mm256_sub_pd(rB,rF);

            r0 = _mm256_add_pd(r0,rF);
            r1 = _mm256_mul_pd(r1,rE);
            r2 = _mm256_sub_pd(r2,rD);
            r3 = _mm256_mul_pd(r3,rC);
            r4 = _mm256_add_pd(r4,rF);
            r5 = _mm256_mul_pd(r5,rE);
            r6 = _mm256_sub_pd(r6,rD);
            r7 = _mm256_mul_pd(r7,rC);
            r8 = _mm256_add_pd(r8,rF);
            r9 = _mm256_mul_pd(r9,rE);
            rA = _mm256_sub_pd(rA,rD);
            rB = _mm256_mul_pd(rB,rC);

            i++;
        }

        //  Need to renormalize to prevent denormal/overflow.
        r0 = _mm256_and_pd(r0,MASK);
        r1 = _mm256_and_pd(r1,MASK);
        r2 = _mm256_and_pd(r2,MASK);
        r3 = _mm256_and_pd(r3,MASK);
        r4 = _mm256_and_pd(r4,MASK);
        r5 = _mm256_and_pd(r5,MASK);
        r6 = _mm256_and_pd(r6,MASK);
        r7 = _mm256_and_pd(r7,MASK);
        r8 = _mm256_and_pd(r8,MASK);
        r9 = _mm256_and_pd(r9,MASK);
        rA = _mm256_and_pd(rA,MASK);
        rB = _mm256_and_pd(rB,MASK);
        r0 = _mm256_or_pd(r0,vONE);
        r1 = _mm256_or_pd(r1,vONE);
        r2 = _mm256_or_pd(r2,vONE);
        r3 = _mm256_or_pd(r3,vONE);
        r4 = _mm256_or_pd(r4,vONE);
        r5 = _mm256_or_pd(r5,vONE);
        r6 = _mm256_or_pd(r6,vONE);
        r7 = _mm256_or_pd(r7,vONE);
        r8 = _mm256_or_pd(r8,vONE);
        r9 = _mm256_or_pd(r9,vONE);
        rA = _mm256_or_pd(rA,vONE);
        rB = _mm256_or_pd(rB,vONE);

        c++;
    }

    //  Tree-reduce the 12 accumulators into r0.
    r0 = _mm256_add_pd(r0,r1);
    r2 = _mm256_add_pd(r2,r3);
    r4 = _mm256_add_pd(r4,r5);
    r6 = _mm256_add_pd(r6,r7);
    r8 = _mm256_add_pd(r8,r9);
    rA = _mm256_add_pd(rA,rB);

    r0 = _mm256_add_pd(r0,r2);
    r4 = _mm256_add_pd(r4,r6);
    r8 = _mm256_add_pd(r8,rA);

    r0 = _mm256_add_pd(r0,r4);
    r0 = _mm256_add_pd(r0,r8);

    //  Prevent Dead Code Elimination
    double out = 0;
    __m256d temp = r0;
    out += ((double*)&temp)[0];
    out += ((double*)&temp)[1];
    out += ((double*)&temp)[2];
    out += ((double*)&temp)[3];

    return out;
}

//  Multi-threaded driver for the AVX kernel.
//
//  tds        - number of OpenMP threads to launch
//  iterations - outer-loop count handed to every worker thread
//
//  Spawns 'tds' threads each running test_dp_mac_AVX(), then prints elapsed
//  wall time, total FP operation count, achieved FLOP/s, and the (meaningless)
//  sum that keeps the compiler from eliminating the benchmark work.
void test_dp_mac_AVX(int tds,uint64 iterations){

    //  Fix: the original used malloc()/free() even though no program in this
    //  snippet includes <cstdlib>, and never checked the result.  new[] is
    //  declared implicitly in C++ and reports failure by throwing.
    double *sum = new double[tds];
    double start = omp_get_wtime();

#pragma omp parallel num_threads(tds)
    {
        //  Each thread stores its result in its own slot - no synchronization
        //  needed beyond the implicit barrier at the end of the parallel region.
        double ret = test_dp_mac_AVX(1.1,2.1,iterations);
        sum[omp_get_thread_num()] = ret;
    }

    double secs = omp_get_wtime() - start;

    //  48 FP instructions per inner pass * 1000 passes * iterations * threads;
    //  each 256-bit AVX instruction operates on 4 doubles.
    uint64 ops = 48 * 1000 * iterations * tds * 4;
    cout << "Seconds = " << secs << endl;
    cout << "FP Ops  = " << ops << endl;
    cout << "FLOPs   = " << ops / secs << endl;

    //  Accumulate the per-thread results so the kernel calls cannot be
    //  optimized away.
    double out = 0;
    for (int c = 0; c < tds; c++){
        out += sum[c];
    }

    cout << "sum = " << out << endl;
    cout << endl;

    delete[] sum;
}

int main(){
    //  Run the AVX benchmark: (threads, iterations).
    test_dp_mac_AVX(8,10000000);

    //  Windows-only convenience: keep the console window open.
    system("pause");
    return 0;
}

Output (1 thread, 10000000 iterations) - Compiled with Visual Studio 2010 SP1 - x64 Release:

Seconds = 57.4679
FP Ops  = 1920000000000
FLOPs   = 3.34099e+010
sum = 4.45305

Theoretical AVX peak is 8 flops * 4.4 GHz = 35.2 GFlops. Actual is 33.4 GFlops.

Output (8 threads, 10000000 iterations) - Compiled with Visual Studio 2010 SP1 - x64 Release:

Seconds = 111.119
FP Ops  = 15360000000000
FLOPs   = 1.3823e+011
sum = 35.6244

Theoretical AVX peak is 8 flops * 4 cores * 4.4 GHz = 140.8 GFlops. Actual is 138.2 GFlops.


Now for some explanations:

The performance critical part is obviously the 48 instructions inside the inner loop. You'll notice that it's broken into 4 blocks of 12 instructions each. Each of these 12 instructions blocks are completely independent from each other - and take on average 6 cycles to execute.

So there's 12 instructions and 6 cycles between issue-to-use. The latency of multiplication is 5 cycles, so it's just enough to avoid latency stalls.

The normalization step is needed to keep the data from over/underflowing. This is needed since the do-nothing code will slowly increase/decrease the magnitude of the data.

So it's actually possible to do better than this if you just use all zeros and get rid of the normalization step. However, since I wrote the benchmark to measure power consumption and temperature, I had to make sure the flops were on "real" data, rather than zeros - as the execution units may very well have special case-handling for zeros that use less power and produce less heat.


More Results:

  • Intel Core i7 920 @ 3.5 GHz
  • Windows 7 Ultimate x64
  • Visual Studio 2010 SP1 - x64 Release

Threads: 1

Seconds = 72.1116
FP Ops  = 960000000000
FLOPs   = 1.33127e+010
sum = 2.22652

Theoretical SSE Peak: 4 flops * 3.5 GHz = 14.0 GFlops. Actual is 13.3 GFlops.

Threads: 8

Seconds = 149.576
FP Ops  = 7680000000000
FLOPs   = 5.13452e+010
sum = 17.8122

Theoretical SSE Peak: 4 flops * 4 cores * 3.5 GHz = 56.0 GFlops. Actual is 51.3 GFlops.

My processor temps hit 76C on the multi-threaded run! If you run these, be sure the results aren't affected by CPU throttling.


  • 2 x Intel Xeon X5482 Harpertown @ 3.2 GHz
  • Ubuntu Linux 10 x64
  • GCC 4.5.2 x64 - (-O2 -msse3 -fopenmp)

Threads: 1

Seconds = 78.3357
FP Ops  = 960000000000
FLOPs   = 1.22549e+10
sum = 2.22652

Theoretical SSE Peak: 4 flops * 3.2 GHz = 12.8 GFlops. Actual is 12.3 GFlops.

Threads: 8

Seconds = 78.4733
FP Ops  = 7680000000000
FLOPs   = 9.78676e+10
sum = 17.8122

Theoretical SSE Peak: 4 flops * 8 cores * 3.2 GHz = 102.4 GFlops. Actual is 97.9 GFlops.

Solution 2

There's a point in the Intel architecture that people often forget, the dispatch ports are shared between Int and FP/SIMD. This means that you will only get a certain amount of bursts of FP/SIMD before the loop logic will create bubbles in your floating point stream. Mysticial got more flops out of his code, because he used longer strides in his unrolled loop.

If you look at the Nehalem/Sandy Bridge architecture here http://www.realworldtech.com/page.cfm?ArticleID=RWT091810191937&p=6 it's quite clear what happens.

In contrast, it should be easier to reach peak performance on AMD (Bulldozer) as the INT and FP/SIMD pipes have separate issue ports with their own scheduler.

This is only theoretical as I have neither of these processors to test.

Solution 3

Branches can definitely keep you from sustaining peak theoretical performance. Do you see a difference if you manually do some loop-unrolling? For example, if you put 5 or 10 times as many ops per loop iteration:

for(int i=0; i<loops/5; i++) {
      mul1*=mul; mul2*=mul; mul3*=mul; mul4*=mul; mul5*=mul;
      sum1+=add; sum2+=add; sum3+=add; sum4+=add; sum5+=add;
      mul1*=mul; mul2*=mul; mul3*=mul; mul4*=mul; mul5*=mul;
      sum1+=add; sum2+=add; sum3+=add; sum4+=add; sum5+=add;
      mul1*=mul; mul2*=mul; mul3*=mul; mul4*=mul; mul5*=mul;
      sum1+=add; sum2+=add; sum3+=add; sum4+=add; sum5+=add;
      mul1*=mul; mul2*=mul; mul3*=mul; mul4*=mul; mul5*=mul;
      sum1+=add; sum2+=add; sum3+=add; sum4+=add; sum5+=add;
      mul1*=mul; mul2*=mul; mul3*=mul; mul4*=mul; mul5*=mul;
      sum1+=add; sum2+=add; sum3+=add; sum4+=add; sum5+=add;
   }

Solution 4

Using Intels icc Version 11.1 on a 2.4GHz Intel Core 2 Duo I get

Macintosh:~ mackie$ icc -O3 -mssse3 -oaddmul addmul.cc && ./addmul 1000
addmul:  0.105 s, 9.525 Gflops, res=0.000000
Macintosh:~ mackie$ icc -v
Version 11.1 

That is very close to the ideal 9.6 Gflops.

EDIT:

Oops, looking at the assembly code it seems that icc not only vectorized the multiplication, but also pulled the additions out of the loop. Forcing a stricter fp semantics the code is no longer vectorized:

Macintosh:~ mackie$ icc -O3 -mssse3 -oaddmul addmul.cc -fp-model precise && ./addmul 1000
addmul:  0.516 s, 1.938 Gflops, res=1.326463

EDIT2:

As requested:

Macintosh:~ mackie$ clang -O3 -mssse3 -oaddmul addmul.cc && ./addmul 1000
addmul:  0.209 s, 4.786 Gflops, res=1.326463
Macintosh:~ mackie$ clang -v
Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)
Target: x86_64-apple-darwin11.2.0
Thread model: posix

The inner loop of clang's code looks like this:

        .align  4, 0x90
LBB2_4:                                 ## =>This Inner Loop Header: Depth=1
        addsd   %xmm2, %xmm3
        addsd   %xmm2, %xmm14
        addsd   %xmm2, %xmm5
        addsd   %xmm2, %xmm1
        addsd   %xmm2, %xmm4
        mulsd   %xmm2, %xmm0
        mulsd   %xmm2, %xmm6
        mulsd   %xmm2, %xmm7
        mulsd   %xmm2, %xmm11
        mulsd   %xmm2, %xmm13
        incl    %eax
        cmpl    %r14d, %eax
        jl      LBB2_4

EDIT3:

Finally, two suggestions: First, if you like this type of benchmarking, consider using the rdtsc instruction instead of gettimeofday(2). It is much more accurate and delivers the time in cycles, which is usually what you are interested in anyway. For gcc and friends you can define it like this:

#include <stdint.h>

/*
 * Read the CPU's timestamp counter.
 *
 * Fix: the original used the "=A" constraint, which only means the edx:eax
 * register pair on 32-bit x86.  On x86-64, "=A" selects a single 64-bit
 * register, so the high 32 bits that rdtsc writes to edx were silently
 * discarded.  Reading eax and edx as separate outputs and combining them is
 * correct on both 32- and 64-bit targets.
 */
static __inline__ uint64_t rdtsc(void)
{
        uint32_t lo, hi;
        __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
        return ((uint64_t)hi << 32) | lo;
}

Second, you should run your benchmark program several times and use the best performance only. In modern operating systems many things happen in parallel, the cpu may be in a low frequency power saving mode, etc. Running the program repeatedly gives you a result that is closer to the ideal case.

Share:
84,576
user1059432
Author by

user1059432

Updated on July 08, 2022

Comments

  • user1059432
    user1059432 almost 2 years

    How can the theoretical peak performance of 4 floating point operations (double precision) per cycle be achieved on a modern x86-64 Intel CPU?

    As far as I understand it takes three cycles for an SSE add and five cycles for a mul to complete on most of the modern Intel CPUs (see for example Agner Fog's 'Instruction Tables' ). Due to pipelining, one can get a throughput of one add per cycle, if the algorithm has at least three independent summations. Since that is true for both packed addpd as well as the scalar addsd versions and SSE registers can contain two double's, the throughput can be as much as two flops per cycle.

    Furthermore, it seems (although I've not seen any proper documentation on this) add's and mul's can be executed in parallel giving a theoretical max throughput of four flops per cycle.

    However, I've not been able to replicate that performance with a simple C/C++ programme. My best attempt resulted in about 2.7 flops/cycle. If anyone can contribute a simple C/C++ or assembler programme which demonstrates peak performance, that'd be greatly appreciated.

    My attempt:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <sys/time.h>
    
    //  Return the current wall-clock time in seconds (gettimeofday-based,
    //  microsecond resolution).
    double stoptime(void) {
       struct timeval t;
       gettimeofday(&t,NULL);
       return (double) t.tv_sec + t.tv_usec/1000000.0;
    }
    
    //  Benchmark kernel: performs ~'ops' floating point operations as 5
    //  independent multiply chains and 5 independent add chains, so the CPU
    //  can overlap adds and muls.  Returns (result - analytically expected
    //  result), which should be ~0 - subtracting 'expected' keeps the compiler
    //  from constant-folding the loop away.
    double addmul(double add, double mul, int ops){
       // Need to initialise differently otherwise compiler might optimise away
       double sum1=0.1, sum2=-0.1, sum3=0.2, sum4=-0.2, sum5=0.0;
       double mul1=1.0, mul2= 1.1, mul3=1.2, mul4= 1.3, mul5=1.4;
       int loops=ops/10;          // We have 10 floating point operations inside the loop
       // Closed-form value of the loop below: each sum gains add*loops, each
       // mul factor is scaled by mul^loops.
       double expected = 5.0*add*loops + (sum1+sum2+sum3+sum4+sum5)
                   + pow(mul,loops)*(mul1+mul2+mul3+mul4+mul5);
    
       for (int i=0; i<loops; i++) {
          mul1*=mul; mul2*=mul; mul3*=mul; mul4*=mul; mul5*=mul;
          sum1+=add; sum2+=add; sum3+=add; sum4+=add; sum5+=add;
       }
       return  sum1+sum2+sum3+sum4+sum5+mul1+mul2+mul3+mul4+mul5 - expected;
    }
    
    //  Driver: runs addmul() with <num> million FP operations and reports
    //  elapsed time, achieved GFLOP/s, and the residual (should be ~0).
    int main(int argc, char** argv) {
       if (argc != 2) {
          printf("usage: %s <num>\n", argv[0]);
          printf("number of operations: <num> millions\n");
          exit(EXIT_FAILURE);
       }
       int n = atoi(argv[1]) * 1000000;
       if (n<=0)
           n=1000;
    
       // Seed values: an irrational multiplicand close to 1 so the multiply
       // chains neither overflow nor vanish over the run.
       double x = M_PI;
       double y = 1.0 + 1e-8;
       double t = stoptime();
       x = addmul(x, y, n);
       t = stoptime() - t;
       printf("addmul:\t %.3f s, %.3f Gflops, res=%f\n", t, (double)n/t/1e9, x);
       return EXIT_SUCCESS;
    }
    

    Compiled with:

    g++ -O2 -march=native addmul.cpp ; ./a.out 1000
    

    produces the following output on an Intel Core i5-750, 2.66 GHz:

    addmul:  0.270 s, 3.707 Gflops, res=1.326463
    

    That is, just about 1.4 flops per cycle. Looking at the assembler code with g++ -S -O2 -march=native -masm=intel addmul.cpp the main loop seems kind of optimal to me.

    .L4:
    inc    eax
    mulsd    xmm8, xmm3
    mulsd    xmm7, xmm3
    mulsd    xmm6, xmm3
    mulsd    xmm5, xmm3
    mulsd    xmm1, xmm3
    addsd    xmm13, xmm2
    addsd    xmm12, xmm2
    addsd    xmm11, xmm2
    addsd    xmm10, xmm2
    addsd    xmm9, xmm2
    cmp    eax, ebx
    jne    .L4
    

    Changing the scalar versions with packed versions (addpd and mulpd) would double the flop count without changing the execution time and so I'd get just short of 2.8 flops per cycle. Is there a simple example which achieves four flops per cycle?

    Nice little programme by Mysticial; here are my results (run just for a few seconds though):

    • gcc -O2 -march=nocona: 5.6 Gflops out of 10.66 Gflops (2.1 flops/cycle)
    • cl /O2, openmp removed: 10.1 Gflops out of 10.66 Gflops (3.8 flops/cycle)

    It all seems a bit complex, but my conclusions so far:

    • gcc -O2 changes the order of independent floating point operations with the aim of alternating addpd and mulpd's if possible. Same applies to gcc-4.6.2 -O2 -march=core2.

    • gcc -O2 -march=nocona seems to keep the order of floating point operations as defined in the C++ source.

    • cl /O2, the 64-bit compiler from the SDK for Windows 7 does loop-unrolling automatically and seems to try and arrange operations so that groups of three addpd's alternate with three mulpd's (well, at least on my system and for my simple programme).

    • My Core i5 750 (Nehalem architecture) doesn't like alternating add's and mul's and seems unable to run both operations in parallel. However, if grouped in 3's, it suddenly works like magic.

    • Other architectures (possibly Sandy Bridge and others) appear to be able to execute add/mul in parallel without problems if they alternate in the assembly code.

    • Although difficult to admit, but on my system cl /O2 does a much better job at low-level optimising operations for my system and achieves close to peak performance for the little C++ example above. I measured between 1.85-2.01 flops/cycle (have used clock() in Windows which is not that precise. I guess, need to use a better timer - thanks Mackie Messer).

    • The best I managed with gcc was to manually loop unroll and arrange additions and multiplications in groups of three. With g++ -O2 -march=nocona addmul_unroll.cpp I get at best 0.207s, 4.825 Gflops which corresponds to 1.8 flops/cycle which I'm quite happy with now.

    In the C++ code I've replaced the for loop with:

       for (int i=0; i<loops/3; i++) {
           mul1*=mul; mul2*=mul; mul3*=mul;
           sum1+=add; sum2+=add; sum3+=add;
           mul4*=mul; mul5*=mul; mul1*=mul;
           sum4+=add; sum5+=add; sum1+=add;
    
           mul2*=mul; mul3*=mul; mul4*=mul;
           sum2+=add; sum3+=add; sum4+=add;
           mul5*=mul; mul1*=mul; mul2*=mul;
           sum5+=add; sum1+=add; sum2+=add;
    
           mul3*=mul; mul4*=mul; mul5*=mul;
           sum3+=add; sum4+=add; sum5+=add;
       }
    

    And the assembly now looks like:

    .L4:
    mulsd    xmm8, xmm3
    mulsd    xmm7, xmm3
    mulsd    xmm6, xmm3
    addsd    xmm13, xmm2
    addsd    xmm12, xmm2
    addsd    xmm11, xmm2
    mulsd    xmm5, xmm3
    mulsd    xmm1, xmm3
    mulsd    xmm8, xmm3
    addsd    xmm10, xmm2
    addsd    xmm9, xmm2
    addsd    xmm13, xmm2
    ...
    
  • Weaver
    Weaver over 12 years
    I may be mistaken, but I believe g++ with -O2 will attempt to automatically unwind the loop (I think it uses Duff's Device).
  • user1059432
    user1059432 over 12 years
    Yes, thanks indeed it improves somewhat. I now get about 4.1-4.3 Gflops, or 1.55 flops per cycle. And no, in this example -O2 didn't loop unroll.
  • jim mcnamara
    jim mcnamara over 12 years
    Weaver is correct about loop unrolling, I believe. So manually unrolling is probably not necessary
  • user1059432
    user1059432 over 12 years
    See assembly output above, there are no signs of loop unrolling.
  • jim mcnamara
    jim mcnamara over 12 years
    Have you considered threading (poor man's parallelization), assuming you have multicore cpus?
  • user1059432
    user1059432 over 12 years
    Indeed threading should give me another factor of 4 on this cpu but I'd like to get the maximum from one core first.
  • user1059432
    user1059432 over 12 years
    Automatic unrolling also improves to average of 4.2 Gflops, but requires -funroll-loops option which is not even included in -O3. See g++ -c -Q -O2 --help=optimizers | grep unroll.
  • Bahbar
    Bahbar over 12 years
    and what does the disassembly look like ?
  • user1059432
    user1059432 over 12 years
    Your results are very impressive. I've compiled your code with g++ on my older system but don't get nearly as good results: 100k iterations, 1.814s, 5.292 Gflops, sum=0.448883 out of a peak 10.68 Gflops or just short of 2.0 flops per cycle. Seems add/mul are not executed in parallel. When I change your code and always add/multiply with the same register, say rC, it suddenly achieves almost peak: 0.953s, 10.068 Gflops, sum=0 or 3.8 flops/cycle. Very strange.
  • Mysticial
    Mysticial over 12 years
    Yes, since I'm not using inline assembly, the performance is indeed very sensitive to the compiler. The code I have here has been tuned for VC2010. And if I recall correctly, the Intel Compiler gives just as good results. As you have noticed, you may have to tweak it a bit to get it to compile well.
  • Mysticial
    Mysticial over 12 years
    Actually, are you sure you're compiling for x64? If it's x86 it will spill like crazy unless you do what you did - use the same register, and let the hardware handle it with register renaming.
  • user1059432
    user1059432 over 12 years
    Looking at the assembly code g++ keeps the order of instructions of your main inner loop, i.e. mulpd xmm0, xmm5, addpd xmm1, xmm4, mulpd xmm15, xmm3, subpd xmm14, xmm2, ..., so I think it might not be a compiler issue but maybe an architectural difference between my Nehalem and your Sandy Bridge cpu or maybe OS? Have you tested your programme on different cpu's? And yes, I'm compiling 64-bit mode and checked the assembly is fine.
  • Mysticial
    Mysticial over 12 years
    I just ran it on my Core i7 920 @ 3.5 GHz. Theoretical = 14 GFlops. Actual: 13.3 GFlops. Still under Windows + VS2010. So there's definitely something funny going on... Unfortunately, I don't have a Linux machine in front of me this moment.
  • Mysticial
    Mysticial over 12 years
    I just tested this is with Intel Compiler on Windows. It looks like it also has trouble overlapping the mul and adds. As it only gets 11.1 GFlops vs. 14.0 theoretical. When I look at the assembly, it "pushes" the additions further down the loop so there's no overlap at the beginning or end... So I do think this is very compiler-dependent. Apparently I incorrectly recalled that the Intel Compiler gave just as good results - it must have been on a different benchmark.
  • user1059432
    user1059432 over 12 years
    Interesting, that's less than 1 flop/cycle. Does the compiler mix the addsd's and mulsd's or are they in groups as in my assembly output? I also get just about 1 flop/cycle when the compiler mixes them (which I get without -march=native). How does the performance change if you add a line add=mul; at the beginning of the function addmul(...)?
  • Mackie Messer
    Mackie Messer over 12 years
    @user1059432: The addsd and subsd instructions are indeed mixed in the precise version. I tried clang 3.0 too, it doesn't mix instructions and it comes very close to 2 flops/cycle on the core 2 duo. When i run the same code on my laptops core i5, mixing the code makes no difference. I get about 3 flops/cycle in either case.
  • Mackie Messer
    Mackie Messer over 12 years
    @user1059432: In the end it is all about tricking the compiler into generating "meaningful" code for a synthetic benchmark. This is harder than it seems at first look. (i.e. icc outsmarts your benchmark) If all you want is to run some code at 4 flops/cycle the easiest thing is to write a small assembly loop. Much less headache. :-)
  • user1059432
    user1059432 over 12 years
    Ok, so you get close to 2 flops/cycle with an assembly code similar to what I've quoted above? How close to 2? I only get 1.4 so that's significant. I don't think you get 3 flops/cycle on your laptop unless the compiler does optimisations as you've seen with icc before, can you double check the assembly?
  • Mackie Messer
    Mackie Messer over 12 years
    @user1059432: The 3 flops/cycle for the laptop was wrong. I assumed that it runs at 1.7GHz, but apparently in turbo mode it goes up to 2.7GHz. So it runs at slightly less than 2 flops/cycle too. Well that's as good as it gets without vectorizing, right?
  • Mackie Messer
    Mackie Messer over 12 years
    There are only three instructions of loop overhead: inc, cmp, and jl. All of these can go to port #5 and don't interfere with either vectorized fadd or fmul. I would rather suspect that the decoder (sometimes) gets into the way. It needs to sustain between two and three instructions per cycle. I don't remember the exact limitations but instruction length, prefixes and alignment come all into play.
  • Jarosław Bielawski
    Jarosław Bielawski over 12 years
    cmp and jl certainly go to port 5, inc not so sure as it comes always in group with the 2 others. But you are right, it's hard to tell where the bottleneck is and the decoders can also be part of it.
  • Mysticial
    Mysticial over 12 years
    I just added some new results on my server. Compiled with GCC 4.5.2, it also gives very good results. However, I want to mention that this test does tend to heat up the processor to extreme levels. So if you're seeing bad results, make sure it isn't because of CPU throttling.
  • Mysticial
    Mysticial over 12 years
    Another thing that I noticed... Pay attention to CPU throttling. These pure do-nothing Flops tests produce a lot of heat. I just tried running my code on my laptop and it throttled down and overheated... I killed it before the temperatures got above 95C.
  • Mackie Messer
    Mackie Messer over 12 years
    I played around a bit with the basic loop: the ordering of the instructions does matter. Some arrangements take 13 cycles instead of the minimal 5 cycles. Time to look at the performance event counters I guess...
  • user1059432
    user1059432 over 12 years
    I can confirm your results on Windows 7 using cl /O2 (64-bit from windows sdk) and even my example runs close to peak for scalar operations (1.9 flops/cycle) there. The compiler loop-unrolls and reorders, but that might not be the reason; need to look into this a bit more. Throttling not a problem, I'm nice to my cpu and keep iterations at 100k. :)
  • jørgensen
    jørgensen over 12 years
    You declare r0 as "register", but then take the address later with &r0. Clang throws an error on that, naturally.
  • Mysticial
    Mysticial over 12 years
    @jørgensen You could get rid of the register declaration or you could change the &r0. I don't think any modern compiler pays attention to it anymore. I put it there more as a readability thing. But in any case, I only tested this on MSVC and g++.
  • greyfade
    greyfade over 11 years
    It should be pointed out in this answer that the design of AMD's FX CPUs and A-series APUs provides only two 128-bit SSE or one 256-bit AVX pipeline for every two CPU cores.
  • Mysticial
    Mysticial over 11 years
    @greyfade Not only that, but to achieve peak on those, this benchmark would need to be updated to use FMA instructions. As much as I'd love to play with that, I don't actually have access to an AMD Bulldozer architecture machine.
  • Mysticial
    Mysticial over 11 years
    In any case, this question seems to be getting some unusual attention today and the last time I've touched it was over a year ago. I'll see if I can update the code to be more copy-paste friendly.
  • greyfade
    greyfade over 11 years
    @Mysticial: It showed up on the r/coding subreddit today.
  • haylem
    haylem almost 11 years
    +1. Awesome. Some questions though: Does the CPU melt or the computer take-off when you run it? :)
  • Mysticial
    Mysticial almost 11 years
    @haylem It either melts or it takes off. Never both. If there's enough cooling, it will get airtime. Otherwise, it just melts. :)
  • user541686
    user541686 over 10 years
    @Mysticial: Do you happen to have similar code for Haswell? I have an i7-4700MQ I'd like to melt!
  • Mysticial
    Mysticial over 10 years
    @Mehrdad Not yet. But I do have the hardware for it. I just need to find the time to sit down and actually do it. :)
  • user541686
    user541686 over 10 years
    @Mysticial: Haha okay. If (er, when) you do it please let me know! Thanks! lol
  • Mysticial
    Mysticial over 10 years
    @Mehrdad FMA3 added. But it doesn't run as hot as I had expected. My Pi program's stress-test seems to run hotter. I suspect the lack of cache and memory access is "hurting" this benchmark in the "heat department". :(
  • Admin
    Admin about 9 years
    How important is the instruction cache when writing this kind of peak efficiency code? I'm always wondering with these kinds of aggressively unrolled examples how relevant the instruction cache is. An 'inter-icache' miss when exiting the function seems trivial given how much time is spent in this one function, but at what level does it become insane to unroll so much for 'intra-icache' misses within the same function? Is this generally not a concern?
  • Mysticial
    Mysticial about 9 years
    @Ike The icache is actually pretty large. It will easily fit the unrolled loop in this benchmark. Admittedly, it's tricky to benchmark icache since when the code becomes that large, you start to hit decoder limitations at the same time.
  • Tushar Sudake
    Tushar Sudake over 8 years
    @Mysticial What's the multiplication factor 2 on line 'uint64 ops = 48 * 1000 * iterations * tds * 2;' ?
  • Mysticial
    Mysticial over 8 years
    @TusharSudake It's for the SIMD. Each of those SSE2 instructions do two ops.
  • Tushar Sudake
    Tushar Sudake over 8 years
    @Mysticial Thanks! Is there any official source to assume factor 2 or 4 (ops per instruction)? I tried my best, but couldn't find one.
  • Mysticial
    Mysticial over 8 years
    @TusharSudake Just look up the instruction. For example, if you google for _mm_mul_pd() it says that it multiplies two pairs of doubles.
  • markhahn
    markhahn about 7 years
    old thread, but for the record: rdtsc needs to come after a serializing instruction, since otherwise it gets executed out of order.
  • tambre
    tambre over 6 years
    Don't use `using namespace std;`.
  • Adrian Maire
    Adrian Maire over 4 years
    @tambre only in headers
  • ShadowRanger
    ShadowRanger over 4 years
    @AdrianMaire: It's extra bad in headers, but it's bad everywhere. The std namespace is huge, and contains tons of common names in it; the odds of a collision with some other name in your program or a third party header (since so many other things fail to namespace at all) is unacceptably high to recommend it for any purpose.
  • Noah
    Noah about 3 years
    @Mysticial making the inner loop do less than branch history table size iterations (I unrolled 8x so 125 inner iterations) sped up code by a fair amount on my Tigerlake. FLOPs = 5.62003e+11 vs FLOPs = 4.49786e+11. You might run into issues if you do this by increasing outer loop iterations instead of inner loop iterations because the inner loop may run out of the LSD which guarantees a branch miss per inner loop. (this is for the AVX benchmark).
  • Noah
    Noah about 3 years
    Currently inner loop runs out of the LSD (on machines w/ LSD at least) so you get 1 branch miss per inner iteration. So inner loop iterations should probably be smaller to fully fit in BHT and size of inner loop larger to bypass LSD.
  • Asher
    Asher almost 3 years
    @ShadowRanger This is not enterprise code, and the whole thread is about benchmarking. Given the context, I think using namespace std here is fine.