/*The moral of this story is simple:
inst-level optimizations
don't matter much on a modern Intel CPU
because it converts complex insts
to a stream of RISC insts.

I learned this the hard way when I thought
I was greatly improving my compiler by
cutting code by a third.  No significant
speed-up.  Depressing.
*/

#define SAMPLES (8*10000000+1)

asm {
//Four routines that each add up the integers 1..SAMPLES-1 and leave
//the sum in RAX.  They differ only in how the loop is coded, so their
//run times can be compared.  All of them overwrite RCX; the first one
//also overwrites RDX.

LIMIT:: DU64    SAMPLES;        //Memory reference should be bad, right?

//Counts RCX up from 1, adding the pre-increment value to RAX on each
//pass, and re-reads the loop bound from memory every iteration.
_BADLY_UNOPTIMIZED::
        MOV     RAX,0           //RAX=running sum
        MOV     RCX,1           //RCX=next value to add
@@05:   MOV     RDX,RCX         //copy so RCX can be bumped before the ADD
        INC     RCX             //if no dependencies, Free!
        ADD     RAX,RDX
        MOV     RDX,LIMIT-16    //added 16 displacement to make it worse
        CMP     RCX,U64 16[RDX] //(LIMIT-16)+16: compares with the U64 at LIMIT
        JB      @@05            //unsigned: keep going while RCX<SAMPLES
        RET

//Counts RCX down from SAMPLES-1 to 1.  DEC sets the zero flag, so no
//separate compare and no memory access is needed.
_WELL_OPTIMIZED1::
        XOR     RAX,RAX         //RAX=running sum
        MOV     RCX,SAMPLES-1
@@05:   ADD     RAX,RCX
        DEC     RCX
        JNZ     @@05            //stop when RCX hits zero
        RET

//Same countdown with the ADD/DEC pair repeated 8 times per branch.
//RCX is only tested after the 8th DEC, so SAMPLES-1 must be a multiple
//of 8 or the count will step over zero without stopping.
_WELL_OPTIMIZED2:: //Unrolled
        XOR     RAX,RAX
        MOV     RCX,SAMPLES-1
@@05:   ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        ADD     RAX,RCX
        DEC     RCX
        JNZ     @@05            //flags come from the last DEC only
        RET

//Same countdown using the single LOOP inst, which decrements RCX and
//branches while it is nonzero.
_WELL_OPTIMIZED3::
        XOR     RAX,RAX
        MOV     RCX,SAMPLES-1
@@05:   ADD     RAX,RCX
        LOOP    @@05    //Inst has slow speed, but saves code size.
        RET
}

//Give each asm label a HolyC name so it can be called like a function
//that takes no args and returns an I64.
_extern _BADLY_UNOPTIMIZED I64 Loop1();
_extern _WELL_OPTIMIZED1   I64 Loop2();
_extern _WELL_OPTIMIZED2   I64 Loop3();
_extern _WELL_OPTIMIZED3   I64 Loop4();

I64 i;  //Sum returned by the loop being timed
F64 t0; //tS reading taken just before the call

CPURep();

//Each stanza: print a heading, note the start time, run one loop,
//then print its result and the elapsed seconds.

Print("Bad Code\n");
t0=tS();
i=Loop1();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

Print("Good Code #1\n");
t0=tS();
i=Loop2();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

Print("Good Code #2\n");
t0=tS();
i=Loop3();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

Print("Good Code #3\n");
t0=tS();
i=Loop4();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

/*  Program Output
8 Cores 2.660GHz
Bad Code
Res:3200000040000000 Time: 0.069966
Good Code #1
Res:3200000040000000 Time: 0.062567
Good Code #2
Res:3200000040000000 Time: 0.062907
Good Code #3
Res:3200000040000000 Time: 0.156359
*/