/*The moral of this story is simple:
instruction-level optimizations
don't matter much on a modern Intel CPU
because it converts complex insts
into a stream of RISC-like insts.

I learned this the hard way when I thought
I was greatly improving my compiler by
cutting code by a third.  No significant
speed-up.  Depressing.
*/

//Exclusive upper bound of the summation: every loop below adds the
//integers 1..SAMPLES-1 (80,000,000 terms).  SAMPLES-1 is kept a multiple
//of 8 because the unrolled loop consumes 8 values per pass and only tests
//its counter for zero once per pass.
#define SAMPLES	(8*10000000+1)

asm {

//Four routines that each compute the sum 1+2+...+(SAMPLES-1) in RAX.
//They take no arguments, use only RAX/RCX/RDX as scratch, and differ
//only in how the loop is coded, so their run times can be compared.
//The label @@05 is reused in every routine.

//Loop bound stored in memory as a 64-bit value; read by _BADLY_UNOPTIMIZED.
LIMIT::	DU64	SAMPLES;	//Memory reference should be bad, right?

//Deliberately wasteful version: counts RCX upward from 1, copies it
//through RDX before adding, reloads the address of the bound every
//pass and compares against memory instead of an immediate.
_BADLY_UNOPTIMIZED::
	//RAX = running sum (zeroed with a MOV immediate rather than XOR)
	MOV	RAX,0
	//RCX = next value to add, starts at 1
	MOV	RCX,1
	//RDX = value to add on this pass
@@05:	MOV	RDX,RCX
	INC	RCX		//if no dependencies, Free!
	ADD	RAX,RDX
	//RDX = address 16 bytes below LIMIT, so 16[RDX] below is LIMIT itself
	MOV	RDX,LIMIT-16	//added 16 displacement to make it worse
	//Unsigned compare against the in-memory bound; repeat while RCX<SAMPLES
	CMP	RCX,U64 16[RDX]
	JB	@@05
	RET

//Tight version: RCX counts down from SAMPLES-1 to 1 and is added
//directly; DEC sets the zero flag that JNZ tests, so no CMP is needed.
_WELL_OPTIMIZED1::
	//RAX = running sum, cleared
	XOR	RAX,RAX
	//RCX = current term, also the loop counter
	MOV	RCX,SAMPLES-1
@@05:	ADD	RAX,RCX
	DEC	RCX
	JNZ	@@05
	RET

//Same countdown as _WELL_OPTIMIZED1 with the ADD/DEC pair repeated
//8 times per branch.  Only the last DEC is followed by JNZ, so the
//start value SAMPLES-1 must be a multiple of 8 for RCX to hit zero
//exactly at the test.
_WELL_OPTIMIZED2:: //Unrolled
	XOR	RAX,RAX
	MOV	RCX,SAMPLES-1
@@05:	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	ADD	RAX,RCX
	DEC	RCX
	JNZ	@@05
	RET

//Same countdown again, but the DEC/JNZ pair is replaced by the single
//LOOP inst, which decrements RCX and branches while it is non-zero.
_WELL_OPTIMIZED3::
	XOR	RAX,RAX
	MOV	RCX,SAMPLES-1
@@05:	ADD	RAX,RCX
	LOOP	@@05	//Inst has slow speed, but saves code size.
	RET
}

//Give each asm entry point a HolyC name so it can be called below.
//All four take no arguments and are declared as returning an I64.
_extern _BADLY_UNOPTIMIZED I64 Loop1();
_extern _WELL_OPTIMIZED1   I64 Loop2();
_extern _WELL_OPTIMIZED2   I64 Loop3();
_extern _WELL_OPTIMIZED3   I64 Loop4();

//Benchmark driver: print the CPU report, then run each routine once,
//printing its result and the elapsed tS time around the call.
I64 i;	//value returned by the routine just timed
F64 t0;	//tS reading taken immediately before the call

CPURep();

//Deliberately poor loop
Print("Bad Code\n");
t0=tS;
i=Loop1();
Print("Res:%d Time:%9.6f\n",i,tS-t0);

//DEC/JNZ countdown
Print("Good Code #1\n");
t0=tS;
i=Loop2();
Print("Res:%d Time:%9.6f\n",i,tS-t0);

//Countdown unrolled 8x
Print("Good Code #2\n");
t0=tS;
i=Loop3();
Print("Res:%d Time:%9.6f\n",i,tS-t0);

//Countdown using the LOOP inst
Print("Good Code #3\n");
t0=tS;
i=Loop4();
Print("Res:%d Time:%9.6f\n",i,tS-t0);

/*  Program Output$HL,0$$WW+H,1$$FD,1$
8 Cores 2.660GHz
Bad Code
Res:3200000040000000 Time: 0.069966
Good Code #1
Res:3200000040000000 Time: 0.062567
Good Code #2
Res:3200000040000000 Time: 0.062907
Good Code #3
Res:3200000040000000 Time: 0.156359
$HL,1$*/