templeos-info/public/Wb/Demo/Lectures/Optimization.HC

109 lines
1.8 KiB
HolyC
Executable File

/*The moral of this story is simple
inst level optimizations
don't matter much on a modern Intel CPU
because they convert complex insts
to a stream of RISC insts.
I learned this the hard way when I thought
I was greatly improving my compiler by
cutting code by a third. No significant
speed-up. Depressing.
*/
//Loop bound shared by all four routines: each one sums the integers
//1..SAMPLES-1.  SAMPLES-1 is 8*10000000, a multiple of 8, which the
//8x unrolled routine depends on to terminate.
#define SAMPLES (8*10000000+1)
asm {
//Four routines that all compute the same sum (1+2+...+(SAMPLES-1)) and
//leave it in RAX.  They take no inputs and use only RAX, RCX and RDX.
//Each routine has its own @@05 loop label.

//The loop bound stored as a U64 in memory, so that _BADLY_UNOPTIMIZED
//must read it with a memory operand instead of using an immediate.
LIMIT:: DU64 SAMPLES; //Memory reference should be bad, right?

//Counts RCX up from 1 and adds each value to RAX until RCX reaches the
//value stored at LIMIT.  Intentionally wasteful: zeroing with a MOV
//immediate, an extra register copy, and the bound's address is rebuilt
//and the bound re-read from memory on every pass.
_BADLY_UNOPTIMIZED::
MOV RAX,0 //running sum
MOV RCX,1 //counter, starts at 1
@@05: MOV RDX,RCX //keep the counter's value from before the increment
INC RCX //if no dependencies, Free!
ADD RAX,RDX //sum+=pre-increment counter
MOV RDX,LIMIT-16 //added 16 displacement to make it worse
CMP RCX,U64 16[RDX] //16+(LIMIT-16) addresses LIMIT itself
JB @@05 //unsigned compare: continue while counter<bound
RET

//Counts RCX down from SAMPLES-1 to 1, adding each value to RAX.
//Counting down lets DEC set the zero flag for JNZ, so no separate CMP
//and no memory access are needed.
_WELL_OPTIMIZED1::
XOR RAX,RAX //sum=0
MOV RCX,SAMPLES-1 //counter=highest term
@@05: ADD RAX,RCX
DEC RCX
JNZ @@05 //exit once the counter reaches zero
RET

//Same countdown sum with the ADD/DEC pair repeated 8 times per branch.
//Only the final DEC of each pass is tested by JNZ, so the start count
//(SAMPLES-1) has to be a multiple of 8 for RCX to land on zero there.
_WELL_OPTIMIZED2:: //Unrolled
XOR RAX,RAX //sum=0
MOV RCX,SAMPLES-1 //counter=highest term
@@05: ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
ADD RAX,RCX
DEC RCX
JNZ @@05 //one branch per 8 terms
RET

//Same countdown sum, using the LOOP inst to do the decrement of RCX
//and the branch-if-nonzero in a single inst.
_WELL_OPTIMIZED3::
XOR RAX,RAX //sum=0
MOV RCX,SAMPLES-1 //counter=highest term
@@05: ADD RAX,RCX
LOOP @@05 //Inst has slow speed, but saves code size.
RET
}
//Bind each asm label above to a HolyC function name that takes no args
//and returns an I64, so the routines can be called from the code below.
_extern _BADLY_UNOPTIMIZED I64 Loop1();
_extern _WELL_OPTIMIZED1 I64 Loop2();
_extern _WELL_OPTIMIZED2 I64 Loop3();
_extern _WELL_OPTIMIZED3 I64 Loop4();
//Benchmark driver: call CPURep, then for each routine print a title,
//take a start time, run the routine once, and print its result along
//with the elapsed time (tS now minus tS at start).
I64 i;  //value returned by the routine under test
F64 t0; //tS reading taken just before the call

CPURep();

Print("Bad Code\n");
t0=tS();
i=Loop1();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

Print("Good Code #1\n");
t0=tS();
i=Loop2();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

Print("Good Code #2\n");
t0=tS();
i=Loop3();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);

Print("Good Code #3\n");
t0=tS();
i=Loop4();
Print("Res:%d Time:%9.6f\n",i,tS()-t0);
/* Program Output$HL,0$$WW+H,1$$FD,1$
8 Cores 2.660GHz
Bad Code
Res:3200000040000000 Time: 0.069966
Good Code #1
Res:3200000040000000 Time: 0.062567
Good Code #2
Res:3200000040000000 Time: 0.062907
Good Code #3
Res:3200000040000000 Time: 0.156359
$HL,1$*/