<!-- Listing metadata: 140 lines, 11 KiB, HTML, executable file -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="generator" content="TempleOS V5.03">
<meta name="viewport" content="width=device-width">
<title>Instruction-Level Optimization Demo - TempleOS</title>
<link rel="stylesheet" href="/style/templeos.css">
<!-- Loaded synchronously, exactly as the generator emitted it. NOTE(review): this could take `defer` if templeos.js does not need to run while the page is still parsing; confirm before changing. -->
<script src="/script/templeos.js"></script>
<style>
/* Text colour classes cF0-cFF: sixteen foreground colours, each on a white background.
   The listing in the body selects them with class="cF?" on its spans. */
.cF0{color:#000000;background-color:#ffffff;}
.cF1{color:#0000aa;background-color:#ffffff;}
.cF2{color:#00aa00;background-color:#ffffff;}
.cF3{color:#00aaaa;background-color:#ffffff;}
.cF4{color:#aa0000;background-color:#ffffff;}
.cF5{color:#aa00aa;background-color:#ffffff;}
.cF6{color:#aa5500;background-color:#ffffff;}
.cF7{color:#aaaaaa;background-color:#ffffff;}
.cF8{color:#555555;background-color:#ffffff;}
.cF9{color:#5555ff;background-color:#ffffff;}
.cFA{color:#55ff55;background-color:#ffffff;}
.cFB{color:#55ffff;background-color:#ffffff;}
.cFC{color:#ff5555;background-color:#ffffff;}
.cFD{color:#ff55ff;background-color:#ffffff;}
.cFE{color:#ffff55;background-color:#ffffff;}
/* NOTE(review): cFF is white text on a white background, so anything using it is invisible. */
.cFF{color:#ffffff;background-color:#ffffff;}
</style>
</head>
<body>
<!-- Syntax-coloured source listing. Every listing line starts with an empty <a name="lN"> anchor so it can be addressed as #lN. Colour spans are opened on one line and closed at the start of the next, so the lines must stay in this order. -->
<pre id="content">
<a name="l1"></a><span class="cF2">/*The moral of this story is simple</span><span class="cF0">
<a name="l2"></a></span><span class="cF2">inst level optimizations</span><span class="cF0">
<a name="l3"></a></span><span class="cF2">don't matter much on a modern Intel CPU</span><span class="cF0">
<a name="l4"></a></span><span class="cF2">because they convert complex insts</span><span class="cF0">
<a name="l5"></a></span><span class="cF2">to a stream of RISC insts.</span><span class="cF0">
<a name="l6"></a>
<a name="l7"></a></span><span class="cF2">I learned this the hard way when I thought</span><span class="cF0">
<a name="l8"></a></span><span class="cF2">I was greatly improving my compiler by</span><span class="cF0">
<a name="l9"></a></span><span class="cF2">cutting code by a third. No significant</span><span class="cF0">
<a name="l10"></a></span><span class="cF2">speed-up. Depressing.</span><span class="cF0">
<a name="l11"></a></span><span class="cF2">*/</span><span class="cF0">
<a name="l12"></a>
<a name="l13"></a>#</span><span class="cF1">define</span><span class="cF0"> SAMPLES (8*10000000+1)
<a name="l14"></a>
<a name="l15"></a></span><span class="cF1">asm</span><span class="cF0"> {
<a name="l16"></a>
<a name="l17"></a>LIMIT:: </span><span class="cF1">DU64</span><span class="cF0"> SAMPLES; </span><span class="cF2">//Memory reference should be bad, right?</span><span class="cF0">
<a name="l18"></a>
<a name="l19"></a>_BADLY_UNOPTIMIZED::
<a name="l20"></a> </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,0
<a name="l21"></a> </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">,1
<a name="l22"></a>@@05: </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RDX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l23"></a> </span><span class="cF1">INC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0"> </span><span class="cF2">//if no dependencies, Free!</span><span class="cF0">
<a name="l24"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RDX</span><span class="cF0">
<a name="l25"></a> </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RDX</span><span class="cF0">,LIMIT-16 </span><span class="cF2">//added 16 displacement to make it worse</span><span class="cF0">
<a name="l26"></a> </span><span class="cF1">CMP</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">,</span><span class="cF9">U64</span><span class="cF0"> 16[</span><span class="cFC">RDX</span><span class="cF0">]
<a name="l27"></a> </span><span class="cF1">JB</span><span class="cF0"> @@05
<a name="l28"></a> </span><span class="cF1">RET</span><span class="cF0">
<a name="l29"></a>
<a name="l30"></a>_WELL_OPTIMIZED1::
<a name="l31"></a> </span><span class="cF1">XOR</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RAX</span><span class="cF0">
<a name="l32"></a> </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">,SAMPLES-1
<a name="l33"></a>@@05: </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l34"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l35"></a> </span><span class="cF1">JNZ</span><span class="cF0"> @@05
<a name="l36"></a> </span><span class="cF1">RET</span><span class="cF0">
<a name="l37"></a>
<a name="l38"></a>_WELL_OPTIMIZED2:: </span><span class="cF2">//Unrolled</span><span class="cF0">
<a name="l39"></a> </span><span class="cF1">XOR</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RAX</span><span class="cF0">
<a name="l40"></a> </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">,SAMPLES-1
<a name="l41"></a>@@05: </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l42"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l43"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l44"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l45"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l46"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l47"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l48"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l49"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l50"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l51"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l52"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l53"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l54"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l55"></a> </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l56"></a> </span><span class="cF1">DEC</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">
<a name="l57"></a> </span><span class="cF1">JNZ</span><span class="cF0"> @@05
<a name="l58"></a> </span><span class="cF1">RET</span><span class="cF0">
<a name="l59"></a>
<a name="l60"></a>_WELL_OPTIMIZED3::
<a name="l61"></a> </span><span class="cF1">XOR</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RAX</span><span class="cF0">
<a name="l62"></a> </span><span class="cF1">MOV</span><span class="cF0"> </span><span class="cFC">RCX</span><span class="cF0">,SAMPLES-1
<a name="l63"></a>@@05: </span><span class="cF1">ADD</span><span class="cF0"> </span><span class="cFC">RAX</span><span class="cF0">,</span><span class="cFC">RCX</span><span class="cF0">
<a name="l64"></a> </span><span class="cF1">LOOP</span><span class="cF0"> @@05 </span><span class="cF2">//Inst has slow speed, but saves code size.</span><span class="cF0">
<a name="l65"></a> </span><span class="cF1">RET</span><span class="cF0">
<a name="l66"></a>}
<a name="l67"></a>
<a name="l68"></a></span><span class="cF1">_extern</span><span class="cF0"> _BADLY_UNOPTIMIZED </span><span class="cF9">I64</span><span class="cF0"> Loop1();
<a name="l69"></a></span><span class="cF1">_extern</span><span class="cF0"> _WELL_OPTIMIZED1 </span><span class="cF9">I64</span><span class="cF0"> Loop2();
<a name="l70"></a></span><span class="cF1">_extern</span><span class="cF0"> _WELL_OPTIMIZED2 </span><span class="cF9">I64</span><span class="cF0"> Loop3();
<a name="l71"></a></span><span class="cF1">_extern</span><span class="cF0"> _WELL_OPTIMIZED3 </span><span class="cF9">I64</span><span class="cF0"> Loop4();
<a name="l72"></a>
<a name="l73"></a></span><span class="cF9">I64</span><span class="cF0"> i;
<a name="l74"></a></span><span class="cF1">F64</span><span class="cF0"> t0;
<a name="l75"></a>
<a name="l76"></a></span><span class="cF5">CPURep</span><span class="cF0">;
<a name="l77"></a>
<a name="l78"></a></span><span class="cF6">"Bad Code\n"</span><span class="cF0">;
<a name="l79"></a>t0=</span><span class="cF5">tS</span><span class="cF0">;
<a name="l80"></a>i=Loop1;
<a name="l81"></a></span><span class="cF6">"Res:%d Time:%9.6f\n"</span><span class="cF0">,i,</span><span class="cF5">tS</span><span class="cF0">-t0;
<a name="l82"></a>
<a name="l83"></a></span><span class="cF6">"Good Code #1\n"</span><span class="cF0">;
<a name="l84"></a>t0=</span><span class="cF5">tS</span><span class="cF0">;
<a name="l85"></a>i=Loop2;
<a name="l86"></a></span><span class="cF6">"Res:%d Time:%9.6f\n"</span><span class="cF0">,i,</span><span class="cF5">tS</span><span class="cF0">-t0;
<a name="l87"></a>
<a name="l88"></a></span><span class="cF6">"Good Code #2\n"</span><span class="cF0">;
<a name="l89"></a>t0=</span><span class="cF5">tS</span><span class="cF0">;
<a name="l90"></a>i=Loop3;
<a name="l91"></a></span><span class="cF6">"Res:%d Time:%9.6f\n"</span><span class="cF0">,i,</span><span class="cF5">tS</span><span class="cF0">-t0;
<a name="l92"></a>
<a name="l93"></a></span><span class="cF6">"Good Code #3\n"</span><span class="cF0">;
<a name="l94"></a>t0=</span><span class="cF5">tS</span><span class="cF0">;
<a name="l95"></a>i=Loop4;
<a name="l96"></a></span><span class="cF6">"Res:%d Time:%9.6f\n"</span><span class="cF0">,i,</span><span class="cF5">tS</span><span class="cF0">-t0;
<a name="l97"></a>
<a name="l98"></a></span><span class="cF2">/* Program Output</span><span class="cF1">
<a name="l99"></a>8 Cores 2.660GHz
<a name="l100"></a>Bad Code
<a name="l101"></a>Res:3200000040000000 Time: 0.069966
<a name="l102"></a>Good Code #1
<a name="l103"></a>Res:3200000040000000 Time: 0.062567
<a name="l104"></a>Good Code #2
<a name="l105"></a>Res:3200000040000000 Time: 0.062907
<a name="l106"></a>Good Code #3
<a name="l107"></a>Res:3200000040000000 Time: 0.156359
<a name="l108"></a></span><span class="cF2">*/</span><span class="cF1">
</span></pre></body>
</html>