GCT1015
让我们来看看GCC 4.8对它做了什么无__builtin_expect#include "stdio.h"#include "time.h"int main() { /* Use time to prevent it from being optimized away. */ int i = !time(NULL); if (i) printf("%d\n", i); puts("a"); return 0;}用GCC 4.8.2x86_64 Linux编译和反编译:gcc -c -O3 -std=gnu11 main.cobjdump -dr main.o产出:0000000000000000 <main>: 0: 48 83 ec 08 sub $0x8,%rsp 4: 31 ff xor %edi,%edi 6: e8 00 00 00 00 callq b <main+0xb> 7: R_X86_64_PC32 time-0x4 b: 48 85 c0 test %rax,%rax e: 75 14 jne 24 <main+0x24> 10: ba 01 00 00 00 mov $0x1,%edx 15: be 00 00 00 00 mov $0x0,%esi 16: R_X86_64_32 .rodata.str1.1 1a: bf 01 00 00 00 mov $0x1,%edi 1f: e8 00 00 00 00 callq 24 <main+0x24> 20: R_X86_64_PC32 __printf_chk-0x4 24: bf 00 00 00 00 mov $0x0,%edi 25: R_X86_64_32 .rodata.str1.1+0x4 29: e8 00 00 00 00 callq 2e <main+0x2e> 2a: R_X86_64_PC32 puts-0x4 2e: 31 c0 xor %eax,%eax 30: 48 83 c4 08 add $0x8,%rsp 34: c3 retq内存中的指令顺序保持不变:首先,printf然后puts而retq回去吧。带着__builtin_expect现在替换if (i)有:if (__builtin_expect(i, 0))我们得到:0000000000000000 <main>: 0: 48 83 ec 08 sub $0x8,%rsp 4: 31 ff xor %edi,%edi 6: e8 00 00 00 00 callq b <main+0xb> 7: R_X86_64_PC32 time-0x4 b: 48 85 c0 test %rax,%rax e: 74 11 je 21 <main+0x21> 10: bf 00 00 00 00 mov $0x0,%edi 11: R_X86_64_32 .rodata.str1.1+0x4 15: e8 00 00 00 00 callq 1a <main+0x1a> 16: R_X86_64_PC32 puts-0x4 1a: 31 c0 xor %eax,%eax 1c: 48 83 c4 08 add $0x8,%rsp 20: c3 retq 21: ba 01 00 00 00 mov $0x1,%edx 26: be 00 00 00 00 mov $0x0,%esi 27: R_X86_64_32 .rodata.str1.1 2b: bf 01 00 00 00 mov $0x1,%edi 30: e8 00 00 00 00 callq 35 <main+0x35> 31: R_X86_64_PC32 __printf_chk-0x4 35: eb d9 jmp 10 <main+0x10>这个printf(汇编成__printf_chk)被移到函数的末尾,之后puts以及其他答案中提到的改进分支预测的回报。所以基本上是一样的:int i = !time(NULL);if (i) goto printf;puts:puts("a");return 0;printf:printf("%d\n", i);goto puts;这个优化没有用-O0.但是,在编写一个运行速度更快的示例时,祝您好运。__builtin_expect比没有,那些时候CPU真的很聪明..我天真的尝试在这里.