为什么逐元素的加法运算在分开的循环中比在合并的循环中快得多?
假设 a1、b1、c1 和 d1 是数组,核心循环如下:
const int n = 100000;for (int j = 0; j < n; j++) { a1[j] += b1[j]; c1[j] += d1[j];}
将其拆分为两个独立的 for 循环后:
for (int j = 0; j < n; j++) { a1[j] += b1[j];}for (int j = 0; j < n; j++) { c1[j] += d1[j];}
movsd xmm0,mmword ptr [edx+18h]addsd xmm0,mmword ptr [ecx+20h]movsd mmword ptr [ecx+20h],xmm0 movsd xmm0,mmword ptr [esi+10h]addsd xmm0,mmword ptr [eax+30h]movsd mmword ptr [eax+30h],xmm0 movsd xmm0,mmword ptr [edx+20h]addsd xmm0,mmword ptr [ecx+28h]movsd mmword ptr [ecx+28h],xmm0 movsd xmm0,mmword ptr [esi+18h]addsd xmm0,mmword ptr [eax+38h]
addsd xmm0,mmword ptr [eax+28h]movsd mmword ptr [eax+28h],xmm0 movsd xmm0,mmword ptr [ecx+20h]addsd xmm0,mmword ptr [eax+30h]movsd mmword ptr [eax+30h],xmm0 movsd xmm0,mmword ptr [ecx+28h]addsd xmm0,mmword ptr [eax+38h]movsd mmword ptr [eax+38h],xmm0 movsd xmm0,mmword ptr [ecx+30h]addsd xmm0,mmword ptr [eax+40h]movsd mmword ptr [eax+40h],xmm0
您能否详细说明,是什么导致了下图中五个区域所示的不同缓存行为?
如果能为不同的 CPU 提供类似的图表,从而指出各种 CPU/缓存体系结构之间的差异,也会很有意思。
凤凰求蛊
猛跑小猪