qq_笑_17
您可以利用一些较小的并行机制来获得较大的寄存器长度和较小的和。例如,将16个1字节的值相加(恰好适合一个sse寄存器),仅需要对数2 16的加法和相等的移位数。数量不多,但比15个依赖的附加项和附加的内存访问快。__m128i x = _mm_set_epi8(3,1,7,0,4,1,6,3,3,1,7,0,4,1,6,3);x = _mm_add_epi8(x, _mm_srli_si128(x, 1));x = _mm_add_epi8(x, _mm_srli_si128(x, 2));x = _mm_add_epi8(x, _mm_srli_si128(x, 4));x = _mm_add_epi8(x, _mm_srli_si128(x, 8));// x == 3, 4, 11, 11, 15, 16, 22, 25, 28, 29, 36, 36, 40, 41, 47, 50如果总和更长,则可以通过利用指令级并行性并利用指令重新排序来隐藏依赖项。编辑:类似__m128i x0 = _mm_set_epi8(3,1,7,0,4,1,6,3,3,1,7,0,4,1,6,3);__m128i x1 = _mm_set_epi8(3,1,7,0,4,1,6,3,3,1,7,0,4,1,6,3);__m128i x2 = _mm_set_epi8(3,1,7,0,4,1,6,3,3,1,7,0,4,1,6,3);__m128i x3 = _mm_set_epi8(3,1,7,0,4,1,6,3,3,1,7,0,4,1,6,3);__m128i mask = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);x0 = _mm_add_epi8(x0, _mm_srli_si128(x0, 1));x1 = _mm_add_epi8(x1, _mm_srli_si128(x1, 1));x2 = _mm_add_epi8(x2, _mm_srli_si128(x2, 1));x3 = _mm_add_epi8(x3, _mm_srli_si128(x3, 1));x0 = _mm_add_epi8(x0, _mm_srli_si128(x0, 2));x1 = _mm_add_epi8(x1, _mm_srli_si128(x1, 2));x2 = _mm_add_epi8(x2, _mm_srli_si128(x2, 2));x3 = _mm_add_epi8(x3, _mm_srli_si128(x3, 2));x0 = _mm_add_epi8(x0, _mm_srli_si128(x0, 4));x1 = _mm_add_epi8(x1, _mm_srli_si128(x1, 4));x2 = _mm_add_epi8(x2, _mm_srli_si128(x2, 4));x3 = _mm_add_epi8(x3, _mm_srli_si128(x3, 4));x0 = _mm_add_epi8(x0, _mm_srli_si128(x0, 8));x1 = _mm_add_epi8(x1, _mm_srli_si128(x1, 8));x2 = _mm_add_epi8(x2, _mm_srli_si128(x2, 8));x3 = _mm_add_epi8(x3, _mm_srli_si128(x3, 8));x1 = _mm_add_epi8(_mm_shuffle_epi8(x0, mask), x1);x2 = _mm_add_epi8(_mm_shuffle_epi8(x1, mask), x2);x3 = _mm_add_epi8(_mm_shuffle_epi8(x2, mask), x3);