AVX2：基于掩码进行左打包（pack left）的最高效方法是什么？

如果您的目标平台是 AMD Zen，这个方法可能更可取，因为 pdep 和 pext 在 Ryzen 上非常慢（每条指令 18 个周期）。我想出了这个方法，它使用一个压缩的 LUT，大小为 768（+1 填充）字节，而不是 8k。它需要广播一个标量值，然后在每个通道（lane）上右移不同的位数，再取其低 3 位，从而得到一个取值为 0-7 的 LUT 索引。下面是使用内联函数（intrinsics）的版本，以及构建 LUT 的代码。

//Generate Move mask via: _mm256_movemask_ps(_mm256_castsi256_ps(mask)); etc
__m256i MoveMaskToIndices(u32 moveMask) {
    u8 *adr = g_pack_left_table_u8x3 + moveMask * 3;
    __m256i indices = _mm256_set1_epi32(*reinterpret_cast<u32*>(adr));//lower 24 bits has our LUT

    // __m256i m = _mm256_sllv_epi32(indices, _mm256_setr_epi32(29, 26, 23, 20, 17, 14, 11, 8));

    //now shift it right to get 3 bits at bottom
    //__m256i shufmask = _mm256_srli_epi32(m, 29);

    //Simplified version suggested by wim
    //shift each lane so desired 3 bits are a bottom
    //There is leftover data in the lane, but _mm256_permutevar8x32_ps  only examines the first 3 bits so this is ok
    __m256i shufmask = _mm256_srlv_epi32 (indices, _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21));
    return shufmask;
}

u32 get_nth_bits(int a) {
    u32 out = 0;
    int c = 0;
    for (int i = 0; i < 8; ++i) {
        auto set = (a >> i) & 1;
        if (set) {
            out |= (i << (c * 3));
            c++;
        }
    }
    return out;
}

u8 g_pack_left_table_u8x3[256 * 3 + 1];

void BuildPackMask() {
    for (int i = 0; i < 256; ++i) {
        *reinterpret_cast<u32*>(&g_pack_left_table_u8x3[i * 3]) = get_nth_bits(i);
    }
}

下面是 MSVC 生成的汇编代码：

  lea ecx, DWORD PTR [rcx+rcx*2]
  lea rax, OFFSET FLAT:unsigned char * g_pack_left_table_u8x3 ; g_pack_left_table_u8x3
  vpbroadcastd ymm0, DWORD PTR [rcx+rax]
  vpsrlvd ymm0, ymm0, YMMWORD PTR __ymm@00000015000000120000000f0000000c00000009000000060000000300000000

AVX2：基于掩码进行左打包（pack left）的最高效方法是什么？

3 个回答