如何汇总 FloatResidentArray 并将值检索到设备或主机

我正在使用 Hybridizer 计算 FloatResidentArray 的总和,但由于最后的 AtomicExpr.apply 调用需要 ref 参数,我无法将计算出的总和返回到设备(或主机)。请看下面基于 Altimesh 提供的 GenericReduce 示例编写的代码:它接收一个长度为 N 的设备驻留浮点数组 a 并计算其总和,结果存放在 total[0] 中。


// Sums the first N elements of `a` and adds the result into total[0].
// Every thread accumulates a private partial sum, the threads of a block
// combine their partial sums through a shared-memory buffer, and the thread
// with threadIdx.x == 0 adds the block's result to total[0] atomically.
// NOTE(review): the halving loop folds in every buffer slot only when
// blockDim.x is a power of two - confirm against the launch configuration.
[Kernel]
 public static void Total(FloatResidentArray a, int N, float[] total)
 {
    var partials = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int lane = threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Per-thread accumulation: start at this thread's global index and
    // advance by the total number of launched threads.
    float acc = 0f;
    for (int idx = lane + blockDim.x * blockIdx.x; idx < N; idx += stride)
    {
        acc = acc + a[idx];
    }

    partials[lane] = acc;
    CUDAIntrinsics.__syncthreads();

    // Pairwise reduction inside the block; the barrier after each round makes
    // the previous round's writes visible before they are read again.
    for (int half = blockDim.x / 2; half != 0; half >>= 1)
    {
        if (lane < half)
        {
            partials[lane] = partials[lane] + partials[lane + half];
        }

        CUDAIntrinsics.__syncthreads();
    }

    // One thread per block publishes the block total.
    if (lane == 0)
    {
        AtomicExpr.apply(ref total[0], partials[0], (x, y) => x + y);
    }
 }

上面的代码无法编译,因为您不能在同一参数列表中传递 float[] 和 FloatResidentArray。


如果 total 定义为 FloatResidentArray 本身,那么编译器将不允许在最后一行代码中使用 ref 关键字。


如果我只是传递一个浮点数,则返回的变量不会用总数更新。


如果我传递一个 ref float,程序会在 HybRunner 包装上述代码以创建 dynamic 对象时抛出运行时错误,错误消息是:


不支持引用的值类型


我如何返回总数?– 无论是设备还是主机内存 – 两者都是可以接受的。


慕沐林林
浏览 102回答 1
1回答

白猪掌柜的

那么,您需要了解编组(marshalling)的工作原理。

在 .Net 中创建的对象和数组(甚至常驻数组)都位于主机端。然后我们会在内核执行之前对它们进行编组(固定主机内存、分配设备内存并将数据从主机复制到设备)。

- 对于 float[],这一切会自动完成。
- 对于 IntPtr,我们什么都不做,用户必须确保该 IntPtr 是一个包含数据的有效设备指针。
- 对于常驻数组,我们什么都不做,用户在需要来回传输数据时必须手动调用 RefreshDevice() 和 RefreshHost()。

混合使用 ResidentArray 和 float[] 是受支持的,如生成的 dll 的屏幕截图所示:

不支持的是:混合使用托管类型和 IntPtr。

下面是您的代码的完整可运行版本,它会返回正确的结果:

using Hybridizer.Runtime.CUDAImports;
using System;
using System.Runtime.InteropServices;

namespace SimpleMetadataDecorator
{
    class Program
    {
        [EntryPoint]
        public static void Total(FloatResidentArray a, int N, float[] total)
        {
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;
            float sum = 0f;
            while (tid < N)
            {
                sum = sum + a[tid];
                tid += blockDim.x * gridDim.x;
            }
            cache[cacheIndex] = sum;
            CUDAIntrinsics.__syncthreads();
            int i = blockDim.x / 2;
            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = cache[cacheIndex] + cache[cacheIndex + i];
                }
                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }
            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
            }
        }

        static void Main(string[] args)
        {
            const int N = 1024 * 1024 * 32;
            FloatResidentArray arr = new FloatResidentArray(N);
            float[] res = new float[1];
            for (int i = 0; i < N; ++i)
            {
                arr[i] = 1.0F;
            }
            arr.RefreshDevice();
            var runner = HybRunner.Cuda();
            cudaDeviceProp prop;
            cuda.GetDeviceProperties(out prop, 0);
            runner.SetDistrib(16 * prop.multiProcessorCount, 1, 128, 1, 1, 128 * sizeof(float));
            var wrapped = runner.Wrap(new Program());
            runner.saveAssembly();
            cuda.ERROR_CHECK((cudaError_t)(int)wrapped.Total(arr, N, res));
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
            Console.WriteLine(res[0]);
        }
    }
}
打开App,查看更多内容
随时随地看视频慕课网APP