素胚勾勒不出你
				这是我先前回答的更新。从Thrust 1.8.1开始,CUDA Thrust原语可以与thrust::device执行策略结合起来,以利用CUDA 动态并行性在单个CUDA线程中并行运行。下面,举一个例子。

#include <stdio.h>
#include <stdlib.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define BLOCKSIZE_1D    256
#define BLOCKSIZE_2D_X  32
#define BLOCKSIZE_2D_Y  32

/*************************/
/* TEST KERNEL FUNCTIONS */
/*************************/

/*
 * Row-wise sum using the sequential execution policy.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per matrix row; threads
 * whose index is >= Nrows do nothing.
 *
 * d_data    : Nrows x Ncols matrix stored row after row (read-only).
 * d_results : output, one sum per row (Nrows elements).
 */
__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < Nrows) {
        // Widen before multiplying so the row offset cannot wrap in 32-bit arithmetic.
        const float *rowBegin = d_data + (size_t)tid * Ncols;
        d_results[tid] = thrust::reduce(thrust::seq, rowBegin, rowBegin + Ncols);
    }
}

/*
 * Row-wise sum using the thrust::device execution policy from inside a kernel.
 *
 * Same launch layout and arguments as test1. According to the accompanying
 * text, this variant relies on CUDA dynamic parallelism.
 * NOTE(review): that presumably requires relocatable device code and a device
 * supporting dynamic parallelism -- confirm against the build setup.
 */
__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < Nrows) {
        // Widen before multiplying so the row offset cannot wrap in 32-bit arithmetic.
        const float *rowBegin = d_data + (size_t)tid * Ncols;
        d_results[tid] = thrust::reduce(thrust::device, rowBegin, rowBegin + Ncols);
    }
}

/********/
/* MAIN */
/********/

/*
 * Builds an Nrows x Ncols matrix on the host, computes the reference row sums
 * on the CPU, then runs and times both kernels and compares their output with
 * the reference. Returns EXIT_SUCCESS when both approaches match, otherwise
 * EXIT_FAILURE.
 */
int main() {
    const int Nrows = 64;
    const int Ncols = 2048;

    // NOTE(review): presumably forces CUDA context creation before the timed sections.
    gpuErrchk(cudaFree(0));

//    size_t DevQueue;
//    gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));
//    DevQueue *= 128;
//    gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));

    float *h_data       = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *h_results    = (float *)malloc(Nrows *         sizeof(float));
    float *h_results1   = (float *)malloc(Nrows *         sizeof(float));
    float *h_results2   = (float *)malloc(Nrows *         sizeof(float));
    if (h_data == NULL || h_results == NULL || h_results1 == NULL || h_results2 == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_data); free(h_results); free(h_results1); free(h_results2);
        return EXIT_FAILURE;
    }

    // Fill row i with the constant i and accumulate the CPU reference sum.
    for (int i=0; i<Nrows; i++) {
        h_results[i] = 0.f;
        for (int j=0; j<Ncols; j++) {
            h_data[i*Ncols+j] = i;
            h_results[i] = h_results[i] + h_data[i*Ncols+j];
        }
    }

    TimingGPU timerGPU;

    float *d_data;          gpuErrchk(cudaMalloc((void**)&d_data,     Nrows * Ncols * sizeof(float)));
    float *d_results1;      gpuErrchk(cudaMalloc((void**)&d_results1, Nrows         * sizeof(float)));
    float *d_results2;      gpuErrchk(cudaMalloc((void**)&d_results2, Nrows         * sizeof(float)));

    gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

    bool ok = true;

    /* Approach nr. 1: thrust::seq */
    timerGPU.StartCounter();
    test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

    // Exact comparison is intentional: every partial sum here is an integer
    // no larger than 63 * 2048, which a float represents exactly in any summation order.
    for (int i=0; i<Nrows && ok; i++) {
        if (h_results1[i] != h_results[i]) {
            printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
            ok = false;
        }
    }

    /* Approach nr. 2: thrust::device (uses its own result buffers) */
    if (ok) {
        timerGPU.StartCounter();
        test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results2, Nrows, Ncols);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());

        gpuErrchk(cudaMemcpy(h_results2, d_results2, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i=0; i<Nrows && ok; i++) {
            if (h_results2[i] != h_results[i]) {
                printf("Approach nr. 2; Error at i = %i; h_results2 = %f; h_results = %f", i, h_results2[i], h_results[i]);
                ok = false;
            }
        }
    }

    if (ok) printf("Test passed!\n");

    // Release device and host buffers on every exit path.
    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_results1));
    gpuErrchk(cudaFree(d_results2));
    free(h_data);
    free(h_results);
    free(h_results1);
    free(h_results2);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

上面的示例对矩阵的行进行缩减的方式与使用CUDA减少矩阵行的意义相同,但此操作与以上文章不同,即直接从用户编写的内核中调用CUDA Thrust原语。此外,以上示例还用于比较在执行两个执行策略(即thrust::seq和thrust::device)时相同操作的性能。下面,一些图表显示了性能差异。性能已在开普勒K20c和Maxwell GeForce GTX 850M上进行了评估。
				
			
			
			
				
				慕码人2483693
				我想对此问题提供更新的答案。从Thrust 1.8开始,CUDA Thrust原语可以与thrust::seq执行策略结合使用,以在单个CUDA线程中顺序运行(或在单个CPU线程中顺序运行)。下面,举一个例子。如果要在线程内并行执行,则可以考虑使用CUB,它提供了可从线程块内调用的归约例程,只要您的卡启用了动态并行性。下面是Thrust的例子

#include <stdio.h>
#include <stdlib.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

/********************/
/* CUDA ERROR CHECK */
/********************/

// Wraps a CUDA runtime call and reports the call site on failure.
// do { } while (0) keeps the macro safe inside unbraced if/else statements.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Prints the CUDA error string together with file and line when code is not
 * cudaSuccess, and terminates the process with that code unless abort is false.
 *
 * file is const because it receives __FILE__, a string literal.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/*
 * Sums the N floats starting at d_A with the sequential policy and prints the
 * result from the device. Every launched thread performs the full reduction
 * and prints, so main launches it with a single thread.
 */
__global__ void test(float *d_A, int N) {
    float sum = thrust::reduce(thrust::seq, d_A, d_A + N);
    printf("Device side result = %f\n", sum);
}

/*
 * Fills a host array with 0..N-1, prints the host-side sum, copies the data to
 * the device and runs the single-thread reduction kernel.
 */
int main() {
    const int N = 16;

    float *h_A = (float*)malloc(N * sizeof(float));
    if (h_A == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        return EXIT_FAILURE;
    }

    float sum = 0.f;
    for (int i=0; i<N; i++) {
        h_A[i] = i;
        sum = sum + h_A[i];
    }
    printf("Host side result = %f\n", sum);

    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));

    test<<<1,1>>>(d_A, N);
    // Surface launch-configuration errors, then wait for the kernel: without a
    // synchronizing call the device-side printf output may never be flushed.
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaFree(d_A));
    free(h_A);

    return 0;
}