猿问

在OpenMP中对数组进行归约

我正在尝试并行化以下程序,但不知道如何对数组进行归约。我知道直接这样做是不可能的,但是有替代方法吗?谢谢。(我在m上加了reduction子句,这是错误的,但希望有人能建议正确的做法。)


#include <iostream>

#include <stdio.h>

#include <time.h>

#include <omp.h>

using namespace std;


// Question exhibit: for each n in 0..9, adds A[0..n] into S[n] inside an
// OpenMP "parallel for", then prints the difference between two time() calls.
// NOTE(review): the author states this code is knowingly wrong; the comments
// below point out what is wrong without changing any of it.
int main ()

{

  int A [] = {84, 30, 95, 94, 36, 73, 52, 23, 2, 13};

  // NOTE(review): S has no initializer, so the `+=` in the inner loop
  // accumulates onto indeterminate values.
  int S [10];


  time_t start_time = time(NULL);

  // NOTE(review): `m` is not declared at this point (it is only declared by
  // the inner for-loop below), and the same name is listed in both the
  // private and the reduction clause. `m` is a loop counter, not an
  // accumulator; the values being accumulated live in the array S.
  #pragma omp parallel for private(m) reduction(+:m)

  for (int n=0 ; n<10 ; ++n ){

    // Each outer iteration n writes only to S[n]; A is only read.
    for (int m=0; m<=n; ++m){

      S[n] += A[m];

    }

  }

  time_t end_time = time(NULL);

  // Prints only the time_t difference; the contents of S are never output.
  // NOTE(review): time() presumably has whole-second resolution here, so a
  // loop this small would likely print 0 -- confirm on the target platform.
  cout << end_time-start_time;


  return 0;

}


拉莫斯之舞
浏览 736回答 3
3回答

慕桂英4014372

是的,可以使用OpenMP进行数组归约。在Fortran中,它甚至有专门的语法结构。在C/C++中,您必须自己实现。下面介绍两种方法。

第一种方法为每个线程创建S的私有副本,并行填充这些副本,然后在临界区(critical section)中将它们合并到S中(请参见下面的代码)。第二种方法创建一个大小为10*nthreads的数组,并行填充此数组,然后在不使用临界区的情况下将其合并到S中。第二种方法要复杂得多,如果不小心,可能会出现缓存问题,尤其是在多插槽(multi-socket)系统上。有关更多详细信息,请参见《使用OpenMP并行填充直方图(数组归约)而不使用临界区》。

第一种方法

int A [] = {84, 30, 95, 94, 36, 73, 52, 23, 2, 13};
int S [10] = {0};
#pragma omp parallel
{
    int S_private[10] = {0};
    #pragma omp for
    for (int n=0 ; n<10 ; ++n ) {
        for (int m=0; m<=n; ++m){
            S_private[n] += A[m];
        }
    }
    #pragma omp critical
    {
        for(int n=0; n<10; ++n) {
            S[n] += S_private[n];
        }
    }
}

第二种方法

int A [] = {84, 30, 95, 94, 36, 73, 52, 23, 2, 13};
int S [10] = {0};
int *S_private;
#pragma omp parallel
{
    const int nthreads = omp_get_num_threads();
    const int ithread = omp_get_thread_num();

    #pragma omp single
    {
        S_private = new int[10*nthreads];
        for(int i=0; i<(10*nthreads); i++) S_private[i] = 0;
    }
    #pragma omp for
    for (int n=0 ; n<10 ; ++n )
    {
        for (int m=0; m<=n; ++m){
            S_private[ithread*10+n] += A[m];
        }
    }
    #pragma omp for
    for(int i=0; i<10; i++) {
        for(int t=0; t<nthreads; t++) {
            S[i] += S_private[10*t + i];
        }
    }
}
delete[] S_private;

慕莱坞森

关于Zboson的答案,我有两点评论:

1. 方法1当然是正确的,但是归约循环实际上是串行运行的,这是由于#pragma omp critical所致。当然,该指令必不可少,因为部分结果数组对于每个线程都是局部的,相应的归约必须由拥有该数组的线程来完成。
2. 方法2:初始化循环可以移到single区域的外面,因此可以并行化。

以下程序使用OpenMP v4.0的用户自定义归约(user-defined reduction)功能实现数组归约:

/* Compile with:
     gcc -Wall -fopenmp -o ar ar.c
   Run with:
     OMP_DISPLAY_ENV=TRUE OMP_NUM_THREADS=10 OMP_NESTED=TRUE ./ar
*/
#include <stdio.h>
#include <omp.h>
struct m10x1 {int v[10];};
int A [] =       {84, 30, 95, 94, 36, 73, 52, 23, 2, 13};
struct m10x1 S = {{ 0,  0,  0,  0,  0,  0,  0,  0, 0,  0}};
int n,m=0;

void print_m10x1(struct m10x1 x){
  int i;
  for(i=0;i<10;i++) printf("%d ",x.v[i]);
  printf("\n");
}

struct m10x1 add_m10x1(struct m10x1 x,struct m10x1 y){
  struct m10x1 r ={{ 0,  0,  0,  0,  0,  0,  0,  0, 0,  0}};
  int i;
  for (i=0;i<10;i++) r.v[i]=x.v[i]+y.v[i];
  return r;
}

#pragma omp declare reduction(m10x1Add: struct m10x1: \
omp_out=add_m10x1(omp_out, omp_in)) initializer( \
omp_priv={{ 0,  0,  0,  0,  0,  0,  0,  0, 0,  0}} )

int main ()
{
  #pragma omp parallel for reduction(m10x1Add: S)
  for ( n=0 ; n<10 ; ++n )
    {
      for (m=0; m<=n; ++m){
        S.v[n] += A[m];
      }
    }
  print_m10x1(S);
}

这完全照搬了《OpenMP 4.0 features》第97页的复数归约示例。

尽管并行版本可以正常工作,但可能存在性能问题,我尚未调查:

1. add_m10x1的输入和输出均按值传递。
2. add_m10x1中的循环是串行运行的。

上述“性能问题”是我自己造成的,要避免引入它们其实非常简单:

1. add_m10x1的参数应通过引用传递(在C中通过指针,在C++中通过引用)。
2. add_m10x1中的计算应就地(in place)完成。
3. 应该将add_m10x1声明为void,并删除return语句。结果通过第一个参数返回。
4. 应该相应地修改declare reduction编译指示,combiner应该只是一个函数调用而不是赋值(v4.0规范第181页第9、10行)。
5. add_m10x1中的for循环可以通过omp parallel for编译指示并行化。
6. 应启用嵌套并行(例如,通过OMP_NESTED=TRUE)。

然后,代码的修改部分为:

void add_m10x1(struct m10x1 * x,struct m10x1 * y){
  int i;
  #pragma omp parallel for
  for (i=0;i<10;i++) x->v[i] += y->v[i];
}

#pragma omp declare reduction(m10x1Add: struct m10x1: \
add_m10x1(&omp_out, &omp_in)) initializer( \
omp_priv={{ 0,  0,  0,  0,  0,  0,  0,  0, 0,  0}} )
随时随地看视频慕课网APP
我要回答