汪汪一只猫
3- 为了提高性能,您可以在不使用 lock sync.RWMutex 的情况下按 CPU 内核划分任务:使用通道和 runtime.NumCPU() 进行优化,速度可提升 30 倍以上。这样在 2 核上需要 2ms,在 8 核上需要 993µs,而您的示例代码在 2 核上需要 61ms,在 8 核上需要 40ms。请参阅此工作示例代码和输出:

package main

import (
	"fmt"
	"math"
	"runtime"
	"time"
)

func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
	}
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}
	fmt.Println(time.Now().Sub(startTime))
	fmt.Println(deltax * integral)
}

func f(start, stop, a, deltax float64, ch chan float64) {
	result := 0.0
	for i := start; i < stop; i++ {
		result += math.Sqrt(a + deltax*(i+0.5))
	}
	ch <- result
}

2 核输出:
nCPU = 2
2.0001ms
0.6666666685900485

8 核输出:
nCPU = 8
993µs
0.6666666685900456

您的示例代码,2 核输出:
0.6666666685900424
61.0035ms

您的示例代码,8 核输出:
0.6666666685900415
40.9964ms

2- 为了获得良好的基准统计数据,请使用大量样本(较大的 n):正如您在此处看到的,当 n := 10000000.0 时,在 2 核 CPU 上使用 2 个内核需要 110ms,而在同一个 CPU 上只使用 1 个内核则需要 215ms。

使用 n := 10000000.0 和单个 goroutine,请参阅此工作示例代码:

package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n
	result := 0.0
	for i := 0.0; i < n; i++ {
		result += math.Sqrt(a + deltax*(i+0.5))
	}
	fmt.Println(time.Now().Sub(now))
	fmt.Println(deltax * result)
}

输出:
215.0123ms
0.6666666666685884

使用 n := 10000000.0 和 2 个 goroutine,请参阅此工作示例代码:

package main

import (
	"fmt"
	"math"
	"runtime"
	"time"
)

func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
	}
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}
	fmt.Println(time.Now().Sub(startTime))
	fmt.Println(deltax * integral)
}

func f(start, stop, a, deltax float64, ch chan float64) {
	result := 0.0
	for i := start; i < stop; i++ {
		result += math.Sqrt(a + deltax*(i+0.5))
	}
	ch <- result
}

输出:
nCPU = 2
110.0063ms
0.6666666666686073

1- Goroutines 的数量有一个最佳点,超过这一点之后,再增加 Goroutines 的数量并不会减少程序执行时间。在 2 核 CPU 上,使用以下代码,结果是:

nCPU: 1, 2, 4, 8, 16
Time: 2.1601236s, 1.1220642s, 1.1060633s, 1.1140637s, 1.1380651s

正如你所看到的,从 nCPU=1 到 nCPU=2,耗时的减少量已经足够大,但在此之后减少得并不多。所以在 2 核 CPU 上,nCPU=2 是此示例代码的最佳点,在这里使用 nCPU := runtime.NumCPU() 就足够了。

package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	nCPU := 2 //2.1601236s@1 1.1220642s@2 1.1060633s@4 1.1140637s@8 1.1380651s@16
	fmt.Println("nCPU =", nCPU)
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000000.0
	deltax := (b - a) / n
	stepPerCPU := n / float64(nCPU)
	for start := 0.0; start < n; {
		stop := start + stepPerCPU
		go f(start, stop, a, deltax, ch)
		start = stop
	}
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}
	fmt.Println(time.Now().Sub(startTime))
	fmt.Println(deltax * integral)
}

func f(start, stop, a, deltax float64, ch chan float64) {
	result := 0.0
	for i := start; i < stop; i++ {
		result += math.Sqrt(a + deltax*(i+0.5))
	}
	ch <- result
}
拉风的咖菲猫
除非 goroutine 中的活动花费的时间比切换上下文、执行任务和使用互斥锁更新值所需的时间多得多,否则串行执行会更快。看一个稍微修改过的版本。我所做的只是在 f() 函数中添加 1 微秒的延迟。

package main

import (
	"fmt"
	"math"
	"sync"
	"time"
)

type Result struct {
	result float64
	lock   sync.RWMutex
}

var wg sync.WaitGroup
var result Result

func main() {
	fmt.Println("concurrent")
	concurrent()
	result.result = 0
	fmt.Println("serial")
	serial()
}

func concurrent() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	wg.Add(int(n))
	for i := 0.0; i < n; i++ {
		go f(a, deltax, i, true)
	}
	wg.Wait()
	fmt.Println(deltax * result.result)
	fmt.Println(time.Now().Sub(now))
}

func serial() {
	now := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n
	for i := 0.0; i < n; i++ {
		f(a, deltax, i, false)
	}
	fmt.Println(deltax * result.result)
	fmt.Println(time.Now().Sub(now))
}

func f(a, deltax, i float64, concurrent bool) {
	time.Sleep(1 * time.Microsecond)
	fx := math.Sqrt(a + deltax*(i+0.5))
	if concurrent {
		result.lock.Lock()
		result.result += fx
		result.lock.Unlock()
		wg.Done()
	} else {
		result.result += fx
	}
}

加上延迟,结果如下(并发版本快很多):

concurrent
0.6666666685900424
624.914165ms
serial
0.6666666685900422
5.609195767s

不加延迟时:

concurrent
0.6666666685900428
50.771275ms
serial
0.6666666685900422
749.166µs

正如你所看到的,完成一项任务所花费的时间越长,就越值得在可能的情况下并发执行它。