在这个例子中，爬虫多线程是如何工作的？

在这个例子中，爬虫多线程是如何工作的？

我正在尝试解决有关使用缓存并行获取 URL 以避免重复的任务。我找到了正确的解决方案并且可以理解它。我看到正确的答案包含通道，并且 gorutine 通过 chan 将 URL 推送到缓存中。但为什么我的简单代码不能正常工作？我不知道哪里出错了。

package main

import (

"fmt"

"sync"

)

type Fetcher interface {

// Fetch returns the body of URL and

// a slice of URLs found on that page.

Fetch(url string) (body string, urls []string, err error)

}

var cache = struct {

cache map[string]int

mux sync.Mutex

}{cache: make(map[string]int)}

// Crawl uses fetcher to recursively crawl

// pages starting with url, to a maximum of depth.

func Crawl(url string, depth int, fetcher Fetcher) {

// TODO: Fetch URLs in parallel.

// TODO: Don't fetch the same URL twice.

// This implementation doesn't do either:

if depth <= 0 {

return

}

cache.mux.Lock()

cache.cache[url] = 1 //put url in cache

cache.mux.Unlock()

body, urls, err := fetcher.Fetch(url)

if err != nil {

fmt.Println(err)

return

}

fmt.Printf("found: %s %q\n", url, body)

for _, u := range urls {

cache.mux.Lock()

if _, ok := cache.cache[u]; !ok { //check if url already in cache

cache.mux.Unlock()

go Crawl(u, depth-1, fetcher)

} else {

cache.mux.Unlock()

}

}

return

}

func main() {

Crawl("http://golang.org/", 4, fetcher)

}

// fakeFetcher is Fetcher that returns canned results.

type fakeFetcher map[string]*fakeResult

type fakeResult struct {

body string

urls []string

}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {

if res, ok := f[url]; ok {

return res.body, res.urls, nil

}

return "", nil, fmt.Errorf("not found: %s", url)

}

// fetcher is a populated fakeFetcher.

var fetcher = fakeFetcher{

"http://golang.org/": &fakeResult{

"The Go Programming Language",

[]string{

"http://golang.org/pkg/",

"http://golang.org/cmd/",

},

},

长风秋雁

浏览 221回答 1

1回答

慕尼黑5688855

在所有调用完成之前，您main()不会阻塞，因此退出。go Crawl()您可以使用 async.WaitGroup或通道来同步程序，以完成所有 goroutine。u我还发现goroutine 中使用的变量存在问题；当 goroutine 执行时，u范围循环可能会或可能不会更改的值。结束Crawl可能会像这样解决这两个问题；wg := sync.WaitGroup{}fmt.Printf("found: %s %q\n", url, body)for _, u := range urls {    cache.mux.Lock()    if _, ok := cache.cache[u]; !ok { //check if url already in cache        cache.mux.Unlock()        wg.Add(1)        go func(url string) {            Crawl(url, depth-1, fetcher)            wg.Done()        }(u)    } else {        cache.mux.Unlock()    }}// Block until all goroutines are donewg.Wait()return

0

0

随时随地看视频慕课网APP

相关分类

Go