第二步:采集分站后,再采集所有分站下的分页列表地址,保存到 city_company_with_page.txt
// Command main is the second crawl step: it reads the city sub-site list,
// walks the paginated company listing of every sub-site and appends each
// listing-page URL to data/city_company_with_page.txt.
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

const (
	// citySitePath is the input file; each usable line has at least three
	// space-separated fields, of which field 1 is used as the city key and
	// field 2 as the sub-site base URL.
	citySitePath = "data/city_site.txt"
	// pageListPath is the output file; URLs are appended, one per line.
	pageListPath = "data/city_company_with_page.txt"
	// nextPageLabel is the text of the last pager link while more pages exist.
	nextPageLabel = "下一页"
)

// main runs the crawl and exits with status 1 if a fatal error occurs.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run loads the city list, opens the output file in append mode and collects
// the listing pages of every city. It returns the first read, open, write or
// close error; a failed page fetch only skips the affected city.
func run() (err error) {
	content, err := ioutil.ReadFile(citySitePath)
	if err != nil {
		return fmt.Errorf("reading %s: %v", citySitePath, err)
	}

	out, err := os.OpenFile(pageListPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		return fmt.Errorf("opening %s: %v", pageListPath, err)
	}
	defer func() {
		// Surface a close error only if nothing else failed first.
		if cerr := out.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("closing %s: %v", pageListPath, cerr)
		}
	}()

	for _, cityURL := range parseCityURLs(string(content)) {
		if err := collectCityPages(out, cityURL); err != nil {
			return err
		}
	}
	return nil
}

// parseCityURLs extracts the company-listing base URL ("<site>/company/") of
// every city in content. Lines with fewer than three space-separated fields
// (for example the empty string after a trailing newline) are skipped instead
// of causing an index-out-of-range panic. When a city key repeats, the last
// URL wins, matching the previous map-based behavior; the result is ordered
// by first appearance so the crawl order is deterministic.
func parseCityURLs(content string) []string {
	urlByCity := map[string]string{}
	var order []string

	for _, line := range strings.Split(content, "\n") {
		// Drop a trailing CR so CRLF files do not leak "\r" into the URL.
		line = strings.TrimRight(line, "\r")
		fields := strings.Split(line, " ")
		if len(fields) < 3 {
			if strings.TrimSpace(line) != "" {
				fmt.Fprintf(os.Stderr, "skipping malformed line %q\n", line)
			}
			continue
		}
		city := fields[1]
		if _, seen := urlByCity[city]; !seen {
			order = append(order, city)
		}
		urlByCity[city] = fields[2] + "/company/"
	}

	urls := make([]string, 0, len(order))
	for _, city := range order {
		urls = append(urls, urlByCity[city])
	}
	return urls
}

// collectCityPages requests cityURL+"?p=N" for N = 1, 2, ... and appends one
// line per page to out until a page whose last ".page a" link is not the
// "next page" link is reached. The first page is recorded as the bare cityURL
// unless it is also the last page, in which case the "?p=1" form is written.
// A fetch error is reported on stderr and ends the crawl of this city only;
// a write error is returned to the caller.
func collectCityPages(out *os.File, cityURL string) error {
	for page := 1; ; page++ {
		pageURL := cityURL + "?p=" + strconv.Itoa(page)

		doc, err := goquery.NewDocument(pageURL)
		if err != nil {
			// doc is nil here; using it would panic, so give up on this city.
			fmt.Fprintf(os.Stderr, "fetching %s: %v\n", pageURL, err)
			return nil
		}

		if doc.Find(".page a").Last().Text() != nextPageLabel {
			fmt.Println("采集最后一页" + pageURL)
			if _, err := out.WriteString(pageURL + "\n"); err != nil {
				return fmt.Errorf("writing %s: %v", pageListPath, err)
			}
			return nil
		}

		entry := pageURL
		if page == 1 {
			entry = cityURL
		}
		fmt.Println("采集到本地" + entry)
		if _, err := out.WriteString(entry + "\n"); err != nil {
			return fmt.Errorf("writing %s: %v", pageListPath, err)
		}
	}
}