Step 2: After collecting the city sub-sites, crawl the paginated company-listing URLs under every sub-site and save them to city_company_with_page.txt
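The listing below assumes each non-empty line of data/city_site.txt produced in step 1 holds three space-separated fields, roughly an id, a city name, and the sub-site URL; the sample lines here are hypothetical and only illustrate how the second and third fields are used:

1 beijing http://beijing.example.com
2 shanghai http://shanghai.example.com

For a line like these, the crawler requests http://beijing.example.com/company/?p=1, ?p=2, and so on, until the pager no longer shows a 下一页 ("next page") link.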
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	dataPath := "data/city_site.txt"
	content, err := ioutil.ReadFile(dataPath)
	if err != nil {
		panic(err)
	}
	// Read the whole file and split it into one line per city sub-site.
	str := string(content)
	citys := strings.Split(str, "\n")
	// Map each city name to its company-listing URL.
	cityUrls := map[string]string{}
	for _, city := range citys {
		cityUrl := strings.Split(city, " ")
		if len(cityUrl) < 3 {
			continue // skip blank or malformed lines
		}
		cityUrls[cityUrl[1]] = cityUrl[2] + "/company/"
	}
	fd, err := os.OpenFile("data/city_company_with_page.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		panic(err)
	}
	defer fd.Close()
	index := 0
	pagelasturl := ""
	for _, cityUrl := range cityUrls {
		pageurl := cityUrl + "?p="
		for {
			index++
			pagelasturl = pageurl + strconv.Itoa(index)
			doc, err := goquery.NewDocument(pagelasturl)
			if err != nil {
				fmt.Println("failed to fetch " + pagelasturl + ": " + err.Error())
				index = 0
				break
			}
			// The last pager link reads "下一页" ("next page") on every page except the final one.
			h := doc.Find(".page a").Last().Text()
			if h != "下一页" {
				index = 0
				fmt.Println("collected last page " + pagelasturl)
				fd.WriteString(pagelasturl + "\n") // save the final page as well
				break
			}
			if index == 1 {
				fmt.Println("saved locally " + cityUrl)
				fd.WriteString(cityUrl + "\n") // page 1 is the bare listing URL
			} else {
				fmt.Println("saved locally " + pagelasturl)
				fd.WriteString(pagelasturl + "\n") // save every intermediate page
			}
		}
	}
}
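goquery.NewDocument is deprecated in current goquery releases. A minimal sketch of the same page fetch using net/http plus goquery.NewDocumentFromReader is shown below; the fetchDoc helper name and the example URL are assumptions for illustration, not part of the original crawler.

package main

import (
	"fmt"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

// fetchDoc downloads one listing page and parses it with goquery,
// avoiding the deprecated goquery.NewDocument used in the listing above.
func fetchDoc(pageurl string) (*goquery.Document, error) {
	resp, err := http.Get(pageurl)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %d for %s", resp.StatusCode, pageurl)
	}
	return goquery.NewDocumentFromReader(resp.Body)
}

func main() {
	// Hypothetical listing page, only to show how fetchDoc would be called.
	doc, err := fetchDoc("http://beijing.example.com/company/?p=1")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	// Same "next page" check as in the crawler above.
	fmt.Println(doc.Find(".page a").Last().Text())
}

Swapping the doc, err := goquery.NewDocument(pagelasturl) line in the crawler for doc, err := fetchDoc(pagelasturl) keeps the rest of the loop unchanged.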