getURL() 函数从原始 URL 创建一个抓取 URL 的数组。getSubURL() 然后循环遍历该数组并抓取所有这些页面的 URL。目前,此代码可以很好地输出到控制台,但我不知道如何等待我的数据解析,以便我可以将所有收集的数据推送到单个数组。目前,当我尝试返回站点然后推送到数组时,它只推送最后一个值。我相信这是一种promise.all(map) 的情况,但我不知道如何正确编写而不出现错误。理想情况下,我完成的抓取可以在另一个函数中调用。如果可以的话请看一下
const cheerio = require('cheerio');
const axios = require('axios');
let URL = 'https://toscrape.com';
const getURLS = async () => {
try {
const res = await axios.get(URL);
const data = res.data;
const $ = cheerio.load(data);
const urlQueue = [];
$("a[href^='http']").each((i, elem) => {
const link = $(elem).attr('href');
if (urlQueue.indexOf(link) === -1) {
urlQueue.push(link);
}
});
return urlQueue;
} catch (err) {
console.log(`Error fetching and parsing data: `, err);
}
};
const getSubURLs = async () => {
let urls = await getURLS();
try {
//loop through each url in array
for (const url of urls) {
//fetch all html from the current url
const res = await axios.get(url);
const data = res.data;
const $ = cheerio.load(data);
//create object and push that url into that object
let sites = {};
sites.url = url;
let links = [];
//scrape all links and save in links array
$("a[href^='/']").each((i, elem) => {
const link = $(elem).attr('href');
if (links.indexOf(link) === -1) {
links.push(link);
}
//save scraped data in object
sites.links = links;
});
// returns list of {url:'url', links:[link1,link2,link3]}
console.log(sites);
}
} catch (err) {
console.log(`Error fetching and parsing data: `, err);
}
};
Cats萌萌
相关分类