Then the simple case of looping over the RSS feeds:

```python
import requests
import feedparser
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
import pandas as pd

# get some RSS feeds....
resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
soup = BeautifulSoup(resp.content.decode(), "html.parser")
rawfeeds = soup.find_all("h2")
feeds = {}
for rf in rawfeeds:
    a = rf.find("a")
    if a is not None:
        feeds[a.string.replace("RSS Feed", "").strip()] = urllib.parse.parse_qs(a["href"])["q"][0].replace("site:", "")

# now source them all into a dataframe
df = pd.DataFrame()
for k, url in feeds.items():
    try:
        df = pd.concat([df, pd.json_normalize(feedparser.parse(url)["entries"]).assign(Source=k)])
    except (Exception, xml.sax.SAXParseException):
        print(f"invalid xml: {url}")
```

To make it re-entrant:

- use the etag and modified capabilities of feedparser
- persist the dataframes so that when it runs again it picks up where it left off
- use threading so the fetching is not purely sequential; with threads you obviously need to think about synchronising your save points (a sketch follows the final code block)

Then you just run it from a scheduler to periodically pick up new items from the RSS feeds and fetch the associated articles.

```python
import feedparser, requests, newspaper
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
from pathlib import Path
import pandas as pd

# working directory for the pickled state
if not Path.cwd().joinpath("news").is_dir():
    Path.cwd().joinpath("news").mkdir()
p = Path.cwd().joinpath("news")

# get some RSS feeds....
if p.joinpath("rss.pickle").is_file():
    dfrss = pd.read_pickle(p.joinpath("rss.pickle"))
else:
    resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
    soup = BeautifulSoup(resp.content.decode(), "html.parser")
    rawfeeds = soup.find_all("h2")
    feeds = []
    for rf in rawfeeds:
        a = rf.find("a")
        if a is not None:
            feeds.append({
                "name": a.string.replace("RSS Feed", "").strip(),
                "url": urllib.parse.parse_qs(a["href"])["q"][0].replace("site:", ""),
                "etag": "", "status": 0, "debug_msg": "", "modified": "",
            })
    dfrss = pd.DataFrame(feeds).set_index("url")

if p.joinpath("rssdata.pickle").is_file():
    df = pd.read_pickle(p.joinpath("rssdata.pickle"))
else:
    df = pd.DataFrame({"id": [], "link": []})

# now source them all into a dataframe. head() is there for testing purposes
for r in dfrss.head(5).itertuples():
    try:
        fp = feedparser.parse(r.Index, etag=r.etag, modified=r.modified)
        if fp.bozo == 1:
            raise Exception(fp.bozo_exception)
    except Exception as e:
        fp = feedparser.FeedParserDict(**{"etag": r.etag, "entries": [], "status": 500, "debug_message": str(e)})
    # keep meta information on what has already been sourced from an RSS feed
    if "etag" in fp.keys():
        dfrss.loc[r.Index, "etag"] = fp.etag
    dfrss.loc[r.Index, "status"] = fp.status
    if "debug_message" in fp.keys():
        dfrss.loc[r.Index, "debug_msg"] = fp.debug_message
    # 304 means up to date... getting 301 and entries, hence test len...
    if len(fp["entries"]) > 0:
        dft = pd.json_normalize(fp["entries"]).assign(Source=r.Index)
        # don't capture items that have already been captured...
        df = pd.concat([df, dft[~dft["link"].isin(df["link"])]])

# save to make re-entrant...
dfrss.to_pickle(p.joinpath("rss.pickle"))
df.to_pickle(p.joinpath("rssdata.pickle"))

# finally get the text...
if p.joinpath("text.pickle").is_file():
    dftext = pd.read_pickle(p.joinpath("text.pickle"))
else:
    dftext = pd.DataFrame({"link": [], "text": []})

# head() is there for testing purposes
for r in df[~df["link"].isin(dftext["link"])].head(5).itertuples():
    a = newspaper.Article(r.link)
    a.download()
    a.parse()
    dftext = pd.concat([dftext, pd.DataFrame([{"link": r.link, "text": a.text}])], ignore_index=True)

dftext.to_pickle(p.joinpath("text.pickle"))
```

Then do your analysis on the retrieved data.
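The threaded fetch mentioned in the bullet list above isn't shown in the code. A minimal sketch of one way it could look, using the standard library only: `fetch_feed`, `merge_entries`, the worker count and the example feed URL are my own assumptions, not part of the original answer; the lock stands in for the "synchronised save point".

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import feedparser
import pandas as pd

save_lock = Lock()                       # guards the shared dataframe / pickle save point
df = pd.DataFrame({"id": [], "link": []})

def fetch_feed(url, etag="", modified=""):
    """Fetch one feed; runs in a worker thread."""
    return url, feedparser.parse(url, etag=etag, modified=modified)

def merge_entries(url, fp):
    """Merge new entries into the shared dataframe under the lock."""
    global df
    if len(fp.get("entries", [])) == 0:
        return
    dft = pd.json_normalize(fp["entries"]).assign(Source=url)
    with save_lock:
        df = pd.concat([df, dft[~dft["link"].isin(df["link"])]])

urls = ["http://feeds.bbci.co.uk/news/world/rss.xml"]   # example feed URL (assumption)
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(fetch_feed, u) for u in urls]
    for fut in as_completed(futures):
        url, fp = fut.result()
        merge_entries(url, fp)

with save_lock:
    df.to_pickle("rssdata.pickle")
```

Only the network-bound `feedparser.parse` calls run concurrently; the dataframe merge and the pickle write happen under the lock, so a crash at any point still leaves a consistent save point for the next run.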
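The answer only says "run it in a scheduler" without showing one. A simple, standard-library-only sketch is a loop with a sleep; `run_once` is a hypothetical wrapper around the re-entrant script above, and the 15-minute interval is arbitrary.

```python
import time
import traceback

POLL_SECONDS = 15 * 60   # arbitrary interval; tune it to how often the feeds update

def run_once():
    # placeholder: call the re-entrant fetch above, e.g. after moving it into a
    # function, or run the script file with runpy.run_path("fetch_rss.py")
    ...

while True:
    try:
        run_once()
    except Exception:
        # keep the loop alive; the pickled state makes the next iteration re-entrant
        traceback.print_exc()
    time.sleep(POLL_SECONDS)
```

Equally, cron or Windows Task Scheduler can invoke the script directly, since the pickled state already makes it safe to re-run.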
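The analysis itself is left open. As a trivial starting point, assuming the `df` and `dftext` dataframes pickled by the script above (entries keyed by `Source` and `link`, article text keyed by `link`):

```python
import pandas as pd

df = pd.read_pickle("news/rssdata.pickle")    # feed entries (link, Source, ...)
dftext = pd.read_pickle("news/text.pickle")   # article text (link, text)

# how many distinct articles were captured per feed
print(df.groupby("Source")["link"].nunique().sort_values(ascending=False))

# join the text back onto the entries and look at article length per feed
merged = df.merge(dftext, on="link", how="inner")
merged["n_words"] = merged["text"].str.split().str.len()
print(merged[["Source", "n_words"]].groupby("Source").describe())
```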