猿问

PyQt5加载网页内容时返回None值

我正在尝试获取网页中某个部分的内容。该部分中的数据由 JavaScript 动态加载。我在这里找到了一些代码并对其进行了修改,但是运行脚本时,得到的结果是 None。


这是代码


import bs4 as bs

import sys

import urllib.request

from PyQt5.QtWebEngineWidgets import QWebEnginePage

from PyQt5.QtWidgets import QApplication

from PyQt5.QtCore import QUrl

from pprint import pprint


class Page(QWebEnginePage):
    """Load *url* in a headless web engine page and capture its HTML.

    Constructing the object blocks: it starts the Qt event loop and only
    returns once the page's HTML has been delivered to ``Callable``.
    Afterwards the markup is available as ``self.html``.

    Note that ``loadFinished`` fires when the initial load completes;
    elements that the page's own JavaScript inserts later may not yet be
    present in the captured HTML.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until HTML is captured.

        Args:
            url: Address of the page to load, as a string.
        """
        # Qt allows only one QApplication per process; reuse an existing
        # instance so that a second Page can be created safely.
        self.app = QApplication.instance() or QApplication(sys.argv)
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page HTML once the load has finished."""
        # toHtml() is asynchronous and returns None; the markup arrives
        # through the callback, so its return value must not be stored.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and stop the event loop.

        Args:
            html_str: The page's HTML as passed in by ``toHtml``.
        """
        self.html = html_str
        self.app.quit()


def main():
    """Fetch the Fix Central page and pretty-print its results table."""
    target_url = 'https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all'
    rendered = Page(target_url)
    document = bs.BeautifulSoup(rendered.html, 'html.parser')
    # find() yields None when no matching <table> is in the captured HTML.
    table = document.find('table', {'id': 'DataTables_Table_0'})
    pprint(table)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

这是输出


Load finished

None


慕的地8271018
浏览 146回答 1
1回答

蝴蝶不菲

loadFinished 信号仅指示页面已加载,但之后可以创建更多 DOM 元素,这就是 id 为“DataTables_Table_0”的元素的情况,该元素是在页面加载后立即创建的。一个可能的解决方案是注入一个脚本来检查该元素是否存在,并发出通知以便获取 HTML。

import sys
from functools import cached_property

from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel

from pprint import pprint
import bs4 as bs


def get_webchannel_source():
    file = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
    if not file.open(QtCore.QIODevice.ReadOnly):
        return ""
    content = file.readAll()
    file.close()
    return content.data().decode()


class Manager(QtCore.QObject):
    def __init__(self, *, offline=True, visible=False, parent=None):
        super().__init__(parent)
        self._html = ""
        self._is_finished = False
        self.app
        self._profile = (
            QtWebEngineWidgets.QWebEngineProfile()
            if offline
            else QtWebEngineWidgets.QWebEngineProfile.defaultProfile()
        )
        self.view.resize(640, 480)
        if not visible:
            self.view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
        self.view.show()
        self.webchannel.registerObject("manager", self)
        self.view.page().setWebChannel(self.webchannel)

    @cached_property
    def app(self):
        return QtWidgets.QApplication(sys.argv)

    @property
    def profile(self):
        return self._profile

    @cached_property
    def view(self):
        view = QtWebEngineWidgets.QWebEngineView()
        page = QtWebEngineWidgets.QWebEnginePage(self.profile, self)
        view.setPage(page)
        return view

    @cached_property
    def webchannel(self):
        return QtWebChannel.QWebChannel(self)

    @property
    def html(self):
        return self._html

    def set_script(self, script):
        qscript = QtWebEngineWidgets.QWebEngineScript()
        qscript.setName("qscript")
        qscript.setSourceCode(get_webchannel_source() + "\n" + script)
        qscript.setInjectionPoint(QtWebEngineWidgets.QWebEngineScript.DocumentReady)
        qscript.setWorldId(QtWebEngineWidgets.QWebEngineScript.MainWorld)
        self.profile.scripts().insert(qscript)

    def start(self, url):
        self.view.load(QtCore.QUrl.fromUserInput(url))
        self.app.exec_()

    @QtCore.pyqtSlot()
    def save_html(self):
        if not self._is_finished:
            self.view.page().toHtml(self.html_callable)
            self._is_finished = True

    def html_callable(self, html):
        self._html = html
        self.app.quit()


JS = """
var manager = null;

function find_element() {
  var e = document.getElementById('DataTables_Table_0');
  console.log("try verify", e, manager);
  if (e != null && manager != null) {
    console.log(e)
    manager.save_html()
  } else {
    setTimeout(find_element, 100);
  }
}

(function wait_qt() {
  if (typeof qt != 'undefined') {
    console.log("Qt loaded");
    new QWebChannel(qt.webChannelTransport, function (channel) {
      manager = channel.objects.manager;
      find_element();
    });
  } else {
    setTimeout(wait_qt, 100);
  }
})();
"""


def main():
    manager = Manager()
    manager.set_script(JS)
    manager.start(
        "https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"
    )
    soup = bs.BeautifulSoup(manager.html, "html.parser")
    section = soup.find("table", {"id": "DataTables_Table_0"})
    pprint(section)


if __name__ == "__main__":
    main()
随时随地看视频慕课网APP

相关分类

Python
我要回答