HUH函数
# Other answers help explain how to maintain such a session. In addition, here
# is a class that keeps the session alive across separate runs of a script by
# means of a cache file. A real "login" is therefore only performed when it is
# needed (the cached session timed out, or no session exists in the cache).
# It also supports proxy settings on the subsequent "get" or "post" calls.
# Tested with Python 3. Use it as a basis for your own code.
# The following snippet is released under GPL v3.

import pickle
import datetime
import os
from urllib.parse import urlparse

import requests


class MyLoginSession:
    """Handle a login session and persist it between script executions.

    The session object is pickled to a cache file named after the host of the
    login URL, and restored from that file on later runs as long as the file
    is younger than the configured maximum session time. The proxy settings
    given at construction are passed to every request made by this class.
    """

    def __init__(self,
                 loginUrl,
                 loginData,
                 loginTestUrl,
                 loginTestString,
                 sessionFileAppendix='_session.dat',
                 maxSessionTimeSeconds=30 * 60,
                 proxies=None,
                 userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
                 debug=True,
                 forceLogin=False,
                 **kwargs):
        """Store the login parameters and immediately log in.

        Args:
            loginUrl: URL the login data is posted to. Its host part is also
                used as the prefix of the cache file name.
            loginData: Dictionary (id : value) sent as POST data on login.
            loginTestUrl: URL fetched after login to verify the session.
            loginTestString: Text searched for (case-insensitively) in the
                body of the 'loginTestUrl' response to confirm the login.
            sessionFileAppendix: Suffix appended to the host name to build the
                cache file name.
            maxSessionTimeSeconds: Maximum age of the cache file, in seconds,
                for the cached session to be reused.
            proxies: Proxy mapping of the form
                {'https': 'https://user:pass@server:port', 'http': ...}.
            userAgent: Value of the 'user-agent' header for new sessions.
            debug: If true, progress messages are printed.
            forceLogin: If true, the cache file is ignored and a fresh login
                is performed.
            **kwargs: Extra keyword arguments forwarded to the login POST
                request (for example json=...).

        Raises:
            Exception: If the login test string is not found (see login()).
        """
        urlData = urlparse(loginUrl)

        self.proxies = proxies
        self.loginData = loginData
        self.loginUrl = loginUrl
        self.loginTestUrl = loginTestUrl
        self.maxSessionTime = maxSessionTimeSeconds
        self.sessionFile = urlData.netloc + sessionFileAppendix
        self.userAgent = userAgent
        self.loginTestString = loginTestString
        self.debug = debug

        self.login(forceLogin, **kwargs)

    def modification_date(self, filename):
        """Return the last modification time of 'filename' as a datetime.

        Raises:
            OSError: If the file does not exist or cannot be accessed.
        """
        return datetime.datetime.fromtimestamp(os.path.getmtime(filename))

    def _loadSessionFromCache(self):
        """Try to restore self.session from the cache file.

        Returns:
            True if a sufficiently recent session was loaded, False if the
            cache file is missing, too old, or cannot be unpickled.
        """
        try:
            modified = self.modification_date(self.sessionFile)
        except OSError:
            # No cache file yet (or it is inaccessible): a login is needed.
            return False

        # total_seconds() is required here: timedelta.seconds is only the
        # seconds *component* and wraps around every 24 hours, which would
        # make a cache file that is days old look fresh.
        age = (datetime.datetime.now() - modified).total_seconds()
        if age >= self.maxSessionTime:
            return False

        try:
            with open(self.sessionFile, "rb") as f:
                # NOTE(security): unpickling can execute arbitrary code. The
                # cache file must only be writable by trusted users.
                self.session = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            # Truncated or corrupt cache: fall back to a proper login.
            return False

        if self.debug:
            print("loaded session from cache (last access %ds ago) " % age)
        return True

    def login(self, forceLogin=False, **kwargs):
        """Log in, reusing the cached session when possible.

        The cached session is used when the cache file exists, is younger
        than the maximum session time and 'forceLogin' is false. Otherwise a
        new session is created and the login data is posted. In both cases
        the login is then verified against the test URL. A newly created
        session is written to the cache file only after that check passed,
        so a failed login is never cached.

        Args:
            forceLogin: If true, ignore the cache file.
            **kwargs: Extra keyword arguments for the login POST request.

        Raises:
            Exception: If the login test string is not found in the response
                of the login test URL.
        """
        if self.debug:
            print('loading or generating session...')

        wasReadFromCache = False
        if not forceLogin:
            wasReadFromCache = self._loadSessionFromCache()

        if not wasReadFromCache:
            self.session = requests.Session()
            self.session.headers.update({'user-agent': self.userAgent})
            self.session.post(self.loginUrl, data=self.loginData,
                              proxies=self.proxies, **kwargs)
            if self.debug:
                print('created new session with login')

        # Test the login; go through the same proxies as every other request.
        res = self.session.get(self.loginTestUrl, proxies=self.proxies)
        if self.loginTestString.lower() not in res.text.lower():
            raise Exception("could not log into provided site '%s'"
                            " (did not find successful login string)"
                            % self.loginUrl)

        if not wasReadFromCache:
            self.saveSessionToCache()

    def saveSessionToCache(self):
        """Pickle the current session to the cache file.

        Rewriting the file also refreshes its modification time, which is
        what the session timeout is measured against.
        """
        with open(self.sessionFile, "wb") as f:
            pickle.dump(self.session, f)
            if self.debug:
                print('updated session cache-file %s' % self.sessionFile)

    def retrieveContent(self, url, method="get", postData=None, **kwargs):
        """Request 'url' within the session and return the response.

        Args:
            url: URL to request.
            method: 'get' issues a GET request; any other value issues a POST
                request with 'postData' as its data.
            postData: Data sent with the POST request.
            **kwargs: Extra keyword arguments forwarded to the request.

        Returns:
            The response object returned by the session's get/post call.
        """
        if method == 'get':
            res = self.session.get(url, proxies=self.proxies, **kwargs)
        else:
            res = self.session.post(url, data=postData,
                                    proxies=self.proxies, **kwargs)

        # The session has been updated on the server, so also update the cache.
        self.saveSessionToCache()

        return res


# A code snippet using the class above might look like this:

if __name__ == "__main__":
    # proxies = {'https' : 'https://user:pass@server:port',
    #            'http' : 'http://user:pass@server:port'}

    loginData = {'user': 'usr',
                 'password': 'pwd'}

    loginUrl = 'https://...'
    loginTestUrl = 'https://...'
    successStr = 'Hello Tom'
    s = MyLoginSession(loginUrl, loginData, loginTestUrl, successStr,
                       # proxies = proxies
                       )

    res = s.retrieveContent('https://....')
    print(res.text)

    # if, for instance, login via JSON values required try this:
    s = MyLoginSession(loginUrl, None, loginTestUrl, successStr,
                       # proxies = proxies,
                       json=loginData)