"""Collect tweets from the Twitter v2 filtered-stream endpoint.

Title of the page this script was taken from: 侃侃无极.

The Twitter v2 API includes an endpoint for random sampling and an endpoint
for filtering tweets. This script uses the filtering endpoint: it replaces
the stream rules with a set of sample rules, connects to the stream and
stores the received tweets in JSON files, one file per batch.
"""

import io
import json
import os

import pandas as pd
import requests

# To set your environment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'

RULES_URL = "https://api.twitter.com/2/tweets/search/stream/rules"
STREAM_URL = "https://api.twitter.com/2/tweets/search/stream"

# Tweets of the batch currently being collected, and the index of the next
# batch file to write. Both are updated by get_stream().
data = []
counter = 0


def create_headers(bearer_token):
    """Return the HTTP headers carrying *bearer_token* as a Bearer token."""
    return {"Authorization": f"Bearer {bearer_token}"}


def get_rules(headers, bearer_token):
    """Fetch, print and return the rules currently set on the stream.

    Args:
        headers: HTTP headers as built by create_headers().
        bearer_token: Unused; kept for interface compatibility.

    Returns:
        The decoded JSON body of the rules response.

    Raises:
        Exception: If the API does not answer with HTTP 200.
    """
    response = requests.get(RULES_URL, headers=headers)
    if response.status_code != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    rules = response.json()
    print(json.dumps(rules))
    return rules


def delete_all_rules(headers, bearer_token, rules):
    """Delete every rule listed in *rules* from the stream.

    Args:
        headers: HTTP headers as built by create_headers().
        bearer_token: Unused; kept for interface compatibility.
        rules: Rules document as returned by get_rules(). Nothing is sent
            when it is None or has no "data" key.

    Returns:
        None in all cases.

    Raises:
        Exception: If the API does not answer with HTTP 200.
    """
    if rules is None or "data" not in rules:
        return None

    ids = [rule["id"] for rule in rules["data"]]
    payload = {"delete": {"ids": ids}}
    response = requests.post(RULES_URL, headers=headers, json=payload)
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    print(json.dumps(response.json()))


def set_rules(headers, delete, bearer_token):
    """Add the sample filter rules to the stream and print the API answer.

    Args:
        headers: HTTP headers as built by create_headers().
        delete: Unused; kept for interface compatibility.
        bearer_token: Unused; kept for interface compatibility.

    Raises:
        Exception: If the API does not answer with HTTP 201.
    """
    # You can adjust the rules if needed
    sample_rules = [
        {"value": "dog has:images", "tag": "dog pictures"},
        {"value": "cat has:images -grumpy", "tag": "cat pictures"},
    ]
    payload = {"add": sample_rules}
    response = requests.post(RULES_URL, headers=headers, json=payload)
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))


def _store_batch():
    """Write the collected tweets to the next batch file and reset the batch."""
    global data, counter
    print('storing data')
    # Wrap the JSON text in a file-like object: passing a literal JSON string
    # to pd.read_json is deprecated in recent pandas versions.
    frame = pd.read_json(io.StringIO(json.dumps(data)), orient='records')
    frame.to_json(f'tw_example_{counter}.json', orient='records')
    data = []
    counter += 1


def get_stream(headers, set, bearer_token, batch_size=100):
    """Read the filtered stream and store the tweets in batches.

    Every received payload is printed. The "data" member of each payload is
    appended to the module-level ``data`` list; whenever the list reaches
    *batch_size* entries it is written to ``tw_example_<counter>.json`` and
    ``counter`` is incremented.

    Args:
        headers: HTTP headers as built by create_headers().
        set: Unused; kept for interface compatibility.
        bearer_token: Unused; kept for interface compatibility.
        batch_size: Number of tweets stored per output file.

    Raises:
        Exception: If the API does not answer with HTTP 200.
    """
    # The context manager releases the connection even if reading fails.
    with requests.get(STREAM_URL, headers=headers, stream=True) as response:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )
        try:
            for response_line in response.iter_lines():
                if not response_line:
                    # Empty lines carry no payload.
                    continue
                json_response = json.loads(response_line)
                print(json.dumps(json_response, indent=4, sort_keys=True))
                if "data" not in json_response:
                    # Payloads without a tweet (e.g. error objects) are
                    # printed above but cannot be stored.
                    continue
                data.append(json_response["data"])
                if len(data) >= batch_size:
                    _store_batch()
        finally:
            # Do not lose a partially filled batch when the stream ends or
            # is interrupted.
            if data:
                _store_batch()


def main():
    """Reset the stream rules to the sample rules and start collecting."""
    bearer_token = os.environ.get("BEARER_TOKEN")
    if not bearer_token:
        # Fail early instead of sending "Bearer None" to the API.
        raise RuntimeError("BEARER_TOKEN environment variable is not set")
    headers = create_headers(bearer_token)
    rules = get_rules(headers, bearer_token)
    delete = delete_all_rules(headers, bearer_token, rules)
    rule_set = set_rules(headers, delete, bearer_token)
    get_stream(headers, rule_set, bearer_token)


if __name__ == "__main__":
    main()

# Then load the stored data into a pandas DataFrame, for example:
#     df = pd.read_json('tw_example_0.json', orient='records')
# Each batch is written to its own file, tw_example_<counter>.json, with the
# counter starting at 0.