https://rocketreach.co/horizon-blue-cross-blue-shield-of-new-jersey-email-format_b5c604a3f42e0c54 这是我想从中提取信息的链接。我需要提取页面表格中的邮箱格式,例如“first '_' last”、“first_initial last”等。如果无法提取全部格式,至少要提取排在最上面的那一种。
这是我到目前为止所拥有的:
def search_on_google(key_word, driver):
    """Google a company's RocketReach page and print its ``<meta>`` contents.

    Opens google.com, types ``key_word + " Rocketreach.co"`` into the search
    box (the element named ``q``), submits the query, clicks the first
    ``<cite>`` element on the results page, then parses the resulting page
    source with BeautifulSoup and prints the ``content`` attribute of every
    ``<meta>`` tag (``None`` is printed for tags without that attribute).

    Args:
        key_word: Search term, e.g. a company name.
        driver: A Selenium WebDriver instance used to drive the browser.

    Returns:
        None. The meta-tag contents are only printed.
    """
    # Local import so this function does not depend on how the rest of the
    # file imports Selenium; the legacy ``find_element_by_*`` helpers were
    # removed in Selenium 4.3, so the ``By`` locator API is used instead.
    from selenium.webdriver.common.by import By

    driver.get("https://www.google.com/")

    search_box = driver.find_element(By.NAME, 'q')
    search_box.send_keys(key_word + " Rocketreach.co")
    # TAB is sent before ENTER, as in the original code — presumably to
    # dismiss Google's autocomplete dropdown before submitting; verify.
    search_box.send_keys(Keys.TAB)
    search_box.send_keys(Keys.ENTER)

    # Follow the first search result by clicking its <cite> (displayed URL).
    driver.find_element(By.TAG_NAME, "cite").click()

    # NOTE(review): page_source is read right after the click with no
    # explicit wait, so content rendered later by JavaScript may be missing.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for meta_tag in soup.find_all('meta'):
        content = meta_tag.get('content')
        print(content)
编辑:
# Edited tail of the scraping routine: open the first search result and read
# the rows of the table on the page that loads.
# NOTE(review): indentation was lost in this paste, so the exact loop nesting
# (and which loop the `break` below exits) cannot be determined from here.
# NOTE(review): `top_format` and `domain` are not defined in the visible code;
# presumably they are lists created earlier in the function — confirm.
# range(1) yields a single iteration.
for i in range(1):
# Click the first <cite> element on the current page.
# NOTE(review): find_element_by_tag_name was removed in Selenium 4.3; the
# rest of this fragment already uses find_element(By.TAG_NAME, ...).
driver.find_element_by_tag_name("cite").click()
# NOTE(review): `soup` is not used again in the visible fragment.
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Wait up to 10 seconds until the first cell of the first table row is
# present and its text does not contain 'Lorem ipsum...' (presumably
# placeholder text shown while the real data loads — confirm).
WebDriverWait(driver, 10).until(EC.presence_of_element_located(
(By.XPATH, "//table/tbody/tr[1]/td[1][not(contains(text(), 'Lorem ipsum...'))]")))
# Take the first <tbody> on the page and collect its <tr> rows.
table_id = driver.find_element(By.TAG_NAME, "tbody")
rows = table_id.find_elements(By.TAG_NAME, "tr")
for row in rows:
# First cell text goes to top_format, second cell text goes to domain.
tds = row.find_elements(By.TAG_NAME, "td")
top_format.append(tds[0].text)
domain.append(tds[1].text)
print(top_format)
print(domain)
break
# Only top_format is returned to the caller; domain is printed but not returned.
return top_format
不负相思意
相关分类