
How do I make a simple crawler in PHP?

I have a web page with a bunch of links on it. I want to write a script that dumps all the data contained in those links into a local file.


Has anybody done this in PHP? General guidelines and gotchas would suffice as an answer.


红颜莎娜
3 Answers

偶然的你

Meh. Don't parse HTML with regexes. Here's a DOM version inspired by Tatu's:

```php
<?php
// Recursively crawl a page and the pages it links to, up to $depth levels deep.
function crawl_page($url, $depth = 5)
{
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }

    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        // Turn relative hrefs into absolute URLs before recursing
        if (0 !== strpos($href, 'http')) {
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= dirname($parts['path'], 1) . $path;
            }
        }
        crawl_page($href, $depth - 1);
    }

    echo "URL:", $url, PHP_EOL, "CONTENT:", PHP_EOL, $dom->saveHTML(), PHP_EOL, PHP_EOL;
}

crawl_page("http://hobodave.com", 2);
```
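The version above echoes each crawled page to standard output, but the question asks for the data to be dumped into a local file. Here is a minimal sketch of the same DOM traversal that appends each page to a file instead; the output path `results.txt` and the decision to follow absolute links only are my own simplifications, not part of the answer above:

```php
// Sketch: same DOM-based traversal, but dump each page to a local file.
// 'results.txt' is an arbitrary output path chosen for illustration.
function crawl_page_to_file($url, $depth = 5)
{
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);

    // Append the page's URL and HTML instead of echoing them
    file_put_contents('results.txt', "URL: $url\n" . $dom->saveHTML() . "\n\n", FILE_APPEND);

    foreach ($dom->getElementsByTagName('a') as $element) {
        $href = $element->getAttribute('href');
        // For brevity this sketch follows absolute links only
        if (0 === strpos($href, 'http')) {
            crawl_page_to_file($href, $depth - 1);
        }
    }
}
```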

慕函数4003404

Here is my implementation based on the example/answer above:

- it is class based
- uses cURL
- supports HTTP authentication
- skips URLs that do not belong to the base domain
- returns the HTTP response code for every page
- returns the fetch time for every page

CRAWL CLASS:

```php
class crawler
{
    protected $_url;
    protected $_depth;
    protected $_host;
    protected $_useHttpAuth = false;
    protected $_user;
    protected $_pass;
    protected $_seen = array();
    protected $_filter = array();

    public function __construct($url, $depth = 5)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        $parse = parse_url($url);
        $this->_host = $parse['host'];
    }

    protected function _processAnchors($content, $url, $depth)
    {
        $dom = new DOMDocument('1.0');
        @$dom->loadHTML($content);
        $anchors = $dom->getElementsByTagName('a');
        foreach ($anchors as $element) {
            $href = $element->getAttribute('href');
            // Turn relative hrefs into absolute URLs before recursing
            if (0 !== strpos($href, 'http')) {
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            // Crawl only links that belong to the start domain (checked in isValid())
            $this->crawl_page($href, $depth - 1);
        }
    }

    protected function _getContent($url)
    {
        $handle = curl_init($url);
        if ($this->_useHttpAuth) {
            curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
            curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
        }
        // Follows 302 redirects; note this can cause problems with authentication
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
        // Return the content instead of printing it
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
        /* Get the HTML or whatever is linked in $url. */
        $response = curl_exec($handle);
        // Total response time
        $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
        /* Check for 404 (file not found). */
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);
        return array($response, $httpCode, $time);
    }

    protected function _printResult($url, $depth, $httpcode, $time)
    {
        ob_end_flush();
        $currentDepth = $this->_depth - $depth;
        $count = count($this->_seen);
        echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
        ob_start();
        flush();
    }

    protected function isValid($url, $depth)
    {
        if (strpos($url, $this->_host) === false
            || $depth === 0
            || isset($this->_seen[$url])
        ) {
            return false;
        }
        foreach ($this->_filter as $excludePath) {
            if (strpos($url, $excludePath) !== false) {
                return false;
            }
        }
        return true;
    }

    public function crawl_page($url, $depth)
    {
        if (!$this->isValid($url, $depth)) {
            return;
        }
        // Add to the seen URLs
        $this->_seen[$url] = true;
        // Get content and HTTP response code
        list($content, $httpcode, $time) = $this->_getContent($url);
        // Print result for the current page
        $this->_printResult($url, $depth, $httpcode, $time);
        // Process sub-pages
        $this->_processAnchors($content, $url, $depth);
    }

    public function setHttpAuth($user, $pass)
    {
        $this->_useHttpAuth = true;
        $this->_user = $user;
        $this->_pass = $pass;
    }

    public function addFilterPath($path)
    {
        $this->_filter[] = $path;
    }

    public function run()
    {
        $this->crawl_page($this->_url, $this->_depth);
    }
}
```

USAGE:

```php
$startURL = 'http://YOUR_URL/';
$depth = 6;
$username = 'YOURUSER';
$password = 'YOURPASS';

$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude paths with the following structure from being processed
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();
```
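One gotcha the class does not address is request pacing: a depth-6 crawl can hit the target server very quickly. Since _getContent() is protected, a subclass can add a delay without touching the class above. This is only a sketch; the subclass name and the 0.5-second delay are illustrative choices of mine, not part of the original answer:

```php
// Sketch: subclass the crawler above to pause between requests.
// Class name and delay value are illustrative, not from the original answer.
class politeCrawler extends crawler
{
    protected $_delayMicroseconds = 500000; // 0.5 s between fetches

    protected function _getContent($url)
    {
        usleep($this->_delayMicroseconds);
        return parent::_getContent($url);
    }
}

$crawler = new politeCrawler('http://YOUR_URL/', 3);
$crawler->run();
```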

www说

In its simplest form:

```php
function crawl_page($url, $depth = 5) {
    if ($depth > 0) {
        $html = file_get_contents($url);
        // Naive regex link extraction; see the caveats in the other answers
        preg_match_all('~<a.*?href="(.*?)".*?>~', $html, $matches);
        foreach ($matches[1] as $newurl) {
            crawl_page($newurl, $depth - 1);
        }
        // Append this page's URL and contents to the results file
        file_put_contents('results.txt', $url."\n\n".$html."\n\n", FILE_APPEND);
    }
}

crawl_page('http://www.domain.com/index.php', 5);
```

The function gets the contents of a page, then crawls all the links it finds and saves the contents to "results.txt". It accepts a second parameter, depth, which defines how many levels of links should be followed. Pass 1 there if you only want to parse the links on the given page.
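Note that this version recurses into whatever appears in each href, so relative links (e.g. /about.php) will fail when passed to file_get_contents. Below is a sketch of a helper that resolves them against the current page before recursing; the helper name and the simplistic origin-plus-path logic are my own additions, not part of the answer above:

```php
// Sketch: resolve a possibly-relative href against the page it came from.
// Helper name and logic are illustrative; it ignores ../ segments and other edge cases.
function resolve_href($base, $href)
{
    if (0 === strpos($href, 'http')) {
        return $href; // already absolute
    }
    $parts = parse_url($base);
    $origin = $parts['scheme'] . '://' . $parts['host']
            . (isset($parts['port']) ? ':' . $parts['port'] : '');
    return $origin . '/' . ltrim($href, '/');
}

// Usage inside the loop above:
// crawl_page(resolve_href($url, $newurl), $depth - 1);
```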