Scraping a site with anti-crawler measures: I'm trying to collect the links to a URL's updated content, but nothing comes back!

// The following is a partial excerpt of the code.
List<string> Weburllist = new List<string>();
List<string> Weburllistzx = new List<string>();
StringBuilder weburlSB = new StringBuilder();
bool IsGenxin = false;

// Pull the previously collected links (stored as $-$link$_$ pairs) out of the saved content.
MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline);
foreach (Match m in mcexplain)
{
    Weburllist.Add(m.Value);
}

System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString());

// SetHeaderValue is a helper (not shown) that forces values into the header
// collection, since WebRequest.Headers rejects restricted headers such as
// Host, User-Agent and Connection.
Uri uri = new Uri(sjurlDR["LinkUrl"].ToString());
SetHeaderValue(newswebrequest.Headers, "Host", uri.Host);
SetHeaderValue(newswebrequest.Headers, "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0");
SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
SetHeaderValue(newswebrequest.Headers, "Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
SetHeaderValue(newswebrequest.Headers, "Accept-Encoding", "gzip, deflate, sdch");
SetHeaderValue(newswebrequest.Headers, "Cookie", "_gscu_792856215=62888640q5c56420; _gscbrs_792856215=1");
SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive");
SetHeaderValue(newswebrequest.Headers, "Cache-Control", "max-age=0");

// Earlier attempts, kept commented out:
//newswebrequest.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate, sdch");
//newswebrequest.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8");
//newswebrequest.Headers.Add(HttpRequestHeader.CacheControl, "max-age=0");
//SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
//SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive");
//newswebrequest.Headers.Add(HttpRequestHeader.Cookie, "_gscu_792856215=62888640q5c56420; _gscbrs_792856215=1");
//SetHeaderValue(newswebrequest.Headers, "Host", "zjks.com");
//SetHeaderValue(newswebrequest.Headers, "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36");

System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();
System.IO.Stream newsstream = newswebresponse.GetResponseStream();
System.IO.StreamReader sr = new StreamReader(newsstream, System.Text.Encoding.UTF8);
string ProductionContent = sr.ReadToEnd();
sr.Close();

// Site root (scheme + domain), used to absolutize root-relative hrefs.
Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString(), 0).Value;

// Rewrite relative hrefs to absolute ones, then match every <a ... href=...> tag.
MatchCollection mc = Regex.Matches(
    ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming)
                     .Replace("href='/", "href='" + wangzhanyuming)
                     .Replace("href=/", "href=" + wangzhanyuming)
                     .Replace("href=\"./", "href=\"" + wangzhanyuming),
    @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
int Index = 1;
foreach (Match m in mc)
{
    MatchCollection mc1 = Regex.Matches(m.Value.Replace("\"", "'"), @"[a-zA-Z]+://[^']*", RegexOptions.Singleline);
    if (mc1.Count > 0)
    {
        // The tag already contains an absolute URL.
        foreach (Match m1 in mc1)
        {
            string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
            weburlSB.Append("$-$");
            weburlSB.Append(linkurlstr);
            weburlSB.Append("$_$");
            if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
            {
                IsGenxin = true;
                Weburllistzx.Add(linkurlstr);
                linkSb.AppendFormat("{0}<br/>", linkurlstr);
            }
        }
    }
    else
    {
        // Page-relative link: prepend the current page's directory; skip javascript: links.
        if (m.Value.IndexOf("javascript") == -1)
        {
            string wangzhanxiangduilujin = sjurlDR["LinkUrl"].ToString().Substring(0, sjurlDR["LinkUrl"].ToString().LastIndexOf("/") + 1);
            string amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin);
            MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
            foreach (Match m1 in mc11)
            {
                string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                weburlSB.Append("$-$");
                weburlSB.Append(linkurlstr);
                weburlSB.Append("$_$");
                if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
                {
                    IsGenxin = true;
                    Weburllistzx.Add(linkurlstr);
                    linkSb.AppendFormat("{0}<br/>", linkurlstr);
                }
            }
        }
    }
    Index++;
}
System.Threading.Thread.Sleep(1000);
if (IsGenxin)
{
    // A new link appeared: shift the old list aside and store the fresh one.
    originlinksInfo oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString()));
    oinfo.LinkContentnext = oinfo.LinkContent;
    oinfo.LinkContent = weburlSB.ToString();
    originlinksLogic.Update(oinfo);
    System.Threading.Thread.Sleep(2000);
}

// A site like http://www.zjks.com/ always fails to collect, at this line:
System.Net.WebResponse newswebresponse = newswebrequest.GetResponse(); // it always breaks out here during collection -- begging for ideas!
慕田峪4524236
Views: 491 · Answers: 10

10 Answers

哈士奇WWW

Could it be that your later requests aren't carrying the cookies generated by the earlier ones? Or that you're requesting too fast and the server is rejecting you?
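That hunch can be tested with a shared CookieContainer, which plain WebRequest does not expose; a sketch assuming the request is switched to HttpWebRequest (the cast, the 3-second pause and the decompression flag are illustrative choices, not from the original post):

// One cookie container shared across the whole crawl session, so
// server-issued cookies (e.g. the anti-bot _gscu_* values) carry forward.
System.Net.CookieContainer cookieJar = new System.Net.CookieContainer();

System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString());
req.CookieContainer = cookieJar;   // sends stored cookies AND stores any Set-Cookie the server returns
req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0";
req.AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

using (System.Net.WebResponse resp = req.GetResponse())
using (System.IO.StreamReader sr = new System.IO.StreamReader(resp.GetResponseStream(), System.Text.Encoding.UTF8))
{
    string html = sr.ReadToEnd();
    // ... extract links as before ...
}

// Reusing the same cookieJar on every later request keeps the session alive;
// a pause between requests avoids tripping rate limits.
System.Threading.Thread.Sleep(3000);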

慕侠2389804

Compare a normal browser request with your crawler's request, find the differences, and fix them. HTTP is stateless; fundamentally, the other side can only tell you apart because your request itself differs from a browser's.
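One difference worth checking first: WebRequest.Headers rejects "restricted" headers (Host, User-Agent, Connection, Accept, ...), which is presumably why the SetHeaderValue helper exists at all. HttpWebRequest exposes those as ordinary properties, which is the supported way to make the request match what the browser sends. A sketch using the header values from the excerpt above:

System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString());
// Restricted headers go through dedicated properties, not req.Headers:
req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0";
req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
req.KeepAlive = true;                        // equivalent to Connection: Keep-Alive
// Host is derived from the URL automatically; no need to set it by hand.
// Non-restricted headers can still go through the collection:
req.Headers.Add(System.Net.HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
// If you advertise gzip in Accept-Encoding, let the framework decompress;
// otherwise StreamReader is handed compressed bytes and produces garbage:
req.AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;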

慕虎7371278

The main logic is in the BtnCaiji_Click method.

慕桂英546537

Hard-coding the cookie like that probably doesn't count as being logged in, does it?
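Right: a cookie string pasted out of a browser expires or stays tied to that browser's session. If the site requires a login, the usual pattern is to POST the login form once and let a shared CookieContainer keep the session cookie. A hypothetical sketch; the URL and form fields below are placeholders, not taken from the original post:

System.Net.CookieContainer session = new System.Net.CookieContainer();

// Hypothetical login endpoint and form fields -- adjust to the real site.
System.Net.HttpWebRequest login = (System.Net.HttpWebRequest)System.Net.WebRequest.Create("http://example.com/login");
login.Method = "POST";
login.ContentType = "application/x-www-form-urlencoded";
login.CookieContainer = session;

byte[] body = System.Text.Encoding.UTF8.GetBytes("username=me&password=secret");
using (System.IO.Stream reqStream = login.GetRequestStream())
{
    reqStream.Write(body, 0, body.Length);
}
login.GetResponse().Close(); // the session cookie is now stored in `session`

// Later requests reuse the same container and are treated as logged in.
System.Net.HttpWebRequest page = (System.Net.HttpWebRequest)System.Net.WebRequest.Create("http://example.com/protected");
page.CookieContainer = session;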

繁花不似锦

I don't know either; I'm a newbie hoping an expert can point the way!