I am scraping a website with anti-crawler measures: the job is to collect the links to the site's newly updated content, but I cannot retrieve the content at all!
// Below is part of the code
List<string> Weburllist = new List<string>();   // links seen on previous runs
List<string> Weburllistzx = new List<string>(); // links newly found on this run
StringBuilder weburlSB = new StringBuilder();
StringBuilder linkSb = new StringBuilder();     // presumably declared elsewhere in the full source
bool IsGenxin = false;                          // "has updates" flag
MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline);
foreach (Match m in mcexplain)
{
    Weburllist.Add(m.Value);
}
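// The previous crawl's links are stored in LinkContent as one string, each URL
// wrapped in $-$ ... $_$ delimiters; the lookaround pattern above extracts them
// for the "already seen" check further down.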
System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString());

Uri uri = new Uri(sjurlDR["LinkUrl"].ToString());
SetHeaderValue(newswebrequest.Headers, "Host", uri.Host);
SetHeaderValue(newswebrequest.Headers, "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"); // was "UserAgent"; the wire name needs the hyphen
SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
SetHeaderValue(newswebrequest.Headers, "Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
SetHeaderValue(newswebrequest.Headers, "Accept-Encoding", "gzip, deflate, sdch");
SetHeaderValue(newswebrequest.Headers, "Cookie", "gscu_792856215=62888640q5c56420; _gscbrs_792856215=1"); // was "Cookie:" with a stray colon
SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive");
SetHeaderValue(newswebrequest.Headers, "Cache-Control", "max-age=0");
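// Note: Host, User-Agent, Accept and Connection are restricted headers in
// System.Net and cannot be added through WebRequest.Headers directly (that
// throws ArgumentException), so SetHeaderValue is presumably a reflection-based
// helper; the conventional route is to cast to HttpWebRequest and use its
// UserAgent / Accept / Connection properties instead.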
//newswebrequest.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate, sdch");
//newswebrequest.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8");
//newswebrequest.Headers.Add(HttpRequestHeader.CacheControl, "max-age=0");
//SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
//SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive");
//newswebrequest.Headers.Add(HttpRequestHeader.Cookie, "_gscu_792856215=62888640q5c56420; _gscbrs_792856215=1");
//SetHeaderValue(newswebrequest.Headers, "Host", "zjks.com");
//SetHeaderValue(newswebrequest.Headers, "UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36");
System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();
System.IO.Stream newsstream = newswebresponse.GetResponseStream();
System.IO.StreamReader sr = new StreamReader(newsstream, System.Text.Encoding.UTF8);
string ProductionContent = sr.ReadToEnd();
sr.Close();
newswebresponse.Close(); // undisposed keep-alive responses pin connections; with the
                         // default 2-connections-per-host limit, later GetResponse
                         // calls can hang or fail once the pool is exhausted
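// Caution: Accept-Encoding advertises "gzip, deflate, sdch" above, yet the body
// is read here as plain UTF-8 text. If the server honors the header and compresses
// the response, ReadToEnd yields garbage; either drop that header or enable
// HttpWebRequest.AutomaticDecompression (see the sketch at the end of the post).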
Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString(), 0).Value; // site root, e.g. "http://www.zjks.com/"
MatchCollection mc = Regex.Matches(
    ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming)
                     .Replace("href='/", "href='" + wangzhanyuming)
                     .Replace("href=/", "href=" + wangzhanyuming)
                     .Replace("href=\"./", "href=\"" + wangzhanyuming),
    @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
int Index = 1;
foreach (Match m in mc)
{
    // absolute URLs inside the tag
    MatchCollection mc1 = Regex.Matches(m.Value.Replace("\"", "'"), @"[a-zA-Z]+://[^']*", RegexOptions.Singleline);
    if (mc1.Count > 0)
    {
        foreach (Match m1 in mc1)
        {
            string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
            weburlSB.Append("$-$");
            weburlSB.Append(linkurlstr);
            weburlSB.Append("$_$");
            if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
            {
                IsGenxin = true;
                Weburllistzx.Add(linkurlstr);
                linkSb.AppendFormat("{0}\r\n", linkurlstr);
            }
        }
    }
    else
    {
        if (m.Value.IndexOf("javascript") == -1)
        {
            // relative link: prepend the directory of the page being scraped
            // ("wangzhanxiangduilujin" = site-relative path)
            string wangzhanxiangduilujin = sjurlDR["LinkUrl"].ToString().Substring(0, sjurlDR["LinkUrl"].ToString().LastIndexOf("/") + 1);
            string amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin);
            MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);

            foreach (Match m1 in mc11)
            {
                string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                weburlSB.Append("$-$");
                weburlSB.Append(linkurlstr);
                weburlSB.Append("$_$");
                if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
                {
                    IsGenxin = true;
                    Weburllistzx.Add(linkurlstr);
                    linkSb.AppendFormat("{0}\r\n", linkurlstr);
                }
            }
        }
    }
    Index++;
}
System.Threading.Thread.Sleep(1000); // throttle between requests
if (IsGenxin)
{
    // store the new delimiter-joined link list and keep the previous one
    originlinksInfo oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString()));
    oinfo.LinkContentnext = oinfo.LinkContent;
    oinfo.LinkContent = weburlSB.ToString();
    originlinksLogic.Update(oinfo);
    System.Threading.Thread.Sleep(2000);
}
// For example, http://www.zjks.com/ always fails to scrape. It is this call:
System.Net.WebResponse newswebresponse = newswebrequest.GetResponse(); // <- always throws here during collection
Any ideas would be greatly appreciated!
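In the meantime, here is a minimal diagnostic sketch of what I plan to try next (assuming the .NET Framework System.Net stack; the URL, timeout, and header values are placeholders, not my real configuration): cast to HttpWebRequest so the restricted headers are set through properties, enable automatic gzip/deflate decompression, and catch the WebException so I can see the status code and body the server actually returns instead of the request just bailing out.

using System;
using System.IO;
using System.Net;

class FetchProbe
{
    static void Main()
    {
        string url = "http://www.zjks.com/"; // the failing site from above

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        // Restricted headers (User-Agent, Accept, Connection) go through
        // properties; Host is filled in automatically from the URI.
        req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0";
        req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        req.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
        // Advertise and transparently decode gzip/deflate instead of reading raw bytes.
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        req.Timeout = 15000; // placeholder timeout

        try
        {
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (StreamReader sr = new StreamReader(resp.GetResponseStream()))
            {
                string body = sr.ReadToEnd();
                Console.WriteLine("HTTP {0}, {1} chars", (int)resp.StatusCode, body.Length);
            }
        }
        catch (WebException ex)
        {
            // GetResponse throws on non-2xx answers; the attached response still
            // holds whatever the anti-crawler layer sent back (403, captcha page, ...).
            Console.WriteLine("WebException: {0}", ex.Status);
            HttpWebResponse errResp = ex.Response as HttpWebResponse;
            if (errResp != null)
            {
                Console.WriteLine("HTTP {0}", (int)errResp.StatusCode);
                using (StreamReader er = new StreamReader(errResp.GetResponseStream()))
                {
                    Console.WriteLine(er.ReadToEnd());
                }
            }
        }
    }
}

If the error response turns out to be a 403 or a captcha page, that would point at the anti-crawler layer rejecting the headers or cookie rather than a network problem.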