C# 网络爬虫（C# 爬虫）

公司的编辑妹子需要爬取网页内容，叫我帮忙做了一个简单的爬取工具。

C# 网络爬虫（C# 爬虫）

这是爬取网页内容的方法。这对大家来说应该都不难，但是这里做了一些小改动，代码献上，供大家参考。

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The first attempt decodes the body as UTF-8. If that attempt throws for any
/// reason, the request is issued once more; on that retry the body of an HTTP
/// error response is accepted as the result and the bytes are decoded as GB2312.
/// </remarks>
/// <param name="url">Absolute address of the page to fetch.</param>
/// <returns>
/// The response body, or an empty string when the retry fails without
/// producing any HTTP response.
/// </returns>
private string gethttpwebrequest(string url)
{
    try
    {
        return DownloadText(url, "utf-8", false);
    }
    catch (Exception)
    {
        // Deliberate best-effort fallback: any failure of the first attempt
        // triggers one retry that also accepts error pages.
        return DownloadText(url, "gb2312", true);
    }
}

/// <summary>
/// Sends a single request with browser-like headers and reads the whole body
/// using the named text encoding.
/// </summary>
/// <param name="url">Absolute address to request.</param>
/// <param name="encodingName">Name passed to <c>Encoding.GetEncoding</c> for decoding the body.</param>
/// <param name="acceptErrorResponse">
/// When true, a <see cref="WebException"/> is handled by reading the response
/// attached to it; when false, the exception propagates.
/// </param>
/// <returns>The decoded body, or an empty string when no response is available.</returns>
private static string DownloadText(string url, string encodingName, bool acceptErrorResponse)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));

    // All headers are configured BEFORE GetResponse() so they are actually sent.
    // NOTE(review): the user-agent value starts with the header name and has no
    // closing parenthesis; kept as-is, confirm whether that is intentional.
    request.UserAgent = "user-agent:mozilla/4.0 (compatible; msie 6.0; windows nt 5.2; .net clr 1.0.3705";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("accept-language", "zh-cn,en-us;q=0.5");

    HttpWebResponse response;
    try
    {
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException ex)
    {
        if (!acceptErrorResponse)
        {
            throw;
        }

        // Response is null when the failure happened below the HTTP level.
        response = ex.Response as HttpWebResponse;
    }

    if (response == null)
    {
        return string.Empty;
    }

    // using-blocks release the connection even when reading throws.
    using (response)
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding(encodingName)))
    {
        return reader.ReadToEnd();
    }
}

这是根据 URL 爬取网页源码的方法，有一些小改动：很多网页使用不同的编码格式，甚至有些网站做了反爬取的防范，这个方法经过改动后也能爬取。

C# 网络爬虫（C# 爬虫）

以下是爬取网页所有的网址链接

/// <summary>
/// Extracts the URLs referenced by the anchor tags of an HTML document.
/// </summary>
/// <remarks>
/// Root-relative hrefs ("/x") are expanded with the site root taken from
/// <paramref name="url"/>; "./x" hrefs and bare relative hrefs are expanded
/// with the directory part of <paramref name="url"/>. Anchors that contain no
/// absolute URL and mention "javascript" are skipped.
/// </remarks>
/// <param name="htmlcode">HTML source to scan; null or empty yields an empty list.</param>
/// <param name="url">Address the HTML came from, used to expand relative hrefs.</param>
/// <returns>The distinct URLs found, in order of first appearance.</returns>
private static List<string> gethyperlinks(string htmlcode, string url)
{
    const string absoluteUrlPattern = @"[a-zA-Z]+://[^\s]*";

    List<string> links = new List<string>();
    if (string.IsNullOrEmpty(htmlcode))
    {
        return links;
    }

    string pageUrl = url ?? string.Empty;

    // Scheme + host of the page. The trailing slash is forced so that
    // "href=\"/about" expands to ".../about" even when url has no path.
    string siteRoot = Regex.Match(pageUrl, @"http(s)?://([\w-]+\.)+[\w-]+/?").Value;
    if (siteRoot.Length > 0 && !siteRoot.EndsWith("/", StringComparison.Ordinal))
    {
        siteRoot += "/";
    }

    // Directory of the page. For a url without a path the last '/' belongs to
    // "http://", so fall back to the site root in that case.
    string pageDirectory = pageUrl.Substring(0, pageUrl.LastIndexOf('/') + 1);
    if (pageDirectory.Length < siteRoot.Length)
    {
        pageDirectory = siteRoot;
    }

    string expanded = htmlcode
        .Replace("href=\"/", "href=\"" + siteRoot)
        .Replace("href='/", "href='" + siteRoot)
        .Replace("href=/", "href=" + siteRoot)
        .Replace("href=\"./", "href=\"" + pageDirectory);

    // The set gives constant-time duplicate checks; the list keeps the order.
    HashSet<string> seen = new HashSet<string>();

    foreach (Match anchor in Regex.Matches(expanded, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline))
    {
        string tag = anchor.Value;
        MatchCollection candidates = Regex.Matches(tag, absoluteUrlPattern, RegexOptions.Singleline);

        if (candidates.Count == 0)
        {
            if (tag.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) != -1)
            {
                continue;
            }

            // Bare relative href: prefix it with the page directory and rescan.
            tag = tag
                .Replace("href=\"", "href=\"" + pageDirectory)
                .Replace("href='", "href='" + pageDirectory);
            candidates = Regex.Matches(tag, absoluteUrlPattern, RegexOptions.Singleline);
        }

        foreach (Match candidate in candidates)
        {
            // The pattern runs up to the next whitespace, so strip the quote,
            // tag-close and semicolon characters it may have swallowed.
            string link = candidate.Value
                .Replace("\"", "")
                .Replace("'", "")
                .Replace(">", "")
                .Replace(";", "");

            if (seen.Add(link))
            {
                links.Add(link);
            }
        }
    }

    return links;
}

这块的技术其实就是简单地使用正则表达式去匹配！接下来献上获取标题以及把结果存储到 XML 文件的方法。

/// <summary>
/// Writes the given links to the XML file d:\hyperlinks.xml.
/// </summary>
/// <remarks>
/// Each link becomes one element whose name is the value returned by
/// <c>getdomain</c> for that link and whose text is the link itself.
/// </remarks>
/// <param name="strurl">Address the links were extracted from; recorded in a leading XML comment.</param>
/// <param name="alhyperlinks">Links to store; null is treated as an empty list.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks)
{
    // XmlTextWriter.WriteComment rejects text containing "--", which URLs may
    // legitimately contain, so break such runs up before writing the comment.
    string source = strurl ?? string.Empty;
    while (source.Contains("--"))
    {
        source = source.Replace("--", "- -");
    }

    // using guarantees the file handle is released even if a write throws.
    using (XmlTextWriter writer = new XmlTextWriter(@"d:\hyperlinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + source + "的超链接");
        writer.WriteStartElement("hyperlinks");
        writer.WriteStartElement("hyperlinks", null);
        writer.WriteAttributeString("datetime", DateTime.Now.ToString());

        if (alhyperlinks != null)
        {
            foreach (string link in alhyperlinks)
            {
                writer.WriteElementString(getdomain(link), null, link);
            }
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}

/// <summary>
/// Classifies a URL by the first recognised domain suffix it contains.
/// </summary>
/// <param name="strurl">URL to inspect.</param>
/// <returns>
/// The suffix without dot and slash (com, net, cn, org or gov, in the casing
/// used by the URL) when that suffix followed by "/" occurs in the URL;
/// otherwise "other".
/// </returns>
private static string getdomain(string strurl)
{
    if (string.IsNullOrEmpty(strurl))
    {
        return "other";
    }

    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    // Drop the leading dot and the trailing slash of the matched suffix.
    string label = Regex.Replace(suffix.Value, @"\.|/$", "");
    return label.Length == 0 ? "other" : label;
}

/// <summary>
/// Extracts a page title from HTML.
/// </summary>
/// <remarks>
/// The text of the title element is used by default. When the first h1
/// element has non-empty text and the title text starts with it, the h1 text
/// is returned instead, since the heading is usually the cleaner title.
/// </remarks>
/// <param name="html">HTML source; null or empty yields an empty string.</param>
/// <returns>The title text with tags removed, or an empty string when no title is found.</returns>
private static string gettitle(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    const string tagPattern = @"<.*?>";
    string title = "";

    // [\s\S] matches any character including newlines, so multi-line titles work.
    Match titleMatch = Regex.Match(html, @"<title>[\s\S]*?</title>", RegexOptions.IgnoreCase);
    if (titleMatch.Success)
    {
        title = Regex.Replace(titleMatch.Value, tagPattern, "");
    }

    Match headingMatch = Regex.Match(html, @"<h1.*?>.*?</h1>", RegexOptions.IgnoreCase);
    if (headingMatch.Success)
    {
        string heading = Regex.Replace(headingMatch.Value, tagPattern, "");
        if (!string.IsNullOrEmpty(heading) && title.StartsWith(heading, StringComparison.Ordinal))
        {
            title = heading;
        }
    }

    return title;
}