公司的编辑妹子需要爬取网页内容,叫我帮忙做了一个简单的爬取工具。

下面是爬取网页内容的方法。这对大家来说应该都不难,但这里做了一些小改动,代码献上,供大家参考。
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The first attempt decodes the body as UTF-8 unless the response declares a charset.
/// If that attempt fails with a network or read error, the request is repeated once;
/// this time an HTTP error response (for example 404 or 500) is read instead of being
/// treated as a failure, and GB2312 is the default encoding.
/// </remarks>
/// <param name="url">Absolute HTTP or HTTPS address of the page to fetch.</param>
/// <returns>
/// The response body, or an empty string when the server returned no response at all
/// (for example on a DNS failure or a timeout).
/// </returns>
private string gethttpwebrequest(string url)
{
    try
    {
        using (HttpWebResponse response = (HttpWebResponse)CreatePageRequest(url).GetResponse())
        {
            return ReadResponseText(response, "utf-8");
        }
    }
    catch (WebException)
    {
        // Fall through to the second attempt, which also accepts error responses.
    }
    catch (IOException)
    {
        // The body could not be read completely; retry once below.
    }

    HttpWebResponse fallback;
    try
    {
        fallback = (HttpWebResponse)CreatePageRequest(url).GetResponse();
    }
    catch (WebException ex)
    {
        // For protocol errors the server's reply is attached to the exception;
        // for transport errors (DNS, timeout, refused connection) it is null.
        fallback = ex.Response as HttpWebResponse;
    }

    if (fallback == null)
    {
        return string.Empty;
    }

    using (fallback)
    {
        return ReadResponseText(fallback, "gb2312");
    }
}

/// <summary>Builds a request for <paramref name="url"/> carrying browser-like headers.</summary>
private static HttpWebRequest CreatePageRequest(string url)
{
    // Headers must be configured before GetResponse() is called; they cannot be
    // changed once the request has been submitted.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // The value must not repeat the "User-Agent:" header name.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("accept-language", "zh-cn,en-us;q=0.5");
    return request;
}

/// <summary>
/// Reads the whole body of <paramref name="response"/> as text, using the charset declared
/// in the Content-Type header when there is one and <paramref name="defaultCharset"/> otherwise.
/// </summary>
private static string ReadResponseText(HttpWebResponse response, string defaultCharset)
{
    System.Text.Encoding encoding = System.Text.Encoding.GetEncoding(defaultCharset);

    // CharacterSet falls back to "ISO-8859-1" for text responses that declare nothing,
    // so it is only trusted when the header really carries a charset parameter.
    string contentType = response.ContentType;
    if (contentType != null && contentType.IndexOf("charset", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        try
        {
            encoding = System.Text.Encoding.GetEncoding(response.CharacterSet);
        }
        catch (ArgumentException)
        {
            // Unknown or malformed charset label: keep the default encoding.
        }
    }

    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, encoding))
    {
        return reader.ReadToEnd();
    }
}
|
这是根据 URL 爬取网页源码的方法,其中做了一些小改动:很多网页使用不同的编码格式,甚至有些网站做了反爬取的防范,这个方法经过改动后也能爬取。

以下是提取网页中所有网址链接的方法:
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
/// <summary>
/// Extracts the distinct link targets found in the anchor tags of an HTML document.
/// </summary>
/// <remarks>
/// Resolution is string based: links starting with "//" take the scheme of
/// <paramref name="url"/>, links starting with "/" are resolved against the site root,
/// and quoted relative links (including "./") are resolved against the directory of
/// <paramref name="url"/>. Tags without an absolute URL that mention "javascript" are skipped.
/// Only lowercase <c>href=</c> attributes are rewritten.
/// </remarks>
/// <param name="htmlcode">HTML source to scan.</param>
/// <param name="url">Address the HTML was downloaded from; used to resolve relative links.</param>
/// <returns>
/// The links in order of first appearance, without duplicates. The list is empty when
/// either argument is null or empty.
/// </returns>
private static List<string> gethyperlinks(string htmlcode, string url)
{
    const string SiteRootPattern = @"http(s)?://([\w-]+\.)+[\w-]+(:\d+)?/?";
    const string AnchorPattern = @"<[aA][^>]* href=[^>]*>";
    const string AbsoluteUrlPattern = @"[a-zA-Z]+://[^\s]*";

    List<string> links = new List<string>();
    if (string.IsNullOrEmpty(htmlcode) || string.IsNullOrEmpty(url))
    {
        return links;
    }

    int schemeEnd = url.IndexOf("://", StringComparison.Ordinal);
    string scheme = schemeEnd > 0 ? url.Substring(0, schemeEnd + 1) : "http:";

    // Scheme + host (+ port), always terminated by "/" so that "/path" can be appended
    // without gluing the path onto the host name.
    string siteRoot = Regex.Match(url, SiteRootPattern).Value;
    if (siteRoot.Length > 0 && !siteRoot.EndsWith("/", StringComparison.Ordinal))
    {
        siteRoot += "/";
    }

    // Directory of the current page. When the only slashes belong to "://" the URL has
    // no path, so the directory is the URL itself plus "/".
    int lastSlash = url.LastIndexOf('/');
    string pageDir = (schemeEnd >= 0 && lastSlash <= schemeEnd + 2)
        ? url + "/"
        : url.Substring(0, lastSlash + 1);

    // Order matters: "//" must be rewritten before "/" or it would be treated as root-relative.
    string normalized = htmlcode
        .Replace("href=\"//", "href=\"" + scheme + "//")
        .Replace("href='//", "href='" + scheme + "//")
        .Replace("href=//", "href=" + scheme + "//")
        .Replace("href=\"/", "href=\"" + siteRoot)
        .Replace("href='/", "href='" + siteRoot)
        .Replace("href=/", "href=" + siteRoot)
        .Replace("href=\"./", "href=\"" + pageDir)
        .Replace("href='./", "href='" + pageDir);

    HashSet<string> seen = new HashSet<string>();
    foreach (Match anchor in Regex.Matches(normalized, AnchorPattern, RegexOptions.Singleline))
    {
        MatchCollection urls = Regex.Matches(anchor.Value, AbsoluteUrlPattern, RegexOptions.Singleline);
        if (urls.Count == 0)
        {
            if (anchor.Value.IndexOf("javascript", StringComparison.Ordinal) != -1)
            {
                continue;
            }

            // No absolute URL in the tag: treat the quoted href as relative to the page directory.
            string rebased = anchor.Value
                .Replace("href=\"", "href=\"" + pageDir)
                .Replace("href='", "href='" + pageDir);
            urls = Regex.Matches(rebased, AbsoluteUrlPattern, RegexOptions.Singleline);
        }

        foreach (Match found in urls)
        {
            // The URL pattern runs up to the next whitespace, so strip the quote and
            // tag characters that were captured along with the address.
            string link = found.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
            if (seen.Add(link))
            {
                links.Add(link);
            }
        }
    }

    return links;
}
|
这部分其实就是简单地使用正则表达式去匹配。接下来献上获取标题以及把结果存储到 XML 文件的方法:
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
/// <summary>
/// Writes the extracted links to the XML file <c>d:\hyperlinks.xml</c>.
/// </summary>
/// <param name="strurl">Address the links were extracted from; recorded in an XML comment.</param>
/// <param name="alhyperlinks">Links to write; null is treated as an empty list.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks)
{
    writetoxml(strurl, alhyperlinks, @"d:\hyperlinks.xml");
}

/// <summary>
/// Writes the extracted links to an XML file, overwriting it if it already exists.
/// </summary>
/// <remarks>
/// Every link becomes one element whose name is the value returned by
/// <c>getdomain</c> for that link and whose text is the link itself.
/// </remarks>
/// <param name="strurl">Address the links were extracted from; recorded in an XML comment.</param>
/// <param name="alhyperlinks">Links to write; null is treated as an empty list.</param>
/// <param name="path">Path of the XML file to create.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks, string path)
{
    // An XML comment must not contain "--"; URLs may, so break such runs up
    // instead of letting WriteComment throw.
    string commentUrl = strurl ?? string.Empty;
    while (commentUrl.Contains("--"))
    {
        commentUrl = commentUrl.Replace("--", "- -");
    }

    // 'using' closes the file even when writing fails part-way through.
    using (XmlTextWriter writer = new XmlTextWriter(path, Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + commentUrl + "的超链接");
        writer.WriteStartElement("hyperlinks");
        writer.WriteStartElement("hyperlinks", null);
        writer.WriteAttributeString("datetime", DateTime.Now.ToString());

        if (alhyperlinks != null)
        {
            foreach (string link in alhyperlinks)
            {
                writer.WriteElementString(getdomain(link), null, link);
            }
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}
/// <summary>
/// Returns the top-level domain suffix of a URL's host when it is one of
/// com, net, cn, org or gov.
/// </summary>
/// <remarks>
/// The suffix is looked up in the host part only and is recognised whether the host is
/// followed by a path, a port, a query, a fragment or nothing at all. The match is
/// case-insensitive and the suffix is returned with the casing used in the URL.
/// </remarks>
/// <param name="strurl">URL to inspect; the scheme is optional.</param>
/// <returns>The suffix without the leading dot, or "other" when none of the known suffixes is found.</returns>
private static string getdomain(string strurl)
{
    if (string.IsNullOrEmpty(strurl))
    {
        return "other";
    }

    // Optional scheme, then the host (anything up to the first '/', '?' or '#'),
    // ending in a known suffix that is followed by a delimiter or the end of input.
    Match suffix = Regex.Match(
        strurl,
        @"^(?:[a-z][a-z0-9+.-]*://)?[^/?#]*?\.(com|net|cn|org|gov)(?=[/:?#]|$)",
        RegexOptions.IgnoreCase);

    return suffix.Success ? suffix.Groups[1].Value : "other";
}
/// <summary>
/// Extracts a page title from HTML source.
/// </summary>
/// <remarks>
/// The text of the first <c>&lt;title&gt;</c> element is used. When the first
/// <c>&lt;h1&gt;</c> element is non-empty and the title starts with its text, the
/// heading text is returned instead, because it usually omits the site-name suffix.
/// </remarks>
/// <param name="html">HTML source of the page.</param>
/// <returns>The title text with markup removed, or an empty string when no title is found.</returns>
private static string gettitle(string html)
{
    const string TitlePattern = @"<title>[\s\S]*?</title>"; // [\s\S] also matches line breaks
    const string HeadingPattern = @"<h1.*?>.*?</h1>";
    const string TagPattern = @"<.*?>";

    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    string title = "";
    Match titleMatch = Regex.Match(html, TitlePattern, RegexOptions.IgnoreCase);
    if (titleMatch.Success)
    {
        title = Regex.Replace(titleMatch.Value, TagPattern, "");
    }

    Match headingMatch = Regex.Match(html, HeadingPattern, RegexOptions.IgnoreCase);
    if (headingMatch.Success)
    {
        string heading = Regex.Replace(headingMatch.Value, TagPattern, "");
        if (!string.IsNullOrEmpty(heading) && title.StartsWith(heading, StringComparison.Ordinal))
        {
            title = heading;
        }
    }

    return title;
}
|
这就是所用的全部方法,还是有很多需要改进之处!大家如果有发现不足之处还请指出,谢谢!
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。








发表评论
◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。