学习 Java 的正则表达式：抓取网页并解析 HTML 部分内容
- packagecom.xiaofeng.picup;
- importjava.io.BufferedReader;
- importjava.io.IOException;
- importjava.io.InputStreamReader;
- importjava.net.MalformedURLException;
- importjava.net.URL;
- importjava.util.ArrayList;
- importjava.util.HashMap;
- importjava.util.List;
- importjava.util.regex.Matcher;
- importjava.util.regex.Pattern;
/**
 * Downloads a web page and pulls selected pieces out of its HTML (title,
 * links, scripts, style blocks) using regular expressions.
 *
 * <p>The extraction is purely regex-based; no HTML parser is involved, so
 * only markup that matches the simple patterns below is recognised. Tag
 * names are matched case-insensitively and {@code .} also matches line
 * terminators, so elements may be written in any case and may span lines.
 */
public class WebContent {

    /** Flags shared by the element-extraction patterns. */
    private static final int HTML_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    /** A complete {@code <title>...</title>} element (shortest match). */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", HTML_FLAGS);

    /**
     * A complete anchor element whose {@code href} value is double-quoted,
     * single-quoted, or unquoted (in which case it ends at whitespace or
     * {@code >}).
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", HTML_FLAGS);

    /** A complete {@code <script ...>...</script>} element (shortest match). */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", HTML_FLAGS);

    /** A complete {@code <style ...>...</style>} element (shortest match). */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", HTML_FLAGS);

    /** Anything between {@code <} and the next {@code >} on the same line. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the entire content of one web page.
     *
     * <p>The response is decoded as UTF-8 and read line by line. Lines are
     * concatenated <em>without</em> any separator, so the returned string
     * contains no line terminators.
     *
     * @param htmlurl absolute URL of the page to fetch
     * @return the page content with all line breaks removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL; a
     *         hint is printed to standard output before the exception is
     *         rethrown
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(String htmlurl) throws IOException {
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when readLine() fails part-way through the page.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(htmlurl).openStream(), "utf-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        } catch (MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        // Other IOExceptions propagate unchanged; reporting is left to the caller.
        return page.toString();
    }

    /**
     * Extracts the page title.
     *
     * <p>Every {@code <title>...</title>} element found in {@code s} is
     * concatenated in document order and the markup is then removed with
     * {@link #outTag(String)}.
     *
     * @param s HTML text to search
     * @return the title text, or an empty string when no title element is found
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public String getTitle(String s) {
        StringBuilder joined = new StringBuilder();
        for (String element : findAll(TITLE_PATTERN, s)) {
            joined.append(element);
        }
        return outTag(joined.toString());
    }

    /**
     * Extracts all hyperlinks.
     *
     * @param s HTML text to search
     * @return every complete anchor element ({@code <a ... href=...>...</a>})
     *         in document order, markup included; empty when there is none
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public List<String> getLink(String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts all script blocks.
     *
     * @param s HTML text to search
     * @return every complete {@code <script>} element in document order,
     *         markup included; empty when there is none
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts all embedded style sheets.
     *
     * @param s HTML text to search
     * @return every complete {@code <style>} element in document order,
     *         markup included; empty when there is none
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup tags.
     *
     * <p>Each run from a {@code <} to the nearest following {@code >} on the
     * same line is deleted; a tag that spans a line break is left in place.
     *
     * @param s text possibly containing tags
     * @return {@code s} with the matched tags removed
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Collects, in order, the full text of every match of {@code pattern} in {@code s}. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> matches = new ArrayList<String>();
        Matcher m = pattern.matcher(s);
        while (m.find()) {
            matches.add(m.group());
        }
        return matches;
    }
}








发表评论
◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。