当前位置:文档之家› 蜘蛛程序 网络爬虫 源代码

蜘蛛程序 网络爬虫 源代码

有一天突然看了 htmlparser 工具包,发现果然强大。

由于不是很熟悉,所以下面的代码或许写得有点烂。

首先做准备工作先写一个实体beanpackage bean; import java.util.Date;/*** @author Jeson* blog * @date:Oct 9, 2009 3:09:19 PM* @version :1.0**/publicclass Artical {private String title;private String body;private String link;private String author;private String [] tags;private Date dCreate;public String getTitle() {return title;}publicvoid setTitle(String title) {this.title = title;}public String getBody() {return body;}publicvoid setBody(String body) {this.body = body;}public String getLink() {return link;}publicvoid setLink(String link) {this.link = link;}public String getAuthor() {return author;}publicvoid setAuthor(String author) {this.author = author;}public String[] getTags() {return tags;}publicvoid setTags(String[] tags) {this.tags = tags;}public Date getDCreate() {return dCreate;}publicvoid setDCreate(Date create) {dCreate = create;}}2 写一个我们下面要用到的字符串处理类package util;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import java.util.regex.Matcher;import java.util.regex.Pattern;/*** @author Jeson* blog * @date:Oct 9, 2009 3:09:19 PM * @version:1.0 */publicclass StringUtil {/*** 使用正则匹配字符串** @param regex* 正则表达式* @param txt* 要验证的字符串* @return匹配则返回真否则返回假*/publicstaticboolean useRegex(String regex, String txt) {Pattern p = pile(regex);Matcher m = p.matcher(txt);return m.matches();}/*** 使用正则匹配字符串** @param regex* 正则表达式 ** @param index* 要取第几个元素* @param txt* 要验证的字符串* @return返回匹配的字符串*/publicstatic String getByRegex(String regex, int index, String txt) {Pattern p = pile(regex);Matcher m = p.matcher(txt);if (m.find()) {return m.group(index);}returnnull;}/*** 使用正则匹配字符串** @param regex* 正则表达式 ** @param index* 要取第几个元素* @param txt* 要验证的字符串* @return返回匹配的字符串数组*/publicstatic String [] getStringsByRegex(String regex, int[] index, String txt) {String res [] = new String[index.length];Pattern p = pile(regex);Matcher m = p.matcher(txt);if (m.find()) {for(int i : index){res[i] = 
m.group(i);}}return res;}}3 下面是我们的核心类他会去抓取cnblogs的页面并保存package test; import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStream;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.util.NodeList;import bean.Artical;import util.StringUtil;/*** @author Jeson* @blog * @date:Oct 9, 2009 1:08:10 PM* @version :1.0**/publicclass Parse {privatestaticfinalint MAX_PAGE = 20;privatefinal String ENCODING = "UTF-8";/*** @param args*/publicstaticvoid main(String[] args) {try {for(int i=1;i<MAX_PAGE;i++){new Parse().testAttribute(i);}} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}}privatevoid testAttribute(int pa) throws Exception{System.out.println("————开始解析页面:"+pa);Parser p = new Parser();p.setURL("/cate/java/?page="+pa);p.setEncoding("UTF-8");NodeFilter filter = new HasAttributeFilter("class","titlelnk");NodeList list = p.extractAllNodesThatMatch(filter);System.out.println(list.size());int cou = 0;for(int i=0 ; i<list.size();i++){String html = list.elementAt(i).toHtml(true);int [] index = {0,1,2};String [] bs = StringUtil.getStringsByRegex("<aclass=\\"titlelnk\\" href=\\"(.*)\\" target=\\"_blank\\">(.*)</a>", index, html);String title = bs[2];String url = bs[1];System.out.println(url);String content = getContent(bs[1]);if(content == null || "".equals(content)){continue;}Artical art = new Artical();art.setTitle(title);art.setBody(content);art.setLink(url);createFile(art);System.out.println("=========="+(i+1)+"============");System.out.println("title==>"+bs[2]);System.out.println("url==>"+bs[1]);System.out.println("content==>"+getContent(bs[1]));System.out.println("======================");System.out.println();cou++;}System.out.println("over"+cou);}private String getContent(String url) throws Exception{Parser p = new Parser();p.setURL(url);p.setEncoding(ENCODING);NodeFilter filter = 
new HasAttributeFilter("class","post");NodeList list = p.extractAllNodesThatMatch(filter);String a = list.toHtml();return a;}privatevoid createFile(Artical art){try {File d = new File("d:\\\\cnblog");if(!d.exists()){d.mkdir();}File f = newFile("d:\\\\cnblog\\\\"+art.getTitle()+".html");if(!f.exists()){f.createNewFile();System.out.println("——–>"+art.getTitle()+"文件已经创建");}OutputStream file = new FileOutputStream(f.getPath());file.write(art.getBody().getBytes());file.flush();file.close();System.out.println("文件写入完毕,地址"+f.getPath());} catch (FileNotFoundException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();System.out.println(art.getLink()+" "+art.getTitle()+"文件写入失败");}}}。

相关主题