首先介绍每个类的功能:DownloadPage.java 的功能是下载此超链接的页面源代码。FunctionUtils.java 的功能是提供不同的静态方法,包括:页面链接正则表达式匹配,获取URL链接的元素,判断是否创建文件,获取页面的Url并将其转换为规范的Url,截取网页源文件的目标内容。
HrefOfPage.java 的功能是获取页面源代码的超链接。
UrlDataHanding.java 的功能是整合各个类,实现从获取url、下载页面到数据处理的完整流程。
UrlQueue.java 是未访问Url队列。
VisitedUrlQueue.java 是已访问过的URL队列。
下面介绍一下每个类的源代码:DownloadPage.java 此类要用到HttpClient组件。
1.package com.sreach.spider;2.3.import java.io.IOException;4.import org.apache.http.HttpEntity;5.import org.apache.http.HttpResponse;6.import org.apache.http.client.ClientProtocolException;7.import org.apache.http.client.HttpClient;8.import org.apache.http.client.methods.HttpGet;9.import org.apache.http.impl.client.DefaultHttpClient;10.import org.apache.http.util.EntityUtils;11.12.public class DownloadPage13.{14.15. /**16. * 根据URL抓取网页内容17. *18. * @param url19. * @return20. */21. public static String getContentFormUrl(String url)22. {23. /* 实例化一个HttpClient客户端 */24. HttpClient client = new DefaultHttpClient();25. HttpGet getHttp = new HttpGet(url);26.27. String content = null;28.29. HttpResponse response;30. try31. {32. /*获得信息载体*/33. response = client.execute(getHttp);34. HttpEntity entity = response.getEntity();35.36. VisitedUrlQueue.addElem(url);37.38. if (entity != null)39. {40. /* 转化为文本信息 */41. content = EntityUtils.toString(entity);42.43. /* 判断是否符合下载网页源代码到本地的条件 */44. if (FunctionUtils.isCreateFile(url)45. && FunctionUtils.isHasGoalContent(content) !=-1)46. {47. FunctionUtils.createFile(FunctionUtils48. .getGoalContent(content), url);49. }50. }51.52. } catch (ClientProtocolException e)53. {54. e.printStackTrace();55. } catch (IOException e)56. {57. e.printStackTrace();58. } finally59. {60. client.getConnectionManager().shutdown();61. }62.63. return content;64. }65.66.}复制代码FunctionUtils.java 此类的方法均为static方法1.package com.sreach.spider;2.3.import java.io.BufferedWriter;4.import java.io.File;5.import java.io.FileOutputStream;6.import java.io.IOException;7.import java.io.OutputStreamWriter;8.import java.util.regex.Matcher;9.import java.util.regex.Pattern;10.11.public class FunctionUtils12.{13.14. /**15. * 匹配超链接的正则表达式16. */17. private static String pat ="http://www\\.oschina\\.net/code/explore/.*/\\w+\\.[a-zA-Z]+";18. private static Pattern pattern = pile(pat);19.20. private static BufferedWriter writer = null;21.22. /**23. * 爬虫搜索深度24. */25. 
public static int depth = 0;26.27. /**28. * 以"/"来分割URL,获得超链接的元素29. *30. * @param url31. * @return32. */33. public static String[] divUrl(String url)34. {35. return url.split("/");36. }37.38. /**39. * 判断是否创建文件40. *41. * @param url42. * @return43. */44. public static boolean isCreateFile(String url)45. {46. Matcher matcher = pattern.matcher(url);47.48. return matcher.matches();49. }50.51. /**52. * 创建对应文件53. *54. * @param content55. * @param urlPath56. */57. public static void createFile(String content, String urlPath)58. {59. /* 分割url */60. String[] elems = divUrl(urlPath);61. StringBuffer path = new StringBuffer();62.63. File file = null;64. for (int i = 1; i < elems.length; i++)65. {66. if (i != elems.length - 1)67. {68.69. path.append(elems[i]);70. path.append(File.separator);71. file = new File("D:" + File.separator + path.toString());72.73. }74.75. if (i == elems.length - 1)76. {77. Pattern pattern = pile("\\w+\\.[a-zA-Z]+");78. Matcher matcher = pattern.matcher(elems[i]);79. if ((matcher.matches()))80. {81. if (!file.exists())82. {83. file.mkdirs();84. }85. String[] fileName = elems[i].split("\\.");86. file = new File("D:" + File.separator +path.toString()87. + File.separator + fileName[0] + ".txt");88. try89. {90. file.createNewFile();91. writer = new BufferedWriter(newOutputStreamWriter(92. new FileOutputStream(file)));93. writer.write(content);94. writer.flush();95. writer.close();96. System.out.println("创建文件成功");97. } catch (IOException e)98. {99. e.printStackTrace();100. }101.102. }103. }104.105. }106. }107.108. /**109. * 获取页面的超链接并将其转换为正式的A标签110. *111. * @param href112. * @return113. */114. public static String getHrefOfInOut(String href)115. {116. /* 内外部链接最终转化为完整的链接格式 */117. String resultHref = null;118.119. /* 判断是否为外部链接 */120. if (href.startsWith("http://"))121. {122. resultHref = href;123. } else124. {125. /* 如果是内部链接,则补充完整的链接地址,其他的格式忽略不处理,如:a href="#" */126. if (href.startsWith("/"))127. {128. resultHref = "" + href;129. }130. }131.132. 
return resultHref;133. }134.135. /**136. * 截取网页网页源文件的目标内容137. *138. * @param content139. * @return140. */141. public static String getGoalContent(String content) 142. {143. int sign = content.indexOf("<pre class=\"");144. String signContent = content.substring(sign);145.146. int start = signContent.indexOf(">");147. int end = signContent.indexOf("</pre>");148.149. return signContent.substring(start + 1, end);150. }151.152. /**153. * 检查网页源文件中是否有目标文件154. *155. * @param content156. * @return157. */158. public static int isHasGoalContent(String content) 159. {160. return content.indexOf("<pre class=\"");161. }162.163.}复制代码HrefOfPage.java 此类为获取页面的超链接1.package com.sreach.spider;2.3.public class HrefOfPage4.{5. /**6. * 获得页面源代码中超链接7. */8. public static void getHrefOfContent(String content)9. {10. System.out.println("开始");11. String[] contents = content.split("<a href=\"");12. for (int i = 1; i < contents.length; i++)13. {14. int endHref = contents[i].indexOf("\"");15.16. String aHref =FunctionUtils.getHrefOfInOut(contents[i].substring(17. 0, endHref));18.19. if (aHref != null)20. {21. String href = FunctionUtils.getHrefOfInOut(aHref);22.23. if (!UrlQueue.isContains(href)24. && href.indexOf("/code/explore") != -125. && !VisitedUrlQueue.isContains(href))26. {27. UrlQueue.addElem(href);28. }29. }30. }31.32. System.out.println(UrlQueue.size() + "--抓取到的连接数");33. System.out.println(VisitedUrlQueue.size() + "--已处理的页面数");34.35. }36.37.}复制代码UrlDataHanding.java 此类主要是从未访问队列中获取url,下载页面,分析url,保存已访问url等操作,实现Runnable接口1.package com.sreach.spider;2.3.public class UrlDataHanding implements Runnable4.{5. /**6. * 下载对应页面并分析出页面对应的URL放在未访问队列中。