Crawling data from a specified website in Java
1. This class parses the site's content. The key is the CSS selector, e.g. "div#page>div#content>div#local>div#recommend>ul>li>a". Use Firefox's Firebug extension to inspect a page's HTML structure; the selector path differs from page to page.
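As a minimal illustration of how such a Firebug-derived selector is applied with Jsoup (this demo is not part of the original listing; the URL and selector here are placeholders):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo
{
    public static void main(String[] args) throws Exception
    {
        // Fetch and parse the page in one step (placeholder URL).
        Document doc = Jsoup.connect("https://example.com/").get();
        // Walk every <a> matched by a Firebug-derived CSS path (placeholder selector).
        for (Element link : doc.select("div#content > ul > li > a"))
        {
            System.out.println(link.text() + " -> " + link.absUrl("href"));
        }
    }
}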
Java code (the complete crawler):
package zy.crawl.hupu;

import java.io.IOException;

import zy.crawl.common.*; // reconstructed: the package name matches the package declaration of NewsInfo below

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient; // missing in the source but required by the code below
import org.apache.http.params.CoreConnectionPNames;   // missing in the source but required by the code below
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class CrawlHupu
{
    private List<NewsInfo> newsList = new ArrayList<>(); // stores the crawled news objects

    // Sets up the network connection and fetches the page source; this part is boilerplate usage.
    public String GetHtml(String url)
    {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();

        // set proxy, because of nsn
        // HttpHost proxy = new HttpHost("10.68.120.11", 3128);
        // httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

        // configure the connection timeout
        httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);

        HttpGet httpGet = new HttpGet(url);
        try
        {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int resStatu = httpResponse.getStatusLine().getStatusCode();
            if (resStatu == HttpStatus.SC_OK)
            {
                HttpEntity entity = httpResponse.getEntity();
                if (entity != null)
                {
                    html = EntityUtils.toString(entity);
                }
            }
        }
        catch (IOException e) // the original error handling is missing from the source; a standard handler is assumed
        {
            e.printStackTrace();
        }

        return html;
    }

    public void ParseHtmlForNewsList()
    {
        String html = GetHtml("/"); // the target URL was stripped when the page was scraped; only "/" survives

        // For hupu voice, the first selector can temporarily drop one css class, so no whitespace handling is needed.
        // String cssQueryHupu = "div.content>div.row>div.column>div.row>div.column>div.uibox>div.uibox-con>ul.ui-list>li>a";
        String cssQueryHupu = "div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a"; // this selector fetches each item's title link
        // String cssQueryHuxiu = "div.container-hx>div.row-fluid-wrap-hx>"
        //
        // String cssQueryIteye = "div#page>div#content>div#local>div#recommend>ul>li>a";
        if (!html.isEmpty())
        {
            Document doc = Jsoup.parse(html, "/"); // the base-URI argument was stripped too; it is what lets absUrl("href") resolve relative links
            Elements linkElements = doc.select(cssQueryHupu); // reconstructed: implied by the loop below
            for (Element ele : linkElements)
            {
                NewsInfo newsTemp = new NewsInfo(ele.text(), ele.absUrl("href"));
                PaserHtmlForNewsContent(newsTemp.getHtmlAddr(), newsTemp);
                newsList.add(newsTemp); // reconstructed: implied by the newsList field and the commented print below
                if (newsTemp.getImageAddrList() != null)
                    System.out.println(newsTemp.getImageAddrList().get(0));
                System.out.println(newsTemp.getContent());
            } // System.out.println(newsList.get(0).getContent());
        }
    }

    // Follows the title link obtained above and scrapes the article body.
    public void PaserHtmlForNewsContent(String contentHtmlAddr, NewsInfo newsTemp)
    {
        String html = GetHtml(contentHtmlAddr); // reconstructed: implied by the html checks below
        String cssQueryContent =
                // + "div#pageMain>div.pageMainLeft>div.detailWrap>div.detailIntr"
                "div#pageMain>div.pageMainLeft>div.detailWrap>div.detail";
        // String cssQueryContent = "div.content>div.row>div.column>div#articlewrap.area";
        // String cssQueryphoto = "div.hp-wrap>div.voice-main>div.voice-item>ul>li>div.voice-read-detailed>div.voice-photoVideo>"
        //         + "div.voice-photo>div.small-img>img";
        String cssQueryphoto = "img"; // the live image selector was lost in the source; "img" is a stand-in placeholder
        if (!html.isEmpty())
        {
            Document doc = Jsoup.parse(html);
            Elements contentElements = doc.select(cssQueryContent);
            Elements imgElements = doc.select(cssQueryphoto);
            for (Element ele : contentElements)
            {
                newsTemp.setContent(ele.html());
            }
            // The list is created once, outside the loop, so every image address is kept;
            // it is only attached when at least one image exists, preserving the caller's null check.
            List<String> tempImgList = new ArrayList<>();
            for (Element ele : imgElements)
            {
                tempImgList.add(ele.attr("src"));
                newsTemp.setImageAddrList(tempImgList);
            }
        }
    }

    public static void main(String[] args)
    {
        CrawlHupu crawlHupu = new CrawlHupu();
        crawlHupu.ParseHtmlForNewsList();
    }
}
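A note on the listing above: DefaultHttpClient and CoreConnectionPNames are deprecated as of HttpClient 4.3. Below is a minimal sketch of the same GetHtml logic on the newer API, assuming the httpclient 4.3+ jar is available; the class and method names here are this sketch's own, not the original author's:

import java.io.IOException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class FetchDemo
{
    // Same role as GetHtml above, on the non-deprecated 4.3+ API.
    public static String getHtml(String url)
    {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(20000) // matches the 20-second timeout above
                .build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);
        // try-with-resources closes the client and response automatically
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet))
        {
            if (response.getStatusLine().getStatusCode() == 200)
                return EntityUtils.toString(response.getEntity());
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        return null;
    }
}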
2. This is the class that holds the information being fetched; it needs little explanation.

Java code:
package zy.crawl.common;

import java.util.List;

public class NewsInfo
// the rest of the class body was lost when the page was scraped; a sketch follows below
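Since the body of NewsInfo is missing from the source, here is a minimal sketch inferred from how CrawlHupu uses the class: the constructor takes the title text and the article link, and the content and image-list accessors match the calls above. The field names are assumptions:

package zy.crawl.common;

import java.util.List;

public class NewsInfo
{
    private String title;               // assumed field: the headline text passed to the constructor
    private String htmlAddr;            // link to the article page
    private String content;             // article body HTML
    private List<String> imageAddrList; // image URLs found in the article

    public NewsInfo(String title, String htmlAddr)
    {
        this.title = title;
        this.htmlAddr = htmlAddr;
    }

    public String getTitle() { return title; }
    public String getHtmlAddr() { return htmlAddr; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
    public List<String> getImageAddrList() { return imageAddrList; }
    public void setImageAddrList(List<String> imageAddrList) { this.imageAddrList = imageAddrList; }
}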