数据挖掘:下面是一个例子,抓取新浪天气新闻的数据。我做了个程序,把新浪上的天气新闻抓过来存到本地;考虑到访问速度问题,新闻中的图片也要保存到本地。
// 程序如下 —— package 声明在转贴时被截坏(原文为 ".weather1",前缀丢失)— TODO confirm
// package weather1;

/**
 * 正则方式抓取新浪天气新闻上的新闻。
 * 地址 /weather/news/index.html(原始完整域名在转贴时丢失 — TODO confirm)。
 *
 * 修复说明:原代码用面向文本的 getContent(String) + PrintWriter 保存图片,
 * 二进制字节经过 平台字符集解码→再编码 后被破坏,所以落盘的图片大小与
 * HTTP 头里的 Content-Length 不符、无法预览。图片改用纯字节流下载,见
 * saveBinary()。
 */
public class Newlist {

    // 原文用的是 commons-logging(import 已被截坏为 "mons.logging"),
    // 这里换成 JDK 自带的 java.util.logging,字段为私有,不影响对外接口。
    private static final Logger log = Logger.getLogger(Newlist.class.getName());

    // 正则只编译一次,避免每段正文重复 Pattern.compile
    private static final Pattern SRC_PATTERN = Pattern.compile("src=\"(.*?)\"");

    /**
     * 测试入口:打印新闻列表(链接改写为 newinfo2.jsp 形式),再抓取并打印
     * 一篇新闻的正文段落。
     */
    public static void main(String[] args) {
        Newlist n = new Newlist();
        String[] k = n.getNewList();
        for (int i = 0; i < k.length; i++) {
            System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url="));
        }
        String[] m = n.getNewinfo("news/2008/1119/35261.html");
        for (int l = 0; l < m.length; l++) {
            System.out.println(m[l]);
        }
    }

    /**
     * 由 url 地址获得新闻内容 String[](按 &lt;p&gt; 段落切分,最多 30 段)。
     * 新闻中的图片以字节流方式下载到本地,文中图片地址改成本地文件名。
     *
     * @param url 相对新闻地址,如 "news/2008/1119/35261.html"
     * @return 处理后的正文段落数组(实际匹配到多少段就多长,不再有 null 占位)
     */
    public String[] getNewinfo(String url) {
        String URL = "/" + url; // TODO: 原始完整域名在转贴时丢失,需补上站点前缀
        // 30 为段落数上限;analysis 现在按实际匹配数截断,不会再出现 null 元素
        String[] s = analysis("<p>(.*?)</p>", getContent(URL), 30);
        for (int i = 0; i < s.length; i++) {
            Matcher matcher = SRC_PATTERN.matcher(s[i]);
            if (matcher.find()) {
                String rawUrl = matcher.group(1);  // 文中原始 src 值,替换时要用它
                String imageurl = rawUrl;
                if (!imageurl.startsWith("http://")) {
                    imageurl = "/" + imageurl;     // 相对地址补前缀 — TODO 同上
                }
                System.out.println("新闻有图片:" + imageurl);
                String[] images = imageurl.split("/");
                String imagename = images[images.length - 1];
                System.out.println("图片名:" + imagename);
                try {
                    // BUG FIX:原代码 getContent(imageurl) + PrintWriter.println
                    // 把二进制图片当文本写盘,字节被字符编码破坏;改为字节流保存。
                    saveBinary(imageurl, new File(imagename));
                } catch (IOException e) {
                    log.log(Level.SEVERE, "下载图片出错:" + imageurl, e);
                }
                System.out.println("s[i]:" + s[i]);
                // 修改文中图片地址为本地文件名
                s[i] = s[i].replace(rawUrl, imagename);
            }
        }
        return s;
    }

    /** 抓取新闻列表页并解析出各条目。 */
    public String[] getNewList() {
        String url = "/weather/news/index.html"; // TODO: 原始完整域名在转贴时丢失
        return getNewList(getContent(url));
    }

    /** 从列表页 HTML 中取出每个 &lt;li&gt; 条目,最多 50 条。 */
    private String[] getNewList(String content) {
        return analysis("<li>(.*?)</li>", content, 50);
    }

    /**
     * 返回 match 中最多前 i 个满足 pattern 第一捕获组的串。
     *
     * 原实现把结果放进固定长度 i 的数组:匹配多于 i 个会抛
     * ArrayIndexOutOfBoundsException;匹配 0 个时剔除逻辑把 l==0 当成"全满",
     * 返回整个 null 数组。这里改用有界 List,两个问题都消除,返回值语义
     * (按实际匹配数截短的数组)与原意一致。
     */
    private String[] analysis(String pattern, String match, int i) {
        Pattern sp = Pattern.compile(pattern);
        Matcher matcher = sp.matcher(match);
        List<String> found = new ArrayList<String>();
        while (found.size() < i && matcher.find()) {
            found.add(matcher.group(1));
        }
        return found.toArray(new String[found.size()]);
    }

    /**
     * 以字节流方式下载 strUrl 指向的资源并原样写入 dest —— 二进制安全,
     * 不经过任何字符编码转换,落盘大小与 Content-Length 一致。
     *
     * @throws IOException 网络或写盘失败
     */
    private static void saveBinary(String strUrl, File dest) throws IOException {
        URLConnection uc = new URL(strUrl).openConnection();
        // 伪装成浏览器请求,与 getContent 保持一致
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        InputStream in = null;
        OutputStream out = null;
        try {
            in = uc.getInputStream();
            out = new BufferedOutputStream(new FileOutputStream(dest));
            byte[] buf = new byte[4096];
            int n;
            while ((n = in.read(buf)) != -1) {   // read 返回 0 是合法的,只有 -1 才是流结束
                out.write(buf, 0, n);
            }
            out.flush();
        } finally {
            if (in != null) { try { in.close(); } catch (IOException ignored) { } }
            if (out != null) { try { out.close(); } catch (IOException ignored) { } }
        }
    }

    /**
     * 由地址获取网页内容(文本)。
     * 注意:结果经 ByteArrayOutputStream.toString() 按平台默认字符集解码,
     * 只适合文本页面;二进制资源(图片)必须用 saveBinary()。
     * 出错时与原实现一样返回空串,调用方无需判 null。
     *
     * @param strUrl 页面地址
     * @return 页面文本,失败时为 ""
     */
    public static String getContent(String strUrl) {
        String all_content = "";
        try {
            URLConnection uc = new URL(strUrl).openConnection();
            // 通过修改 http 头的 User-Agent 来伪装成是通过浏览器提交的请求
            uc.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            System.out.println("-----------------------------------------");
            System.out.println("Content-Length: " + uc.getContentLength());
            System.out.println("Set-Cookie: " + uc.getHeaderField("Set-Cookie"));
            System.out.println("-----------------------------------------");
            // 获取文件头信息
            System.out.println("Header" + uc.getHeaderFields().toString());
            System.out.println("-----------------------------------------");
            InputStream ins = uc.getInputStream();
            ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
            try {
                byte[] buf = new byte[1024];
                int n;
                // BUG FIX:原代码判断 read(...) > 0,read 返回 0 时会提前结束
                while ((n = ins.read(buf)) != -1) {
                    outputstream.write(buf, 0, n);
                }
            } finally {
                ins.close();   // 原代码不关闭连接流,这里补上
            }
            // 沿用原实现的平台默认字符集;新浪页面实际为 gb2312/GBK — TODO confirm
            all_content = outputstream.toString();
        } catch (Exception e) {
            e.printStackTrace();
            log.severe("获取网页内容出错:" + strUrl);
        }
        System.out.println(all_content.length());
        return all_content;
    }
}

/*
 * 原帖问题:图片下载不全 —— 用后面两种基于字符串的 getContent 方法下载图片,
 * 下来的图片大小都和文件头里的 Content-Length(即图片实际大小)不符,预览不了。
 * 原因:二进制数据经过了 字节→字符→字节 的编码转换;上面的 saveBinary()
 * 改为纯字节流下载,即为此问题的修复。
 * (原代码中还有一个未使用且被截坏的 import ".update.Getdata",已移除。)
 */