java抓取网页内容三种方式2011-12-05 11:23一、GetURL.javaimport java.io.*;import .*;public class GetURL {public static void main(String[] args) {InputStream in = null;OutputStream out = null;try {// 检查命令行参数if ((args.length != 1)&& (args.length != 2))throw new IllegalArgumentException("Wrong number of args");URL url = new URL(args[0]); //创建 URLin = url.openStream(); // 打开到这个URL的流if (args.length == 2) // 创建一个适当的输出流out = new FileOutputStream(args[1]);else out = System.out;// 复制字节到输出流byte[] buffer = new byte[4096];int bytes_read;while((bytes_read = in.read(buffer)) != -1)out.write(buffer, 0, bytes_read);}catch (Exception e) {System.err.println(e);System.err.println("Usage: java GetURL <URL> [<filename>]");}finally { //无论如何都要关闭流try { in.close(); out.close(); } catch (Exception e) {}}}}运行方法:C:\java>java GetURL http://127.0.0.1:8080/kj/index.html index.html 二、geturl.jsp<%@ page import="java.io.*" contentType="text/html;charset=gb2312" %> <%@ page language="java" import=".*"%><%String htmpath=null;BufferedReader in = null;InputStreamReader isr = null;InputStream is = null;PrintWriter pw=null;HttpURLConnection huc = null;try{htmpath=getServletContext().getRealPath("/")+"html\\morejava.html"; pw=new PrintWriter(htmpath);URL url = new URL("http://127.0.0.1:8080/kj/morejava.jsp"); //创建 URL huc = (HttpURLConnection)url.openConnection();is = huc.getInputStream();isr = new InputStreamReader(is);in = new BufferedReader(isr);String line = null;while(((line = in.readLine()) != null)) {if(line.length()==0)continue;pw.println(line);}}catch (Exception e) {System.err.println(e);}finally { //无论如何都要关闭流try { is.close(); isr.close();in.close();huc.disconnect();pw.close(); } catch (Exception e) {}}%>OK--,创建文件成功三、HttpClient.javaimport java.io.*;import .*;public class HttpClient {public static void main(String[] args) {try {// 检查命令行参数if ((args.length != 1) && (args.length != 2))throw new IllegalArgumentException("Wrong number of args");OutputStream to_file;if (args.length == 2)to_file = new FileOutputStream(args[1]);//输出到文件elseto_file = System.out;//输出到控制台URL url = new URL(args[0]);String protocol = url.getProtocol();if (!protocol.equals("http"))throw new IllegalArgumentException("Must use 'http:' protocol"); String host = url.getHost();int port = url.getPort();if (port == -1) port = 80;String filename = url.getFile();Socket socket = new Socket(host, port);//打开一个socket连接InputStream from_server = socket.getInputStream();//获取输入流PrintWriter to_server = new PrintWriter(socket.getOutputStream());//获取输出流to_server.print("GET " + filename + "\n\n");//请求服务器上的文件to_server.flush(); // Send it right now!byte[] buffer = new byte[4096];int bytes_read;//读服务器上的响应,并写入文件。
while((bytes_read = from_server.read(buffer)) != -1)to_file.write(buffer, 0, bytes_read);socket.close();to_file.close();}catch (Exception e) {System.err.println(e);System.err.println("Usage: java HttpClient <URL> [<filename>]");}}}运行方法:C:\java>java HttpClient http://127.0.0.1:8080/kj/index.html index.html注意中文可能会显示乱码,在得到源码后,应该做相应的转码工作,例如:public static String GetURLstr(String strUrl){InputStream in = null;OutputStream out = null;String strdata = "";try{URL url = new URL(strUrl); // 创建 URLin = url.openStream(); // 打开到这个URL的流out = System.out;// 复制字节到输出流byte[] buffer = new byte[4096];int bytes_read;while ((bytes_read = in.read(buffer)) != -1){String reads = new String(buffer, 0, bytes_read, "UTF-8");//System.out.print(reads);strdata = strdata + reads;// out.write(buffer, 0, bytes_read);}in.close();out.close();return strdata;}catch (Exception e){System.err.println(e);System.err.println("Usage: java GetURL <URL> [<filename>]"); return strdata;}。