当前位置:文档之家› 如何抓取网页数据

如何抓取网页数据

网页源码中规则数据的获取过程:第一步:获取网页源码。

第二步:使用正则表达式匹配抽取所需要的数据。

第三步:将结果进行保存。

这里只介绍第一步。

需要用到的类:System.Net.HttpWebRequest;System.Net.HttpWebResponse;System.IO.Stream;System.IO.StreamReader;System.IO.FileStream。通过C#程序来获取访问页面的内容(网页源代码)并实现将内容保存到本机的文件中。

方法一是通过System.Net命名空间的两个关键的类 System.Net.HttpWebRequest 和 System.Net.HttpWebResponse 来实现的。

具体代码如下
方案0:网上的代码,看明白这个就可以用方案一和方案二了

// State shared by WriteStream and WriteFile.
HttpWebRequest httpReq;
HttpWebResponse httpResp;
string strBuff = "";
char[] cbuffer = new char[256];
int byteRead = 0;
string filename = @"c:\log.txt";

/// <summary>
/// Downloads the page whose address is in <c>txtURL</c> and shows its source in <c>txtHTML</c>.
/// </summary>
/// <remarks>
/// Nothing is caught here: an invalid URL or a failed request propagates to the caller.
/// The response, its stream and the reader are always released, even when a read fails.
/// </remarks>
public void WriteStream()
{
    Uri httpURL = new Uri(txtURL.Text);

    // HttpWebRequest derives from WebRequest and is not constructed directly;
    // it is obtained from WebRequest.Create and cast to the HTTP-specific type.
    httpReq = (HttpWebRequest)WebRequest.Create(httpURL);

    // Collect the page in a StringBuilder instead of concatenating strings per chunk.
    StringBuilder page = new StringBuilder();

    // GetResponse() sends the request; the result is cast to HttpWebResponse.
    httpResp = (HttpWebResponse)httpReq.GetResponse();
    try
    {
        // GetResponseStream() returns the response body as a System.IO.Stream
        // (it raises ProtocolViolationException when there is no response stream).
        // The body is decoded as UTF-8 and read in 256-character chunks until
        // Read reports that nothing is left.
        using (Stream respStream = httpResp.GetResponseStream())
        using (StreamReader respStreamReader = new StreamReader(respStream, Encoding.UTF8))
        {
            byteRead = respStreamReader.Read(cbuffer, 0, cbuffer.Length);
            while (byteRead != 0)
            {
                page.Append(cbuffer, 0, byteRead);
                byteRead = respStreamReader.Read(cbuffer, 0, cbuffer.Length);
            }
        }
    }
    finally
    {
        // Release the connection whether or not reading succeeded.
        httpResp.Close();
    }

    // Assign rather than append, so a second download replaces the previous page
    // instead of being glued to the end of it.
    strBuff = page.ToString();
    txtHTML.Text = strBuff;
}

/// <summary>
/// Saves the text currently shown in <c>txtHTML</c> to <c>filename</c>, encoded as UTF-8.
/// </summary>
public void WriteFile()
{
    byte[] byteSave = Encoding.UTF8.GetBytes(txtHTML.Text);

    // FileMode.Create truncates an existing file. FileMode.OpenOrCreate would keep the
    // old length and leave stale bytes behind when the new page is shorter.
    using (FileStream fileStream = new FileStream(filename, FileMode.Create, FileAccess.Write))
    {
        fileStream.Write(byteSave, 0, byteSave.Length);
    }
}

/// <summary>
/// Click handler: downloads the page, then writes it to disk.
/// </summary>
private void btnwrite_Click(object sender, EventArgs e)
{
    WriteStream();
    WriteFile();
}

方案一、如果不需要POST参数可用下边的方案
由于是在WinForm下写的代码,因此会有using System.Windows.Forms;命名空间,错误提示框需要这个命名空间。

using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Data; // optional; nothing below uses it
using System.Net;
using System.IO;
using System.Text.RegularExpressions; // namespace for regular expressions
using System.Windows.Forms;

/// <summary>
/// Downloads the page at the given address and returns its source.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The page source decoded as GB2312, or <c>null</c> when the request or the read fails.
/// </returns>
/// <remarks>
/// The <see cref="Uri"/> is built outside the try block, so a malformed or null
/// <paramref name="url"/> throws instead of returning <c>null</c>.
/// </remarks>
public static string ReadHtml(string url)
{
    Uri uri = new Uri(url);
    try
    {
        HttpWebRequest request = WebRequest.Create(uri) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returned a non-HTTP request (e.g. file:// or ftp://).
            return null;
        }

        request.KeepAlive = false;
        request.Proxy = null;

        // The using blocks release the response and reader even when the read throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: report the failure as null, but leave a trace of the cause.
        System.Diagnostics.Debug.WriteLine("ReadHtml failed: " + ex.Message);
        return null;
    }
}

方案二、如果网页需要POST参数可用下边的方案

using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Data;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Windows.Forms;

/// <summary>
/// Sends a form-encoded POST request and returns the response body.
/// </summary>
/// <param name="parms">Request body, already in application/x-www-form-urlencoded form.</param>
/// <param name="url">Address to post to.</param>
/// <returns>
/// The response body decoded as GB2312, or <c>null</c> when sending the request
/// or reading the response fails.
/// </returns>
/// <remarks>
/// Creating the request and encoding <paramref name="parms"/> happen outside the try
/// blocks, so a malformed <paramref name="url"/> or a null argument throws.
/// </remarks>
public static string Submit(string parms, string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Proxy = null;
    req.KeepAlive = false;
    // NOTE(review): this is a process-wide setting, re-applied on every call.
    System.Net.ServicePointManager.DefaultConnectionLimit = 100;

    // The body encoding should match the encoding the target page expects.
    byte[] data = System.Text.Encoding.GetEncoding("GBK").GetBytes(parms);

    // NOTE(review): int.MaxValue milliseconds means the call effectively never times out.
    req.Timeout = int.MaxValue;
    req.Method = "POST";
    req.ContentType = @"application/x-www-form-urlencoded";
    req.ContentLength = data.Length;

    // Send the request body.
    try
    {
        using (Stream writer = req.GetRequestStream())
        {
            writer.Write(data, 0, data.Length);
        }
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine("Submit failed while sending the request: " + ex.Message);
        return null;
    }

    // Read the response body.
    try
    {
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine("Submit failed while reading the response: " + ex.Message);
        return null;
    }
}

相关主题