好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

C#使用正则表达式抓取网站信息示例

本文通过实例讲述 C# 使用正则表达式抓取网站信息的方法,供大家参考。具体如下:

这里以抓取京东商城商品详情为例。

1、创建JdRobber.cs程序类

?

/// <summary>
/// Scrapes the title, main picture and price of a JD.com product page and
/// prints them to the console. Page download and regex extraction are
/// delegated to <c>WebHandler</c>.
/// </summary>
public class JdRobber
{
    // Shape of a product-detail URL. The named group "Object" captures the
    // numeric product ID. Dots are escaped so they only match a literal '.'.
    private const string ProductUrlPattern = @"^http://item\.jd\.com/(?<Object>\d+)\.html$";

    // Captures the text of an <h1> that follows the element with id="name".
    private const string TitlePattern = @"<div.*id=\""name\"".*>[\s\S]*<h1>(?<Object>.*?)</h1>";

    // Captures the src attribute of an <img> tag.
    private const string PicturePattern = @"<img.*src=\""(?<Object>.*?)\"".*/>";

    // Captures the value of the "p" field in the price service's JSON reply.
    private const string PricePattern = @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""";

    /// <summary>
    /// Determines whether a URL is a JD.com product-detail link.
    /// </summary>
    /// <param name="url">The URL to check; may be null or empty.</param>
    /// <returns>
    /// <c>true</c> when the URL has the form <c>http://item.jd.com/{digits}.html</c>;
    /// otherwise <c>false</c>.
    /// </returns>
    public bool ValidationUrl(string url)
    {
        if (String.IsNullOrEmpty(url))
        {
            return false;
        }

        return Regex.IsMatch(url, ProductUrlPattern);
    }

    /// <summary>
    /// Downloads a JD.com product page, extracts its title, main picture and
    /// price, and writes them to the console. Does nothing when the URL is
    /// not a product link or the page could not be downloaded.
    /// </summary>
    /// <param name="url">The product-detail URL.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product ID taken from the URL itself.
        string sourceWebID = WebHandler.GetRegexText(url, ProductUrlPattern);

        // Title.
        string title = WebHandler.GetRegexText(htmlStr, TitlePattern);

        // Picture: search only inside the "spec-n1" container, up to its first
        // closing </div>. Both markers must be found, in that order.
        string picName = "";
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin >= 0)
        {
            int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
            if (end > begin)
            {
                string subPicHtml = htmlStr.Substring(begin, end - begin);
                picName = WebHandler.GetRegexText(subPicHtml, PicturePattern);
            }
        }

        // Price: not read from the page; it is requested from a separate
        // endpoint keyed by the product ID.
        decimal price = 0;
        if (!String.IsNullOrEmpty(sourceWebID))
        {
            string priceUrl = "http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            price = WebHandler.GetValidPrice(WebHandler.GetRegexText(priceJson, PricePattern));
        }

        Console.WriteLine("商品名称:{0}", title);
        Console.WriteLine("图片:{0}", picName);
        Console.WriteLine("价格:{0}", price);
    }
}

2、创建WebHandler.cs公共方法类

?

/// <summary>
/// Shared helpers: page download, regex capture extraction and price parsing.
/// </summary>
public class WebHandler
{
    // An unsigned number with an optional fraction of one or two digits.
    private static readonly Regex ValidPricePattern = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the body of a URL as text. This is a best-effort call: any
    /// failure is traced and reported to the caller as an empty string.
    /// </summary>
    /// <param name="url">Address to download; null or empty yields an empty string.</param>
    /// <param name="encoding">
    /// "UTF8" decodes the response as UTF-8; "Default" or any other value
    /// uses <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>The response text, or an empty string when nothing could be read.</returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when
            // reading fails part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Callers treat an empty string as "no data", so the failure is
            // not rethrown; it is traced instead of being dropped silently.
            System.Diagnostics.Trace.TraceWarning(
                "WebHandler.GetHtmlStr failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Runs a case-insensitive regex against the input and returns the text
    /// captured by the group named "Object" in the first match.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression containing a group named "Object".</param>
    /// <returns>
    /// The captured text, or an empty string when either argument is null or
    /// empty or the pattern does not match.
    /// </returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string to a decimal.
    /// </summary>
    /// <param name="strPrice">
    /// Digits with an optional '.' followed by one or two digits, e.g. "12" or "12.50".
    /// </param>
    /// <returns>The parsed price, or 0 when the text is null, empty or not in that format.</returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !ValidPricePattern.IsMatch(strPrice))
        {
            return 0;
        }

        // Parse with the invariant culture so '.' is always the decimal point,
        // regardless of the regional settings of the machine running this.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);

        return parsed ? price : 0;
    }
}

希望本文所述对大家C#程序设计有所帮助。

dy("nrwz");

查看更多关于C#使用正则表达式抓取网站信息示例的详细内容...

  阅读:42次