好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

C#网络爬虫代码分享 C#简单的爬取工具

公司编辑妹子需要爬取网页内容,叫我帮忙做了一个简单的爬取工具

这是爬取网页内容,相信这对大家来说都不难,但是这里有一些小改动,代码献上,供大家参考

?

/// <summary>
/// Downloads the HTML source of a page. First tries UTF-8 decoding; if the
/// request or decoding fails, retries once with GB2312 (common for older
/// Chinese sites), accepting an HTTP error response body on that retry.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The page source, or string.Empty if both attempts fail.</returns>
private string gethttpwebrequest(string url)
{
    try
    {
        // Primary attempt: UTF-8, only successful responses.
        return FetchHtml(url, Encoding.GetEncoding("utf-8"), false);
    }
    catch
    {
        try
        {
            // Fallback: GB2312, and also read the body of error responses
            // (some anti-crawler sites return content with a non-2xx status).
            return FetchHtml(url, Encoding.GetEncoding("gb2312"), true);
        }
        catch
        {
            return string.Empty;
        }
    }
}

/// <summary>
/// Performs a single GET request and decodes the body with the given encoding.
/// All headers are set BEFORE the request is sent (the original code fired a
/// throw-away request first, leaking a connection and sending no headers).
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <param name="useErrorResponse">
/// When true, a WebException's attached response body is read instead of rethrowing.
/// </param>
private static string FetchHtml(string url, Encoding encoding, bool useErrorResponse)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // NOTE: value no longer carries the stray "user-agent:" prefix the
    // original embedded inside the header value.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    HttpWebResponse response;
    try
    {
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException ex)
    {
        // Servers that answer with 4xx/5xx still ship a body we may want.
        if (!useErrorResponse || ex.Response == null)
        {
            throw;
        }
        response = (HttpWebResponse)ex.Response;
    }

    // using-blocks guarantee the stream/response are closed even on a
    // decoding failure (the original leaked them on any exception).
    using (response)
    using (Stream receiveStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(receiveStream, encoding))
    {
        return reader.ReadToEnd();
    }
}

这是根据 URL 爬取网页源码的方法,有一些小改动:很多网页有不同的编码格式,甚至有些网站做了反爬取的防范,这个方法经过简单改动也能正常爬取

以下是爬取网页所有的网址链接

?

/// <summary>

    /// 提取html代码中的网址

    /// </summary>

    /// <param name="htmlcode"></param>

    /// <returns></returns>

/// <summary>
/// 提取 html 代码中的网址。
/// Extracts all hyperlink URLs from an HTML document. Relative hrefs
/// (starting with "/" or "./") are rewritten against the site root derived
/// from <paramref name="url"/>; hrefs with no scheme at all are resolved
/// against the URL's directory. "javascript:" pseudo-links are skipped.
/// </summary>
/// <param name="htmlcode">Raw HTML source to scan.</param>
/// <param name="url">The page's own URL, used to absolutize relative links.</param>
/// <returns>Distinct absolute URLs in document order.</returns>
private static List<string> gethyperlinks(string htmlcode, string url)
{
    List<string> foundLinks = new List<string>();

    // Site root, e.g. "http://example.com/" — used to absolutize "/path" hrefs.
    string siteRoot = Regex.Match(url, @"http(s)?://([\w-]+\.)+[\w-]+/?").Value;

    // Rewrite root-relative and "./" hrefs so the URL regex below can see them.
    string content = htmlcode
        .Replace("href=\"/", "href=\"" + siteRoot)
        .Replace("href='/", "href='" + siteRoot)
        .Replace("href=/", "href=" + siteRoot)
        .Replace("href=\"./", "href=\"" + siteRoot);

    // [aA] restores the original intent — the scraped source's [aa] matched
    // only lowercase <a> tags. Requires a space before href, as before.
    MatchCollection anchorTags =
        Regex.Matches(content, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);

    foreach (Match anchor in anchorTags)
    {
        // [a-zA-Z] restores the garbled [a-za-z] class: match any scheme.
        MatchCollection absoluteUrls =
            Regex.Matches(anchor.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);

        if (absoluteUrls.Count > 0)
        {
            CollectLinks(absoluteUrls, foundLinks);
        }
        else if (anchor.Value.IndexOf("javascript") == -1)
        {
            // Scheme-less href: resolve against the page's directory.
            string baseDirectory = url.Substring(0, url.LastIndexOf("/") + 1);
            string patched = anchor.Value
                .Replace("href=\"", "href=\"" + baseDirectory)
                .Replace("href='", "href='" + baseDirectory);
            CollectLinks(
                Regex.Matches(patched, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline),
                foundLinks);
        }
    }

    return foundLinks;
}

/// <summary>
/// Strips quote/bracket residue from each regex match and appends it to
/// <paramref name="target"/> if not already present (de-duplication).
/// </summary>
private static void CollectLinks(MatchCollection matches, List<string> target)
{
    foreach (Match m in matches)
    {
        string link = m.Value
            .Replace("\"", "")
            .Replace("'", "")
            .Replace(">", "")
            .Replace(";", "");
        if (!target.Contains(link))
        {
            target.Add(link);
        }
    }
}

这块的技术其实就是简单的使用了正则去匹配!接下来献上获取标题,以及存储到xml文件的方法

?

/// <summary>

     /// // 把网址写入xml文件

     /// </summary>

     /// <param name="strurl"></param>

     /// <param name="alhyperlinks"></param>

/// <summary>
/// // 把网址写入xml文件
/// Writes the harvested URLs to d:\hyperlinks.xml. Each URL becomes an
/// element named after its domain suffix (via getdomain) with the URL as text.
/// </summary>
/// <param name="strurl">The source page URL, recorded in an XML comment.</param>
/// <param name="alhyperlinks">The URLs to persist.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks)
{
    // using ensures the writer (and underlying file handle) is closed even
    // if an exception occurs mid-write — the original leaked it on failure.
    using (XmlTextWriter writer = new XmlTextWriter(@"d:\hyperlinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strurl + "的超链接");
        writer.WriteStartElement("hyperlinks");
        writer.WriteStartElement("hyperlinks", null);
        writer.WriteAttributeString("datetime", DateTime.Now.ToString());

        foreach (string str in alhyperlinks)
        {
            // Element name is the domain suffix ("com", "net", ... or "other").
            writer.WriteElementString(getdomain(str), null, str);
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}

     /// <summary>

     /// 获取网址的域名后缀

     /// </summary>

     /// <param name="strurl"></param>

     /// <returns></returns>

/// <summary>
/// 获取网址的域名后缀
/// Returns the domain suffix of a URL: "com", "net", "cn", "org" or "gov"
/// (matched case-insensitively as ".xxx/"), or "other" when none matches.
/// </summary>
/// <param name="strurl">URL to inspect.</param>
/// <returns>The bare suffix without dot or slash, or "other".</returns>
private static string getdomain(string strurl)
{
    // Match a recognized suffix followed by '/', e.g. ".com/".
    Match suffixMatch = Regex.Match(
        strurl,
        @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)",
        RegexOptions.IgnoreCase);

    // Strip the leading dot and trailing slash from the match (if any).
    string retval = Regex.Replace(suffixMatch.ToString(), @"\.|/$", "");

    return retval == "" ? "other" : retval;
}

/// <summary>

     /// 获取标题

     /// </summary>

     /// <param name="html"></param>

     /// <returns></returns>

/// <summary>
/// 获取标题
/// Extracts the page title from HTML. Prefers the first &lt;h1&gt; text when
/// the &lt;title&gt; text starts with it, since article titles in h1 are
/// usually cleaner than the site-suffixed &lt;title&gt;.
/// </summary>
/// <param name="html">Raw HTML source.</param>
/// <returns>The extracted title, tags stripped; "" if no title found.</returns>
private static string gettitle(string html)
{
    // [\s\S] restores the original intent — the scraped source's [\s\s]
    // matched only whitespace, so <title> contents were never captured.
    string titlefilter = @"<title>[\s\S]*?</title>";
    string h1filter = @"<h1.*?>.*?</h1>";
    string clearfilter = @"<.*?>"; // strips any remaining tags

    string title = "";
    Match match = Regex.Match(html, titlefilter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        title = Regex.Replace(match.Groups[0].Value, clearfilter, "");
    }

    // 正文的标题一般在h1中,比title中的标题更干净
    match = Regex.Match(html, h1filter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        string h1 = Regex.Replace(match.Groups[0].Value, clearfilter, "");
        if (!string.IsNullOrEmpty(h1) && title.StartsWith(h1))
        {
            title = h1;
        }
    }

    return title;
}

这就是所用的全部方法,还是有很多需要改进之处!大家如果有发现不足之处还请指出,谢谢!

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

dy("nrwz");

查看更多关于C#网络爬虫代码分享 C#简单的爬取工具的详细内容...

  阅读:39次