好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

C#网络爬虫代码分享 C#简单的爬取工具

公司编辑妹子需要爬取网页内容,叫我帮忙做了一个简单的爬取工具

这是爬取网页内容,相信这对大家来说都不难,但是这里有一些小改动,代码献上,供大家参考

?

/// <summary>
/// Downloads the HTML source of a page. First tries UTF-8 decoding; if the
/// request or decoding fails, retries once with GB2312 (common for older
/// Chinese sites), accepting an HTTP error response body on that retry.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The page source, or string.Empty if both attempts fail.</returns>
private string gethttpwebrequest(string url)
{
    try
    {
        // Primary attempt: UTF-8, only successful responses.
        return FetchHtml(url, Encoding.GetEncoding("utf-8"), false);
    }
    catch
    {
        try
        {
            // Fallback: GB2312, and also read the body of error responses
            // (some anti-crawler sites return content with a non-2xx status).
            return FetchHtml(url, Encoding.GetEncoding("gb2312"), true);
        }
        catch
        {
            return string.Empty;
        }
    }
}

/// <summary>
/// Performs a single GET request and decodes the body with the given encoding.
/// All headers are set BEFORE the request is sent (the original code fired a
/// throw-away request first, leaking a connection and sending no headers).
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <param name="useErrorResponse">
/// When true, a WebException's attached response body is read instead of rethrowing.
/// </param>
private static string FetchHtml(string url, Encoding encoding, bool useErrorResponse)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // NOTE: value no longer carries the stray "user-agent:" prefix the
    // original embedded inside the header value.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    HttpWebResponse response;
    try
    {
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException ex)
    {
        // Servers that answer with 4xx/5xx still ship a body we may want.
        if (!useErrorResponse || ex.Response == null)
        {
            throw;
        }
        response = (HttpWebResponse)ex.Response;
    }

    // using-blocks guarantee the stream/response are closed even on a
    // decoding failure (the original leaked them on any exception).
    using (response)
    using (Stream receiveStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(receiveStream, encoding))
    {
        return reader.ReadToEnd();
    }
}

这是根据 URL 爬取网页源码的方法,有一些小改动:很多网页有不同的编码格式,甚至有些网站做了反爬取的防范,这个方法经过简单改动也能正常爬取

以下是爬取网页所有的网址链接

?

/// <summary>

    /// 提取html代码中的网址

    /// </summary>

    /// <param name="htmlcode"></param>

    /// <returns></returns>

/// <summary>
/// 提取 html 代码中的网址。
/// Extracts all hyperlink URLs from an HTML document. Relative hrefs
/// (starting with "/" or "./") are rewritten against the site root derived
/// from <paramref name="url"/>; hrefs with no scheme at all are resolved
/// against the URL's directory. "javascript:" pseudo-links are skipped.
/// </summary>
/// <param name="htmlcode">Raw HTML source to scan.</param>
/// <param name="url">The page's own URL, used to absolutize relative links.</param>
/// <returns>Distinct absolute URLs in document order.</returns>
private static List<string> gethyperlinks(string htmlcode, string url)
{
    List<string> foundLinks = new List<string>();

    // Site root, e.g. "http://example.com/" — used to absolutize "/path" hrefs.
    string siteRoot = Regex.Match(url, @"http(s)?://([\w-]+\.)+[\w-]+/?").Value;

    // Rewrite root-relative and "./" hrefs so the URL regex below can see them.
    string content = htmlcode
        .Replace("href=\"/", "href=\"" + siteRoot)
        .Replace("href='/", "href='" + siteRoot)
        .Replace("href=/", "href=" + siteRoot)
        .Replace("href=\"./", "href=\"" + siteRoot);

    // [aA] restores the original intent — the scraped source's [aa] matched
    // only lowercase <a> tags. Requires a space before href, as before.
    MatchCollection anchorTags =
        Regex.Matches(content, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);

    foreach (Match anchor in anchorTags)
    {
        // [a-zA-Z] restores the garbled [a-za-z] class: match any scheme.
        MatchCollection absoluteUrls =
            Regex.Matches(anchor.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);

        if (absoluteUrls.Count > 0)
        {
            CollectLinks(absoluteUrls, foundLinks);
        }
        else if (anchor.Value.IndexOf("javascript") == -1)
        {
            // Scheme-less href: resolve against the page's directory.
            string baseDirectory = url.Substring(0, url.LastIndexOf("/") + 1);
            string patched = anchor.Value
                .Replace("href=\"", "href=\"" + baseDirectory)
                .Replace("href='", "href='" + baseDirectory);
            CollectLinks(
                Regex.Matches(patched, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline),
                foundLinks);
        }
    }

    return foundLinks;
}

/// <summary>
/// Strips quote/bracket residue from each regex match and appends it to
/// <paramref name="target"/> if not already present (de-duplication).
/// </summary>
private static void CollectLinks(MatchCollection matches, List<string> target)
{
    foreach (Match m in matches)
    {
        string link = m.Value
            .Replace("\"", "")
            .Replace("'", "")
            .Replace(">", "")
            .Replace(";", "");
        if (!target.Contains(link))
        {
            target.Add(link);
        }
    }
}

这块的技术其实就是简单的使用了正则去匹配!接下来献上获取标题,以及存储到xml文件的方法

?

/// <summary>

     /// // 把网址写入xml文件

     /// </summary>

     /// <param name="strurl"></param>

     /// <param name="alhyperlinks"></param>

/// <summary>
/// // 把网址写入xml文件
/// Writes the harvested URLs to d:\hyperlinks.xml. Each URL becomes an
/// element named after its domain suffix (via getdomain) with the URL as text.
/// </summary>
/// <param name="strurl">The source page URL, recorded in an XML comment.</param>
/// <param name="alhyperlinks">The URLs to persist.</param>
private static void writetoxml(string strurl, List<string> alhyperlinks)
{
    // using ensures the writer (and underlying file handle) is closed even
    // if an exception occurs mid-write — the original leaked it on failure.
    using (XmlTextWriter writer = new XmlTextWriter(@"d:\hyperlinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strurl + "的超链接");
        writer.WriteStartElement("hyperlinks");
        writer.WriteStartElement("hyperlinks", null);
        writer.WriteAttributeString("datetime", DateTime.Now.ToString());

        foreach (string str in alhyperlinks)
        {
            // Element name is the domain suffix ("com", "net", ... or "other").
            writer.WriteElementString(getdomain(str), null, str);
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}

     /// <summary>

     /// 获取网址的域名后缀

     /// </summary>

     /// <param name="strurl"></param>

     /// <returns></returns>

/// <summary>
/// 获取网址的域名后缀
/// Returns the domain suffix of a URL: "com", "net", "cn", "org" or "gov"
/// (matched case-insensitively as ".xxx/"), or "other" when none matches.
/// </summary>
/// <param name="strurl">URL to inspect.</param>
/// <returns>The bare suffix without dot or slash, or "other".</returns>
private static string getdomain(string strurl)
{
    // Match a recognized suffix followed by '/', e.g. ".com/".
    Match suffixMatch = Regex.Match(
        strurl,
        @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)",
        RegexOptions.IgnoreCase);

    // Strip the leading dot and trailing slash from the match (if any).
    string retval = Regex.Replace(suffixMatch.ToString(), @"\.|/$", "");

    return retval == "" ? "other" : retval;
}

/// <summary>

     /// 获取标题

     /// </summary>

     /// <param name="html"></param>

     /// <returns></returns>

/// <summary>
/// 获取标题
/// Extracts the page title from HTML. Prefers the first &lt;h1&gt; text when
/// the &lt;title&gt; text starts with it, since article titles in h1 are
/// usually cleaner than the site-suffixed &lt;title&gt;.
/// </summary>
/// <param name="html">Raw HTML source.</param>
/// <returns>The extracted title, tags stripped; "" if no title found.</returns>
private static string gettitle(string html)
{
    // [\s\S] restores the original intent — the scraped source's [\s\s]
    // matched only whitespace, so <title> contents were never captured.
    string titlefilter = @"<title>[\s\S]*?</title>";
    string h1filter = @"<h1.*?>.*?</h1>";
    string clearfilter = @"<.*?>"; // strips any remaining tags

    string title = "";
    Match match = Regex.Match(html, titlefilter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        title = Regex.Replace(match.Groups[0].Value, clearfilter, "");
    }

    // 正文的标题一般在h1中,比title中的标题更干净
    match = Regex.Match(html, h1filter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        string h1 = Regex.Replace(match.Groups[0].Value, clearfilter, "");
        if (!string.IsNullOrEmpty(h1) && title.StartsWith(h1))
        {
            title = h1;
        }
    }

    return title;
}

这就是所用的全部方法,还是有很多需要改进之处!大家如果有发现不足之处还请指出,谢谢!

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

dy("nrwz");

查看更多关于C#网络爬虫代码分享 C#简单的爬取工具的详细内容...

  阅读:39次