最近做了一个从网络上 抓取数据 的一个小程序。主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中。
也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。
首先需要一个jsoup的jar包,我用的是1.6.0。下载地址为:http://pan.baidu测试数据/s/1mgqouha
1,获取网页内容(核心代码,技术有限没封装)。
2,登录之后抓取网页数据(如何在请求中携带cookie)。
3,获取网站的ajax请求方法(返回json)。
以上这三点我就用一个类全部包含(比较糙望见谅,直接copy代码过去,应该就可以用)
一,这个类分别包含上面的1,2,3三种方法,直接运行main方法即可进行测试
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
package com.minxinloan.black.web.utils;
import java.io.bufferedreader; import java.io.bytearrayoutputstream; import java.io.datainputstream; import java.io.dataoutputstream; import java.io.file; import java.io.fileoutputstream; import java.io.filewriter; import java.io.ioexception; import java.io.inputstream; import java.io.inputstreamreader; import java.io.outputstream; import java.io.printwriter; import java.net.httpurlconnection; import java.net.url; import java.net.urlconnection; import java.net.urlencoder; import java.nio.charset.charset; import java.util.arraylist; import java.util.hashmap; import java.util.iterator; import java.util.list; import java.util.map; import java.util.map.entry; import java.util.stringtokenizer;
import net.sf.json.jsonarray; import net.sf.json.jsonobject;
import org.jsoup.connection; import org.jsoup.connection.method; import org.jsoup.jsoup; import org.jsoup.nodes.document; import org.jsoup.nodes.element; import org.jsoup.select.elements;
public class cookieutil {
public final static string content_type = "content-type" ;
public static void main(string[] args) {
//string loginurl = "http://HdhCmsTestp2peye测试数据/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=lsc66&username=puqiuxiaomao&password=a1234567"; string listurl = "http://HdhCmsTestp2peye测试数据/blacklist.php?p=2" ; string logurl = "http://HdhCmsTestp2peye测试数据/member.php" ;
//********************************需要登录的************************************************* try { connection.response res = jsoup.connect(logurl) .data( "mod" , "logging" , "action" , "login" , "loginsubmit" , "yes" , "loginhash" , "lsc66" , "username" , "puqiuxiaomao" , "password" , "a1234567" ) .method(method.post) .execute();
//这儿的sessionid需要根据要登录的目标网站设置的session cookie名字而定 connection con=jsoup.connect(listurl); //设置访问形式(电脑访问,手机访问):直接百度都参数设置 con.header( "user-agent" , "mozilla/4.0 (compatible; msie 7.0; windows nt 5.1)" ); //把登录信息的cookies保存如map对象里面 map <string,string> map=res.cookies(); iterator<entry<string,string>> it =map.entryset().iterator(); while (it.hasnext()){ entry<string,string> en= it.next(); //把登录的信息放入请求里面 con =con.cookie(en.getkey(), en.getvalue());
} //再次获取document对象。 document objectdoc = con.get();
elements elements = objectdoc.getallelements(); //获取这个连接返回页面的源码内容(不是源码跟源码差不多) for (element element : elements) { //element是迭代出来的标签:如:<div><span></span></div> elements elements2= element.getallelements(); // for (element element2 : elements2) { element2.text(); element2.attr( "href" ); //获取标签属性。element2代表a标签:href代表属性 element2.text(); //获取标签文本 } }
//********************************不需要登录的*************************************************
string url = "http://HdhCmsTestp2peye测试数据/blacklist.php?p=2" ; document contemp = jsoup.connect(url).get(); elements elementstemps = contemp.getallelements(); for (element elementstemp : elementstemps) { elementstemp.text(); elementstemp.attr( "href" ); //获取标签属性。element2代表a标签:href代表属性 elementstemp.text(); //获取标签文本 }
//********************************ajax方法获取内容。。。*************************************************。 httpurlconnection connection = null ; bufferedreader reader = null ; try { stringbuffer sb = new stringbuffer(); url geturl = new url(url); connection = (httpurlconnection)geturl.openconnection(); reader = new bufferedreader( new inputstreamreader( connection.getinputstream(), "utf-8" )); string lines; while ((lines = reader.readline()) != null ) { sb.append(lines); }; list<map<string, object>> list = parsejson2list(sb.tostring()); //json转换成list } catch (exception e) {
} finally { if (reader!= null ) try { reader.close(); } catch (ioexception e) { } // 断开连接 connection.disconnect(); }
} catch (ioexception e) { // todo auto-generated catch block e.printstacktrace(); }
}
public static map<string, object> parsejson2map(string jsonstr){ map<string, object> map = new hashmap<string, object>(); //最外层解析 jsonobject json = jsonobject.fromobject(jsonstr); for (object k : json.keyset()){ object v = json.get(k); //如果内层还是数组的话,继续解析 if (v instanceof jsonarray){ list<map<string, object>> list = new arraylist<map<string,object>>(); iterator<jsonobject> it = ((jsonarray)v).iterator(); while (it.hasnext()){ jsonobject json2 = it.next(); list.add(parsejson2map(json2.tostring())); } map.put(k.tostring(), list); } else { map.put(k.tostring(), v); } } return map; }
public static list<map<string, object>> parsejson2list(string jsonstr){ jsonarray jsonarr = jsonarray.fromobject(jsonstr); list<map<string, object>> list = new arraylist<map<string,object>>(); iterator<jsonobject> it = jsonarr.iterator(); while (it.hasnext()){ jsonobject json2 = it.next(); list.add(parsejson2map(json2.tostring())); } return list; }
} |
二,这个是获取验证码的类,可以研究下。(但你要要分析出网站的验证码的请求地址)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
package com.minxinloan.black.web.utils;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Locale;

/**
 * HTTP helpers used by the scraper: downloading a captcha image, parsing the
 * charset out of a Content-Type header, writing text to a file, and reading a
 * page body line by line.
 */
public class Utils {

    /**
     * Fetches the captcha image behind {@code sUrl}, saves it to
     * {@code path/code.bmp}, and returns the response wrapped in a
     * {@link Content}.
     *
     * @param method             "GET" or "POST" (case-insensitive); anything
     *                           else falls back to POST
     * @param sUrl               URL of the captcha request
     * @param paramMap           user-name/password parameters
     *                           (currently unused — kept for API compatibility)
     * @param requestHeaderMap   cookies/headers to send
     *                           (currently unused — kept for API compatibility)
     * @param isOnlyReturnHeader when true, only headers are fetched; the body
     *                           is left empty
     * @param path               directory in which code.bmp is written
     * @return the response content, or {@code null} on any failure
     */
    public static Content getRandom(String method, String sUrl,
            Map<String, String> paramMap,
            Map<String, String> requestHeaderMap,
            boolean isOnlyReturnHeader, String path) {
        Content content = null;
        HttpURLConnection httpUrlConnection = null;
        try {
            URL url = new URL(sUrl);
            if (method == null
                    || (!"get".equalsIgnoreCase(method)
                        && !"post".equalsIgnoreCase(method))) {
                method = "POST";
            }
            URLConnection urlConnection = url.openConnection();
            httpUrlConnection = (HttpURLConnection) urlConnection;
            // setRequestMethod rejects lowercase verbs ("post" -> ProtocolException),
            // so normalize to uppercase.
            httpUrlConnection.setRequestMethod(method.toUpperCase(Locale.ROOT));
            httpUrlConnection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
            // Do not follow redirects; the caller handles them.
            httpUrlConnection.setInstanceFollowRedirects(false);
            httpUrlConnection.setDoOutput(true);
            httpUrlConnection.setDoInput(true);
            httpUrlConnection.setConnectTimeout(5000);
            httpUrlConnection.setReadTimeout(5000);
            httpUrlConnection.setUseCaches(false);
            httpUrlConnection.setDefaultUseCaches(false);
            httpUrlConnection.connect();

            int responseCode = httpUrlConnection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK
                    || responseCode == HttpURLConnection.HTTP_CREATED) {
                byte[] bytes = new byte[0];
                if (!isOnlyReturnHeader) {
                    DataInputStream ins = new DataInputStream(
                            httpUrlConnection.getInputStream());
                    // Captcha image is written here for later OCR/inspection.
                    DataOutputStream out = new DataOutputStream(
                            new FileOutputStream(path + "/code.bmp"));
                    // Keep an in-memory copy too: the original discarded the
                    // body and always built Content from an empty array.
                    ByteArrayOutputStream copy = new ByteArrayOutputStream();
                    byte[] buffer = new byte[4096];
                    int count;
                    while ((count = ins.read(buffer)) > 0) {
                        out.write(buffer, 0, count);
                        copy.write(buffer, 0, count);
                    }
                    out.close();
                    ins.close();
                    bytes = copy.toByteArray();
                }
                // Was getHeaderField("") — always null, NPE'd in new String(...).
                String encoding = getEncodingFromContentType(
                        httpUrlConnection.getHeaderField("Content-Type"));
                content = new Content(sUrl, new String(bytes, encoding),
                        httpUrlConnection.getHeaderFields());
            }
        } catch (Exception e) {
            return null;
        } finally {
            if (httpUrlConnection != null) {
                httpUrlConnection.disconnect();
            }
        }
        return content;
    }

    /**
     * Extracts the charset from a Content-Type header value such as
     * {@code "text/html; charset=utf-8"}.
     *
     * @param contentType raw header value, may be {@code null}
     * @return the declared charset if present and supported, {@code "utf-8"}
     *         as the fallback, or {@code null} when contentType is null
     */
    public static String getEncodingFromContentType(String contentType) {
        String encoding = null;
        if (contentType == null) {
            return null;
        }
        StringTokenizer tok = new StringTokenizer(contentType, ";");
        if (tok.hasMoreTokens()) {
            tok.nextToken(); // skip the media type itself
            while (tok.hasMoreTokens()) {
                String assignment = tok.nextToken().trim();
                int eqIdx = assignment.indexOf('=');
                if (eqIdx != -1) {
                    String varName = assignment.substring(0, eqIdx).trim();
                    if ("charset".equalsIgnoreCase(varName)) {
                        String varValue = assignment.substring(eqIdx + 1).trim();
                        // Strip surrounding quotes: charset="utf-8"
                        if (varValue.startsWith("\"") && varValue.endsWith("\"")) {
                            varValue = varValue.substring(1, varValue.length() - 1);
                        }
                        if (Charset.isSupported(varValue)) {
                            encoding = varValue;
                        }
                    }
                }
            }
        }
        if (encoding == null) {
            return "utf-8";
        }
        return encoding;
    }

    /**
     * Writes {@code content} to the file at {@code path}, creating it if
     * necessary.
     *
     * @return true on success, false on any failure
     */
    public static boolean inFile(String content, String path) {
        PrintWriter out = null;
        File file = new File(path);
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
            out = new PrintWriter(new FileWriter(file));
            out.write(content);
            out.flush();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Guard added: out is null if the FileWriter constructor threw.
            if (out != null) {
                out.close();
            }
        }
        return false;
    }

    /**
     * Downloads {@code httpUrl} and returns the body as UTF-8 text,
     * one "\n" per line.
     *
     * @return the page text, or "" on any failure
     */
    public static String getHtmlReadLine(String httpUrl) {
        // StringBuilder instead of repeated String concatenation (O(n) vs O(n^2)).
        StringBuilder total = new StringBuilder();
        try {
            URL url = new URL(httpUrl);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            System.out.println(connection.getResponseCode());
            InputStream urlStream = connection.getInputStream();
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlStream, "utf-8"));
            String currentLine;
            while ((currentLine = reader.readLine()) != null) {
                total.append(currentLine).append("\n");
            }
            reader.close(); // was never closed
        } catch (Exception e) {
            e.printStackTrace(); // was silently swallowed
        }
        return total.toString();
    }
}

/**
 * Simple value holder for a fetched HTTP response: the request URL, the
 * decoded body, and all response headers.
 */
class Content {
    private String url;
    private String body;
    private Map<String, List<String>> headers =
            new HashMap<String, List<String>>();

    public Content(String url, String body, Map<String, List<String>> headers) {
        this.url = url;
        this.body = body;
        this.headers = headers;
    }

    public String getUrl() {
        return url;
    }

    public String getBody() {
        return body;
    }

    public Map<String, List<String>> getHeaders() {
        return headers;
    }
}
原文链接:https://blog.csdn.net/HUXU981598436/article/details/79134920