最近做了一个从网络上 抓取数据 的一个小程序。主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中。
也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。
首先需要一个jsoup的jar包,我用的是1.6.0。下载地址为:http://pan.baidu测试数据/s/1mgqouha
1,获取网页内容(核心代码,技术有限没封装)。
2,登录之后抓取网页数据(如何在请求中携带cookie)。
3,获取网站的ajax请求方法(返回json)。
以上这三点我就用一个类全部包含(比较糙望见谅,直接copy代码过去,应该就可以用)
一,这个类分别包含上面的1,2,3三种方法,直接运行main方法即可进行测试
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
package com.minxinloan.black.web.utils;
import java.io.bufferedreader; import java.io.bytearrayoutputstream; import java.io.datainputstream; import java.io.dataoutputstream; import java.io.file; import java.io.fileoutputstream; import java.io.filewriter; import java.io.ioexception; import java.io.inputstream; import java.io.inputstreamreader; import java.io.outputstream; import java.io.printwriter; import java.net.httpurlconnection; import java.net.url; import java.net.urlconnection; import java.net.urlencoder; import java.nio.charset.charset; import java.util.arraylist; import java.util.hashmap; import java.util.iterator; import java.util.list; import java.util.map; import java.util.map.entry; import java.util.stringtokenizer;
import net.sf.json.jsonarray; import net.sf.json.jsonobject;
import org.jsoup.connection; import org.jsoup.connection.method; import org.jsoup.jsoup; import org.jsoup.nodes.document; import org.jsoup.nodes.element; import org.jsoup.select.elements;
public class cookieutil {
public final static string content_type = "content-type" ;
public static void main(string[] args) {
//string loginurl = "http://HdhCmsTestp2peye测试数据/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=lsc66&username=puqiuxiaomao&password=a1234567"; string listurl = "http://HdhCmsTestp2peye测试数据/blacklist.php?p=2" ; string logurl = "http://HdhCmsTestp2peye测试数据/member.php" ;
//********************************需要登录的************************************************* try { connection.response res = jsoup.connect(logurl) .data( "mod" , "logging" , "action" , "login" , "loginsubmit" , "yes" , "loginhash" , "lsc66" , "username" , "puqiuxiaomao" , "password" , "a1234567" ) .method(method.post) .execute();
//这儿的sessionid需要根据要登录的目标网站设置的session cookie名字而定 connection con=jsoup.connect(listurl); //设置访问形式(电脑访问,手机访问):直接百度都参数设置 con.header( "user-agent" , "mozilla/4.0 (compatible; msie 7.0; windows nt 5.1)" ); //把登录信息的cookies保存如map对象里面 map <string,string> map=res.cookies(); iterator<entry<string,string>> it =map.entryset().iterator(); while (it.hasnext()){ entry<string,string> en= it.next(); //把登录的信息放入请求里面 con =con.cookie(en.getkey(), en.getvalue());
} //再次获取document对象。 document objectdoc = con.get();
elements elements = objectdoc.getallelements(); //获取这个连接返回页面的源码内容(不是源码跟源码差不多) for (element element : elements) { //element是迭代出来的标签:如:<div><span></span></div> elements elements2= element.getallelements(); // for (element element2 : elements2) { element2.text(); element2.attr( "href" ); //获取标签属性。element2代表a标签:href代表属性 element2.text(); //获取标签文本 } }
//********************************不需要登录的*************************************************
string url = "http://HdhCmsTestp2peye测试数据/blacklist.php?p=2" ; document contemp = jsoup.connect(url).get(); elements elementstemps = contemp.getallelements(); for (element elementstemp : elementstemps) { elementstemp.text(); elementstemp.attr( "href" ); //获取标签属性。element2代表a标签:href代表属性 elementstemp.text(); //获取标签文本 }
//********************************ajax方法获取内容。。。*************************************************。 httpurlconnection connection = null ; bufferedreader reader = null ; try { stringbuffer sb = new stringbuffer(); url geturl = new url(url); connection = (httpurlconnection)geturl.openconnection(); reader = new bufferedreader( new inputstreamreader( connection.getinputstream(), "utf-8" )); string lines; while ((lines = reader.readline()) != null ) { sb.append(lines); }; list<map<string, object>> list = parsejson2list(sb.tostring()); //json转换成list } catch (exception e) {
} finally { if (reader!= null ) try { reader.close(); } catch (ioexception e) { } // 断开连接 connection.disconnect(); }
} catch (ioexception e) { // todo auto-generated catch block e.printstacktrace(); }
}
public static map<string, object> parsejson2map(string jsonstr){ map<string, object> map = new hashmap<string, object>(); //最外层解析 jsonobject json = jsonobject.fromobject(jsonstr); for (object k : json.keyset()){ object v = json.get(k); //如果内层还是数组的话,继续解析 if (v instanceof jsonarray){ list<map<string, object>> list = new arraylist<map<string,object>>(); iterator<jsonobject> it = ((jsonarray)v).iterator(); while (it.hasnext()){ jsonobject json2 = it.next(); list.add(parsejson2map(json2.tostring())); } map.put(k.tostring(), list); } else { map.put(k.tostring(), v); } } return map; }
public static list<map<string, object>> parsejson2list(string jsonstr){ jsonarray jsonarr = jsonarray.fromobject(jsonstr); list<map<string, object>> list = new arraylist<map<string,object>>(); iterator<jsonobject> it = jsonarr.iterator(); while (it.hasnext()){ jsonobject json2 = it.next(); list.add(parsejson2map(json2.tostring())); } return list; }
} |
二,这个是获取验证码的类,可以研究下。(但你要要分析出网站的验证码的请求地址)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
package com.minxinloan.black.web.utils;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Locale;

/**
 * HTTP helpers used by the scraper: downloading a captcha image, parsing the
 * charset out of a Content-Type header, writing text to a file, and reading a
 * page body line by line.
 */
public class Utils {

    /**
     * Fetches the captcha image behind {@code sUrl}, saves it to
     * {@code path/code.bmp}, and returns the response wrapped in a
     * {@link Content}.
     *
     * @param method             "GET" or "POST" (case-insensitive); anything
     *                           else falls back to POST
     * @param sUrl               URL of the captcha request
     * @param paramMap           user-name/password parameters
     *                           (currently unused — kept for API compatibility)
     * @param requestHeaderMap   cookies/headers to send
     *                           (currently unused — kept for API compatibility)
     * @param isOnlyReturnHeader when true, only headers are fetched; the body
     *                           is left empty
     * @param path               directory in which code.bmp is written
     * @return the response content, or {@code null} on any failure
     */
    public static Content getRandom(String method, String sUrl,
            Map<String, String> paramMap,
            Map<String, String> requestHeaderMap,
            boolean isOnlyReturnHeader, String path) {
        Content content = null;
        HttpURLConnection httpUrlConnection = null;
        try {
            URL url = new URL(sUrl);
            if (method == null
                    || (!"get".equalsIgnoreCase(method)
                        && !"post".equalsIgnoreCase(method))) {
                method = "POST";
            }
            URLConnection urlConnection = url.openConnection();
            httpUrlConnection = (HttpURLConnection) urlConnection;
            // setRequestMethod rejects lowercase verbs ("post" -> ProtocolException),
            // so normalize to uppercase.
            httpUrlConnection.setRequestMethod(method.toUpperCase(Locale.ROOT));
            httpUrlConnection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
            // Do not follow redirects; the caller handles them.
            httpUrlConnection.setInstanceFollowRedirects(false);
            httpUrlConnection.setDoOutput(true);
            httpUrlConnection.setDoInput(true);
            httpUrlConnection.setConnectTimeout(5000);
            httpUrlConnection.setReadTimeout(5000);
            httpUrlConnection.setUseCaches(false);
            httpUrlConnection.setDefaultUseCaches(false);
            httpUrlConnection.connect();

            int responseCode = httpUrlConnection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK
                    || responseCode == HttpURLConnection.HTTP_CREATED) {
                byte[] bytes = new byte[0];
                if (!isOnlyReturnHeader) {
                    DataInputStream ins = new DataInputStream(
                            httpUrlConnection.getInputStream());
                    // Captcha image is written here for later OCR/inspection.
                    DataOutputStream out = new DataOutputStream(
                            new FileOutputStream(path + "/code.bmp"));
                    // Keep an in-memory copy too: the original discarded the
                    // body and always built Content from an empty array.
                    ByteArrayOutputStream copy = new ByteArrayOutputStream();
                    byte[] buffer = new byte[4096];
                    int count;
                    while ((count = ins.read(buffer)) > 0) {
                        out.write(buffer, 0, count);
                        copy.write(buffer, 0, count);
                    }
                    out.close();
                    ins.close();
                    bytes = copy.toByteArray();
                }
                // Was getHeaderField("") — always null, NPE'd in new String(...).
                String encoding = getEncodingFromContentType(
                        httpUrlConnection.getHeaderField("Content-Type"));
                content = new Content(sUrl, new String(bytes, encoding),
                        httpUrlConnection.getHeaderFields());
            }
        } catch (Exception e) {
            return null;
        } finally {
            if (httpUrlConnection != null) {
                httpUrlConnection.disconnect();
            }
        }
        return content;
    }

    /**
     * Extracts the charset from a Content-Type header value such as
     * {@code "text/html; charset=utf-8"}.
     *
     * @param contentType raw header value, may be {@code null}
     * @return the declared charset if present and supported, {@code "utf-8"}
     *         as the fallback, or {@code null} when contentType is null
     */
    public static String getEncodingFromContentType(String contentType) {
        String encoding = null;
        if (contentType == null) {
            return null;
        }
        StringTokenizer tok = new StringTokenizer(contentType, ";");
        if (tok.hasMoreTokens()) {
            tok.nextToken(); // skip the media type itself
            while (tok.hasMoreTokens()) {
                String assignment = tok.nextToken().trim();
                int eqIdx = assignment.indexOf('=');
                if (eqIdx != -1) {
                    String varName = assignment.substring(0, eqIdx).trim();
                    if ("charset".equalsIgnoreCase(varName)) {
                        String varValue = assignment.substring(eqIdx + 1).trim();
                        // Strip surrounding quotes: charset="utf-8"
                        if (varValue.startsWith("\"") && varValue.endsWith("\"")) {
                            varValue = varValue.substring(1, varValue.length() - 1);
                        }
                        if (Charset.isSupported(varValue)) {
                            encoding = varValue;
                        }
                    }
                }
            }
        }
        if (encoding == null) {
            return "utf-8";
        }
        return encoding;
    }

    /**
     * Writes {@code content} to the file at {@code path}, creating it if
     * necessary.
     *
     * @return true on success, false on any failure
     */
    public static boolean inFile(String content, String path) {
        PrintWriter out = null;
        File file = new File(path);
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
            out = new PrintWriter(new FileWriter(file));
            out.write(content);
            out.flush();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Guard added: out is null if the FileWriter constructor threw.
            if (out != null) {
                out.close();
            }
        }
        return false;
    }

    /**
     * Downloads {@code httpUrl} and returns the body as UTF-8 text,
     * one "\n" per line.
     *
     * @return the page text, or "" on any failure
     */
    public static String getHtmlReadLine(String httpUrl) {
        // StringBuilder instead of repeated String concatenation (O(n) vs O(n^2)).
        StringBuilder total = new StringBuilder();
        try {
            URL url = new URL(httpUrl);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            System.out.println(connection.getResponseCode());
            InputStream urlStream = connection.getInputStream();
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlStream, "utf-8"));
            String currentLine;
            while ((currentLine = reader.readLine()) != null) {
                total.append(currentLine).append("\n");
            }
            reader.close(); // was never closed
        } catch (Exception e) {
            e.printStackTrace(); // was silently swallowed
        }
        return total.toString();
    }
}

/**
 * Simple value holder for a fetched HTTP response: the request URL, the
 * decoded body, and all response headers.
 */
class Content {
    private String url;
    private String body;
    private Map<String, List<String>> headers =
            new HashMap<String, List<String>>();

    public Content(String url, String body, Map<String, List<String>> headers) {
        this.url = url;
        this.body = body;
        this.headers = headers;
    }

    public String getUrl() {
        return url;
    }

    public String getBody() {
        return body;
    }

    public Map<String, List<String>> getHeaders() {
        return headers;
    }
}
原文链接:https://blog.csdn.net/HUXU981598436/article/details/79134920