好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

PHP采集系统源代码

今天公司PHP牛人教了PHP采集系统的原理^_^,太牛了!

代码如下 <?php
/**
 * Split a URL into the pieces needed to talk HTTP over a raw socket.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array|false Array with keys 'host', 'port', 'path' (including the
 *                     query string), 'host_header' (host plus port when the
 *                     port is not the scheme default) and 'target' (the
 *                     fsockopen() hostname, prefixed with ssl:// for https);
 *                     false when the URL has no usable host.
 */
function collector_parse_url($url)
{
    $parts = parse_url($url);
    if ($parts === false || empty($parts['host'])) {
        return false;
    }

    $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
    $defaultPort = ($scheme === 'https') ? 443 : 80;
    $port = isset($parts['port']) ? (int) $parts['port'] : $defaultPort;

    // A URL such as "http://example.com" has no path component; request "/".
    $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
    if (isset($parts['query']) && $parts['query'] !== '') {
        $path .= '?' . $parts['query'];
    }

    $host = $parts['host'];

    return array(
        'host' => $host,
        'port' => $port,
        'path' => $path,
        'host_header' => ($port === $defaultPort) ? $host : $host . ':' . $port,
        'target' => ($scheme === 'https') ? 'ssl://' . $host : $host,
    );
}

/**
 * Build the raw text of an HTTP request.
 *
 * @param string $method       HTTP method, e.g. "GET" or "HEAD".
 * @param string $version      HTTP version number, e.g. "1.0" or "1.1".
 * @param array  $parts        Result of collector_parse_url().
 * @param array  $extraHeaders Additional complete header lines (no line ending).
 * @return string Request text, terminated by the blank line that ends the headers.
 */
function collector_build_request($method, $version, $parts, $extraHeaders = array())
{
    $lines = array_merge(
        array(
            $method . ' ' . $parts['path'] . ' HTTP/' . $version,
            'Host: ' . $parts['host_header'],
            'Accept: */*',
            'Accept-Encoding: identity',
            'User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)',
        ),
        $extraHeaders
    );

    return implode("\r\n", $lines) . "\r\n\r\n";
}

/**
 * Fetch the contents of a web page with a plain HTTP/1.0 GET request.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Always contains 'state': "NOHOST" (invalid URL or connection
 *               failure), "Cannot send request", "timeout", "jump" (the
 *               response headers carry both "Location: " and
 *               "Cache-Control: private") or "ok". For "jump" and "ok" the
 *               key 'file' holds the response body.
 */
function getFileContents($url)
{
    $contents = array();

    $parts = collector_parse_url($url);
    if ($parts === false) {
        $contents['state'] = "NOHOST";
        print "Error: invalid URL";
        return $contents;
    }

    $request = collector_build_request('GET', '1.0', $parts);
    $fsocket_timeout = 60;
    $errno = 0;
    $errstr = "";

    // The warning is suppressed because the failure is reported below.
    $fp = @fsockopen($parts['target'], $parts['port'], $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        print "Error: $errstr";
        return $contents;
    }

    if (!fputs($fp, $request)) {
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }

    stream_set_timeout($fp, $fsocket_timeout);

    $data = '';
    $timedOut = false;
    while (!feof($fp)) {
        $line = fgets($fp, 8192);
        // Re-read the stream status after every read; a snapshot taken before
        // the loop can never report a timeout that happens during it.
        $meta = stream_get_meta_data($fp);
        if ($meta['timed_out']) {
            $timedOut = true;
            break;
        }
        if ($line === false) {
            break;
        }
        $data .= $line;
    }
    fclose($fp);

    if ($timedOut) {
        $contents['state'] = "timeout";
        return $contents;
    }

    // Headers and body are separated by the first empty line.
    $headerEnd = strpos($data, "\r\n\r\n");
    if ($headerEnd === false) {
        $headers = $data;
        $body = '';
    } else {
        $headers = substr($data, 0, $headerEnd);
        $body = (string) substr($data, $headerEnd + 4);
    }

    // Only the header section is inspected so that body text containing
    // "Location: " cannot be mistaken for a redirect.
    $isJump = strpos($headers, "Location: ") !== false
        && strpos($headers, "Cache-Control: private") !== false;

    $contents['state'] = $isJump ? "jump" : "ok";
    $contents['file'] = $body;

    return $contents;
}

/**
 * Check whether the file behind a URL is available and in a readable format,
 * using an HTTP HEAD request.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Contains 'state': "NOHOST", "Unreachable: http <code>",
 *               "Relocation: http <code>" (with 'path' set to the Location
 *               target; 302 responses are not reported as relocations),
 *               "Timed out (no reply from server)", "Not text or html" or
 *               "ok". On "ok", 'content' is "text", "pdf" or "doc". 'date'
 *               holds the Last-Modified value when that header was seen.
 */
function url_status($url)
{
    $status = array();

    $parts = collector_parse_url($url);
    if ($parts === false) {
        $status['state'] = "NOHOST";
        return $status;
    }

    // "Connection: close" keeps an HTTP/1.1 server from holding the socket
    // open after the headers have been sent.
    $request = collector_build_request(
        'HEAD',
        '1.1',
        $parts,
        array('Accept-Charset: iso-8859-1', 'Connection: close')
    );

    $fsocket_timeout = 60;
    $errno = 0;
    $errstr = "";
    $fp = fsockopen($parts['target'], $parts['port'], $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $status['state'] = "NOHOST";
        return $status;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    fputs($fp, $request);

    // Status line: capture the full three-digit code and its first digit.
    $answer = fgets($fp, 4096);
    $httpcode = 0;
    $full_httpcode = 0;
    $regs = array();
    if ($answer !== false && preg_match('/HTTP\/[0-9.]+ (([0-9])[0-9]{2})/', $answer, $regs)) {
        $full_httpcode = (int) $regs[1];
        $httpcode = (int) $regs[2];
        if ($httpcode != 2 && $httpcode != 3) {
            fclose($fp);
            $status['state'] = "Unreachable: http $full_httpcode";
            return $status;
        }
    }

    $content = '';
    while ($answer !== false && ($answer = fgets($fp, 4096)) !== false) {
        // An empty line marks the end of the response headers.
        if (rtrim($answer, "\r\n") === '') {
            break;
        }

        if ($httpcode == 3 && $full_httpcode != 302
            && preg_match('/Location: *([^\n\r ]+)/i', $answer, $regs)) {
            $status['path'] = $regs[1];
            $status['state'] = "Relocation: http $full_httpcode";
            fclose($fp);
            return $status;
        }

        if (preg_match('/Last-Modified: *([a-z0-9,: ]+)/i', $answer, $regs)) {
            $status['date'] = $regs[1];
        }

        if (preg_match('/Content-Type:/i', $answer)) {
            $content = $answer;
            break;
        }
    }

    $socket_status = stream_get_meta_data($fp);
    fclose($fp);

    if (preg_match('/Content-Type: *([a-z\/]*)/i', $content, $regs)) {
        $mime = strtolower($regs[1]);
        if ($mime == 'text/html' || $mime == 'text/' || $mime == 'text/plain') {
            $status['content'] = 'text';
            $status['state'] = 'ok';
        } else if ($mime == 'application/pdf') {
            $status['content'] = 'pdf';
            $status['state'] = 'ok';
        } else if ($mime == 'application/msword') {
            $status['content'] = 'doc';
            $status['state'] = 'ok';
        } else {
            $status['state'] = "Not text or html";
        }
    } else if ($socket_status['timed_out']) {
        $status['state'] = "Timed out (no reply from server)";
    } else {
        $status['state'] = "Not text or html";
    }

    return $status;
}

// NOTE(review): this host name looks garbled (possibly by the page the code
// was copied from) - confirm the real site address before running.
$host = 'http://HdhCmsTestadmin5测试数据';

// Markers used to cut the list page into items and pull out each detail link.
$list_exp = '<div class="itembox"';
$url_start = '<a href="';
$url_end = '" target=';

// Markers for the detail page; not used by the loop below.
$detail_title_start = '<h1>';
$detail_title_end = '</h1>';
$detail_summary_start = '<div id="arctext">';
// NOTE(review): identical to the start marker - presumably the closing marker
// was meant to differ; confirm against the target page markup.
$detail_summary_end = '<div id="arctext">';

$max_page = 179;

// Walk the list pages from the last one down to the first.
for ($page = $max_page; $page > 0; $page--) {
    $url = $host . "/browse/26/list_" . $page . ".shtml";

    $status = url_status($url);
    $readable = isset($status['content'], $status['state'])
        && $status['content'] == 'text'
        && $status['state'] == 'ok';
    if (!$readable) {
        continue;
    }

    $files = getFileContents($url);
    if (!isset($files['file'])) {
        continue;
    }

    // Element 0 is the markup before the first item, so start at 1.
    $arr = explode($list_exp, $files['file']);
    for ($i = 1; $i < count($arr); $i++) {
        $detail_url = strstr($arr[$i], $url_start);
        if ($detail_url === false) {
            continue;
        }
        $detail_url = substr($detail_url, strlen($url_start));

        $pos = strpos($detail_url, $url_end);
        if ($pos === false) {
            continue;
        }
        $detail_url = $host . substr($detail_url, 0, $pos);

        $summary = getFileContents($detail_url);
        print_r($summary);
        // The script stops after dumping the first detail page it fetches.
        exit;
    }
}
?>

查看更多关于PHP采集系统源代码的详细内容...

  阅读:50次