抓取网页数据的思路 - youngperson/study-100 GitHub Wiki
携带登录的cookie信息
先在浏览器中登录需要抓取的目标网站,然后打开谷歌浏览器 -> 检查 -> Resources -> Cookies,把其中的 cookie 信息复制过来。下面以抓取知乎网站为例。
config.php
<?php
/**
 * Build the Cookie header value that is sent with every crawler request.
 *
 * The name/value pairs are copied from a logged-in zhihu.com browser session
 * (Chrome -> Inspect -> Resources -> Cookies). They are session credentials:
 * they expire, and they should not be committed to a public repository.
 *
 * @return string Cookies formatted as "name=value" and joined with ";",
 *                without a trailing separator.
 */
function genCookie() {
    $cookie_arr = array(
        '__utma' => '51854390.97457187.1471250812.1471250812.1471250812.1',
        '__utmb' => '51854390.2.10.1471250812',
        '__utmc' => '51854390',
        '__utmv' => '51854390.100-1|2=registration_date=20160701=1^3=entry_date=20160701=1',
        '__utmz' => '51854390.1471250812.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
        '_xsrf' => '6e332049911c5e4e18cee124535d5dd4',
        '_za' => 'b9599061-3f7a-4cd7-bae3-f4bda4a37d7d',
        '_zap' => 'e13f2900-92ff-433d-914c-bf9a5e3cbc9e',
        'a_t' => '"2.0AGBA80MEKQoXAAAAew7ZVwBgQPNDBCkKACAAtAJsYwoXAAAAYQJVTXYO2VcAGnNe404B5n79tW9-uKTyt61zirXjvtmuRVOqTl0KdFPVoMga9hvhuQ=="',
        'cap_id' => '"MDE5Yjc3N2Y3YWRkNDMwNTk0MWI0NzM4NzFmN2JkZjU=|1471250782|c1a8e6ce27415e006db82c8cac6cfd103429f11d"',
        'd_c0' => '"ACAAtAJsYwqPTl31eWdBbQYlHU8L9T-RWtA=|1471250783"',
        'l_cap_id' => '"NzdhOTk5MTM2ZTg4NDVjMTg1N2U5OTc1MjMzMzUwZjA=|1471250782|efad76e634c868df98fa6a1a36fa82344b0d1005"',
        'l_n_c' => '1',
        'login' => '"YWIwZDFlMjQwODY4NGE1YWI2Yzk4ODZhNDZmOGQwZjk=|1471250806|bb77ea9a2cb71eb4519509f29fd949ea5c2a0908"',
        'q_c1' => 'c58f5d70b8994f08aef0fe1fa55e355a|1471250782000|1471250782000',
        'z_c0' => 'Mi4wQUdCQTgwTUVLUW9BSUFDMEFteGpDaGNBQUFCaEFsVk5kZzdaVndBYWMxN2pUZ0htZnYyMWIzNjRwUEszclhPS3RR|1471250811|d33765814a75b0db93e7ed5b76a1a5742319e686',
    );

    $pairs = array();
    foreach ($cookie_arr as $name => $value) {
        $pairs[] = $name . '=' . $value;
    }

    // implode() places the separator only between pairs, so the result never
    // ends with ";" regardless of which cookie happens to be listed last.
    return implode(';', $pairs);
}
抓取网页
/**
 * Fetch a URL with cURL, sending the cookie string built by genCookie().
 *
 * Redirects are followed and the whole transfer is limited to 10 seconds.
 *
 * @param string $method HTTP method. "POST" (in any letter case) sends
 *                       $fields as the request body; any other value
 *                       leaves cURL on its default GET request.
 * @param string $url    Address to fetch.
 * @param array  $fields POST fields; ignored unless $method is POST.
 *
 * @return string|bool The response body, or false when the handle could not
 *                     be created or the transfer failed.
 */
function request($method, $url, $fields = array())
{
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_COOKIE, genCookie());
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);

    // Compare case-insensitively so that 'post' is not silently downgraded
    // to a GET request with its fields dropped.
    if (strtoupper((string) $method) === 'POST') {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $fields);
    }

    $result = curl_exec($ch);

    // Release the handle explicitly on PHP < 8; from PHP 8.0 on the handle is
    // an object that is freed automatically and curl_close() does nothing.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    return $result;
}
分析网页中需要的数据
# 使用正则进行匹配
<a class="name" href="/people/phper-28">phper</a> #用户id、用户名
#<a class="name" href="/people\/(.*)">(.*)</a>#U
<span class="location item" title="北京"><a href="/topic/19550828" title="北京"> #地点
#<span class="location item" title=["|\'](.*?)["|\']>#
借助队列
借助 redis 维护两个结构:待抓取的用户队列(list,键名 request_queue)和已经抓取过的用户集合(有序集合 zset,键名 already_get_queue)。
// Crawl driver: pops user ids from a Redis list, stores each user's profile,
// and pushes the ids of the people they follow / are followed by back onto
// the list. A Redis sorted set records which ids were already processed.
$redis = new Redis();
$redis->connect('127.0.0.1', 6379);

// Seed the pending queue with one user id when it is empty.
$init_user_id = 'gao-tai-ye';
if ($redis->llen('request_queue') == 0) {
    $redis->lpush('request_queue', $init_user_id);
}

$pdo = new PDO('mysql:host=localhost;dbname=test', 'root', '111111');

// Stop once this many users have been recorded as crawled.
$set_total = 10000;

// Profile count field => path segment of the matching relation page.
$relation_pages = array(
    'followees_count' => 'followees', // people the user follows
    'followers_count' => 'followers', // people following the user
);

// Consume the queue until it is empty or the target count is reached.
while (1) {
    echo "--------begin get user info--------\n";

    // Number of ids still waiting in the queue.
    $total = $redis->llen('request_queue');
    // Number of ids already crawled.
    $get_total = $redis->zcard('already_get_queue');
    if ($get_total >= $set_total || $total == 0) {
        echo "--------done--------\n";
        break;
    }

    // microtime(true) yields a float, so elapsed time is a plain subtraction.
    $startTime = microtime(true);

    // Take one user id from the queue.
    $tmp_u_id = $redis->lpop('request_queue');

    // Skip ids that are already in the crawled set (their score is non-empty).
    $tmp_size = $redis->zscore('already_get_queue', $tmp_u_id);
    if (!empty($tmp_size)) {
        echo "--------user $tmp_u_id info and followee and follower already get--------\n";
        continue;
    }

    $current_user = saveUserInfo($tmp_u_id);
    if (empty($current_user['u_id'])) {
        echo "--------uid为空,账号可能被封了--------\n";
        break;
    }

    // NOTE(review): "xxxx" is a placeholder for the real statement. When it is
    // filled in with scraped values, use $pdo->prepare() with bound
    // parameters rather than building the SQL string by hand.
    $sql = "xxxx";
    $pdo->exec($sql);

    // Queue every related user id found on the followees / followers pages.
    foreach ($relation_pages as $count_key => $page) {
        if (empty($current_user[$count_key])) {
            continue;
        }
        $page_html = request('GET', 'https://www.zhihu.com/people/' . $tmp_u_id . '/' . $page);
        $related_ids = getFollowUserId($page_html);
        // Guard against a non-array result (e.g. the page failed to load).
        if (!is_array($related_ids)) {
            continue;
        }
        foreach ($related_ids as $related_id) {
            $redis->lpush('request_queue', $related_id);
        }
    }

    // Mark this user as crawled.
    $redis->zadd('already_get_queue', 1, $tmp_u_id);

    // Measure after the work is done; the end timestamp used to be taken
    // before any request was made, so the reported cost was always ~0.
    $total_time = microtime(true) - $startTime;
    $timecost = sprintf("%.2f", $total_time);
    echo "--------const " . $timecost . " second on $tmp_u_id--------\n";
}
优化改进
使用 curl_multi_* 系列函数在单个进程内并发发起多个请求(属于并发 I/O,并非真正的多线程),或者借助 pcntl 扩展实现多进程抓取。