抓取网页数据的思路 - youngperson/study-100 GitHub Wiki

携带登录的cookie信息

先在浏览器中登录需要抓取的目标网站,然后在谷歌浏览器中依次打开 检查 -> Resources(新版 Chrome 中为 Application)-> Cookies,把其中的 cookie 信息复制过来。下面以抓取知乎网站为例。

config.php

<?php
/**
 * Build the value of the Cookie request header sent with every crawl request.
 *
 * The name/value pairs below were copied from a logged-in zhihu.com browser
 * session (Chrome -> Inspect -> Resources -> Cookies).
 *
 * @return string All cookies as "name=value" pairs joined by ";", with no
 *                trailing separator.
 */
function genCookie() {
	$cookie_arr = array(
		'__utma' => '51854390.97457187.1471250812.1471250812.1471250812.1',
		'__utmb' => '51854390.2.10.1471250812',
		'__utmc' => '51854390',
		'__utmv' => '51854390.100-1|2=registration_date=20160701=1^3=entry_date=20160701=1',
		'__utmz' => '51854390.1471250812.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
		'_xsrf' => '6e332049911c5e4e18cee124535d5dd4',
		'_za' => 'b9599061-3f7a-4cd7-bae3-f4bda4a37d7d',
		'_zap' => 'e13f2900-92ff-433d-914c-bf9a5e3cbc9e',
		'a_t' => '"2.0AGBA80MEKQoXAAAAew7ZVwBgQPNDBCkKACAAtAJsYwoXAAAAYQJVTXYO2VcAGnNe404B5n79tW9-uKTyt61zirXjvtmuRVOqTl0KdFPVoMga9hvhuQ=="',
		'cap_id' => '"MDE5Yjc3N2Y3YWRkNDMwNTk0MWI0NzM4NzFmN2JkZjU=|1471250782|c1a8e6ce27415e006db82c8cac6cfd103429f11d"',
		'd_c0' => '"ACAAtAJsYwqPTl31eWdBbQYlHU8L9T-RWtA=|1471250783"',
		'l_cap_id' => '"NzdhOTk5MTM2ZTg4NDVjMTg1N2U5OTc1MjMzMzUwZjA=|1471250782|efad76e634c868df98fa6a1a36fa82344b0d1005"',
		'l_n_c' => '1',
		'login' => '"YWIwZDFlMjQwODY4NGE1YWI2Yzk4ODZhNDZmOGQwZjk=|1471250806|bb77ea9a2cb71eb4519509f29fd949ea5c2a0908"',
		'q_c1' => 'c58f5d70b8994f08aef0fe1fa55e355a|1471250782000|1471250782000',
		'z_c0' => 'Mi4wQUdCQTgwTUVLUW9BSUFDMEFteGpDaGNBQUFCaEFsVk5kZzdaVndBYWMxN2pUZ0htZnYyMWIzNjRwUEszclhPS3RR|1471250811|d33765814a75b0db93e7ed5b76a1a5742319e686',
	);

	$pairs = array();
	foreach ($cookie_arr as $key => $value) {
		$pairs[] = $key . '=' . $value;
	}

	// implode() only places the separator between pairs, so the result never
	// ends with ";" regardless of which cookie happens to be last in the array.
	return implode(';', $pairs);
}

抓取网页

/**
 * Perform an HTTP request with cURL, sending the cookie string from
 * genCookie() and a desktop Chrome User-Agent.
 *
 * Redirects are followed and the transfer is limited to 10 seconds.
 *
 * @param string $method Only the exact string 'POST' enables a POST request;
 *                       any other value leaves cURL's default method in place.
 * @param string $url    Address to fetch.
 * @param array  $fields POST fields; used only when $method is 'POST'.
 *
 * @return mixed The value returned by curl_exec(): with CURLOPT_RETURNTRANSFER
 *               set, the response body on success or false on failure.
 */
function request($method, $url, $fields = array())
{
	$handle = curl_init($url);

	// Options are listed in the order they should be applied.
	$options = array(
		CURLOPT_HEADER         => 0,
		CURLOPT_COOKIE         => genCookie(),
		CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36',
		CURLOPT_RETURNTRANSFER => 1,
		CURLOPT_FOLLOWLOCATION => 1,
		CURLOPT_TIMEOUT        => 10,
	);

	if ('POST' === $method) {
		$options[CURLOPT_POST] = true;
		$options[CURLOPT_POSTFIELDS] = $fields;
	}

	curl_setopt_array($handle, $options);

	return curl_exec($handle);
}

分析网页中需要的数据

# 使用正则进行匹配
<a class="name" href="/people/phper-28">phper</a>   #用户id、用户名
#<a class="name" href="/people\/(.*)">(.*)</a>#U

<span class="location item" title="北京"><a href="/topic/19550828" title="北京">  #地点
#<span class="location item" title=["|\'](.*?)["|\']>#

借助队列

借助 Redis 保存抓取状态:用列表(list)request_queue 作为待抓取的用户队列,用有序集合(sorted set)already_get_queue 记录已经抓取过的用户,避免重复抓取。

// Crawl driver: pops user ids from the Redis list 'request_queue', stores each
// user's profile, pushes the ids of that user's followees/followers back onto
// the list, and records finished ids in the sorted set 'already_get_queue'.
$redis = new Redis();
// Pass the port as an integer rather than a numeric string.
$redis->connect('127.0.0.1', 6379);

// Seed the pending list with one user id when it is empty.
$init_user_id = 'gao-tai-ye';
if ($redis->llen('request_queue') == 0)
{
	$redis->lpush('request_queue', $init_user_id);
}

$pdo = new PDO('mysql:host=localhost;dbname=test','root','111111');

// Keep consuming until the pending list is empty or $set_total users are done.
$set_total = 10000;
while (1) {
	echo "--------begin get user info--------\n";
	// Number of user ids still waiting to be crawled.
	$total = $redis->llen('request_queue');
	// Number of user ids that have already been crawled.
	$get_total = $redis->zcard('already_get_queue');
	if ($get_total>=$set_total || $total == 0) {
		echo "--------done--------\n";
		break;
	}

	// microtime(true) yields float seconds, so the elapsed time is a plain
	// subtraction taken after the work is finished.
	$startTime = microtime(true);
	// Take one user id from the pending list. Ids are added with lpush and
	// removed with lpop, so the most recently pushed id is handled first.
	$tmp_u_id = $redis->lpop('request_queue');
	// A missing score means this id is not yet in the "already crawled" set.
	$tmp_size = $redis->zscore('already_get_queue', $tmp_u_id);
	if (empty($tmp_size))
	{
		$current_user = saveUserInfo($tmp_u_id);
		// !empty() keeps the original truthiness test but does not raise a
		// notice when saveUserInfo() returned nothing usable.
		if (!empty($current_user['u_id'])) {

			$sql = "xxxx";
			// Run the SQL statement (placeholder above).
			$pdo->exec($sql);

			// 'followees' = users this user follows; 'followers' = users who
			// follow this user. Both lists are fetched and queued the same way.
			foreach (array('followees', 'followers') as $relation) {
				if (empty($current_user[$relation . '_count'])) {
					continue;
				}

				$follow_page = request('GET', 'https://www.zhihu.com/people/' . $tmp_u_id . '/' . $relation);
				// Assumes getFollowUserId() returns an array of user ids - TODO confirm.
				foreach (getFollowUserId($follow_page) as $follow_u_id) {
					$redis->lpush('request_queue', $follow_u_id);
				}
			}

			// Mark this user as crawled.
			$redis->zadd('already_get_queue', 1, $tmp_u_id);
		}else{
			echo "--------uid为空,账号可能被封了--------\n";
			break;
		}

		$timecost = sprintf("%.2f", microtime(true) - $startTime);
		echo "--------const  " . $timecost . " second on $tmp_u_id--------\n";
	}else{
		echo "--------user $tmp_u_id info and followee and follower already get--------\n";
	}

}

优化改进

可以使用 curl_multi_* 系列函数并发发起多个请求(在单个进程内同时处理多个连接,并非真正的多线程),或者借助 pcntl 扩展以多进程方式抓取。