• 0
  • 0

百度搜索接口分析

2020-05-21 22 0 admin 所属分类:PHP 记录

由于平时我们需要快速地在搜索引擎上搜索数据,可以调用诸如百度搜索引擎的搜索接口,格式如下:

https://www.baidu.com/s?wd=你的搜索内容(搜索内容中如果包含 URL 链接,需要先进行 URL 编码)

可以在网站上添加 a 标签跳转到百度,进行快速搜索。这种方式是在浏览器中打开的,请求时会自动附带 User-Agent 等请求头信息,因此访问不会被百度屏蔽或拦截。

如果希望在代码层面获取相关的搜索内容,该怎么办呢?以 PHP 代码为例,可以通过添加部分必要的请求头来绕过百度的检测。

经过分析,必要请求头如下:

User-Agent   客户端(浏览器及操作系统)信息字段

Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

Accept  浏览器可接收的返回数据格式及其优先级

text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3

Accept-Language  接收的语言类型

zh-CN,zh;q=0.9

请求不强制要求登录状态;部分需要登录的操作可以附带 cookie 信息。

PHP代码封装成爬虫类

/**
 * Minimal HTTP crawler base class built on the curl extension.
 *
 * It sends browser-like request headers, can fetch the cookies a site hands
 * out to an anonymous visitor, and caches those cookies in a local file so
 * they are not requested again on every call.
 */
class Spider {
	// Cache file for cookies returned to a request that is not logged in.
	protected $cookie_nologin_path = './cookie_nologin.txt';
	// Cache file for cookies of a logged-in session (not read or written by this class itself).
	protected $cookie_login_path = './cookie_login.txt';
	// API credentials; this class only declares them, subclasses fill and use them.
	protected $appid;
	protected $appkey;
	protected $token;
	// Lifetime of the cached cookie file, in seconds.
	protected $expired = 3600;
	// Base address of the search engine; left empty here and set by subclasses.
	protected $url = '';

	public function __construct() {
	}

	/**
	 * Returns the request headers sent with every request.
	 *
	 * @return array list of raw "Name: value" header lines, ready for CURLOPT_HTTPHEADER
	 */
	public function getHeaders() {
		return [
			'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
			'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
			'Accept-Language: zh-CN,zh;q=0.9',
		];
	}

	/**
	 * Returns the cookies the site behind $url sets for an anonymous visitor.
	 *
	 * A cached value is returned while the cache file is younger than
	 * $this->expired seconds. Otherwise the site root (scheme + host [+ port])
	 * is requested, the Set-Cookie response headers are reduced to a
	 * "name=value; name=value" string, and that string is cached and returned.
	 *
	 * @param string $url   any URL on the target site; only scheme, host and port are used
	 * @param bool   $flush when true, ignore the cache and always request fresh cookies
	 * @return string cookie string usable as a Cookie header value; '' when none could be obtained
	 */
	public function getCookie($url,$flush=false) {
		$path = $this->cookie_nologin_path;
		if (!$flush) {
			// filemtime() results are cached per request; drop the cached entry for this file
			// so a file written earlier in the same request is seen with its real mtime.
			clearstatcache(true, $path);
			// Only stat the file when it exists: filemtime() on a missing file raises a warning.
			if (is_file($path)) {
				$lasttime = filemtime($path);
				if ($lasttime > 0 && time() - $lasttime < $this->expired) {
					$cached = file_get_contents($path);
					if ($cached !== false) {
						return $cached;
					}
				}
			}
		}
		$info = parse_url($url);
		if (!is_array($info) || empty($info['scheme']) || empty($info['host'])) {
			// Without scheme and host there is no site root to request.
			return '';
		}
		$origin = $info['scheme'].'://'.$info['host'];
		if (isset($info['port'])) {
			$origin .= ':'.$info['port'];
		}
		$data = $this->curlGet($origin,['showheader'=>1,'headers'=>$this->getHeaders(),'returnheader'=>1]);
		$cookie = $this->parseSetCookie((string)$data);
		if ($cookie !== '') {
			// Cache the cookie string for later calls.
			file_put_contents($path,$cookie);
		}
		return $cookie;
	}

	/**
	 * Reduces raw response headers to a Cookie request-header value.
	 *
	 * Only the leading "name=value" pair of each Set-Cookie line is kept
	 * (attributes such as path, domain and expires are not part of a Cookie
	 * header), and the pairs are joined with "; ". A cookie name set more
	 * than once keeps its last value.
	 *
	 * @param string $rawHeaders response header block, one header per line
	 * @return string "name=value; name=value" or '' when no cookie was set
	 */
	protected function parseSetCookie($rawHeaders) {
		$pairs = [];
		foreach (preg_split("/\r\n|\n|\r/", $rawHeaders) as $line) {
			// Header names are case-insensitive; HTTP/2 responses use lowercase names.
			if (!preg_match('/^Set-Cookie:\s*([^;]+)/i', $line, $match)) {
				continue;
			}
			$pair = trim($match[1]);
			$eq = strpos($pair, '=');
			if ($eq === false || $eq === 0) {
				// No cookie name: nothing usable to send back.
				continue;
			}
			$pairs[substr($pair, 0, $eq)] = $pair;
		}
		return implode('; ', $pairs);
	}

	/**
	 * Performs an HTTP GET request with curl.
	 *
	 * Recognised $options keys (all optional, a missing key counts as off):
	 *   - showheader   truthy: include the response headers in the returned text
	 *   - returnheader truthy: return only the response headers
	 *   - cookie       string sent as the Cookie request header
	 *   - headers      list of raw "Name: value" request header lines
	 *
	 * @param string $url     address to request
	 * @param array  $options see above
	 * @return string|false response text, or false when the request failed
	 */
	public function curlGet($url,$options) {
		$options = (array)$options;
		$headerOnly = !empty($options['returnheader']);
		$ch = curl_init();
		if ($ch === false) {
			return false;
		}
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		// Headers must be part of the output when only the headers are wanted,
		// otherwise the header-size slice below would cut into the body instead.
		curl_setopt($ch, CURLOPT_HEADER, (!empty($options['showheader']) || $headerOnly) ? 1 : 0);
		// NOTE(review): certificate and host verification are switched off, which leaves
		// HTTPS requests open to interception. Kept as in the original because turning it
		// on requires a usable CA bundle on the host — confirm before changing.
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
		if (!empty($options['cookie'])) {
			curl_setopt($ch, CURLOPT_COOKIE, $options['cookie']);
		}
		if (!empty($options['headers'])) {
			curl_setopt($ch, CURLOPT_HTTPHEADER, (array)$options['headers']);
		}
		$result = curl_exec($ch);
		if ($result !== false && $headerOnly) {
			// The header block is the first CURLINFO_HEADER_SIZE bytes of the output.
			$headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
			$result = substr($result, 0, $headerSize);
		}
		curl_close($ch);
		return $result;
	}
}
/**
 * Spider specialised for Baidu: index checks through the public search page
 * and URL submission through the Baidu webmaster push API.
 */
class BaiduSpider extends Spider {
	public function __construct() {
		parent::__construct();
		$this->url = 'https://www.baidu.com';
		// Cache file for cookies returned to a request that is not logged in.
		$this->cookie_nologin_path = './baidu_cookie_nologin.txt';
		// Cache file for cookies of a logged-in session.
		$this->cookie_login_path = './baidu_cookie_login.txt';
	}

	/**
	 * Checks whether a search for the given address yields any result entry.
	 *
	 * @param string $url address to look up
	 * @return bool true when the result page contains a "result c-container" entry
	 */
	public function isCollectPage($url) {
		$searchUrl = "{$this->url}/s?wd=".urlencode($url);
		$data = $this->curlGet($searchUrl,['headers'=>$this->getHeaders(),'cookie'=>$this->getCookie($searchUrl)]);
		if (!$data) {
			return false;
		}
		// A result entry in the returned page carries this class attribute.
		return strpos($data,"result c-container") !== false;
	}

	/**
	 * Returns the number of pages the search engine reports for the URL's domain,
	 * obtained from a "site:<host>" search.
	 *
	 * @param string $url full URL, or a bare domain such as "example.com"
	 * @return string|int the count as a digit string, or 0 when it could not be determined
	 */
	public function getCollectNum($url) {
		$host = parse_url($url, PHP_URL_HOST);
		if (!$host) {
			// parse_url() reports no host for "example.com/path"; retry with a scheme in front.
			$host = parse_url('http://'.ltrim((string)$url, '/'), PHP_URL_HOST);
		}
		if (!$host) {
			return 0;
		}
		$searchUrl = "{$this->url}/s?wd=".urlencode("site:".$host);
		$data = $this->curlGet($searchUrl,['headers'=>$this->getHeaders(),'cookie'=>$this->getCookie($searchUrl)]);
		if (!$data) {
			return 0;
		}
		// The count may carry thousands separators ("1,230"), so commas are accepted and
		// stripped. "数" is made optional as a group (not with a bare "?") because the
		// pattern is matched byte-wise and the character is multi-byte.
		if (preg_match("/找到相关结果(?:数)?约([\d,]+)个/",$data,$match)) {
			return str_replace(",","",$match[1]);
		}
		if (preg_match("/该网站共有([\s\S]+?)个网页被百度收录/",$data,$match)) {
			// The number is wrapped in markup here; drop the tags and separators.
			return trim(str_replace(",","",strip_tags($match[1])));
		}
		return 0;
	}

	/**
	 * Stores API credentials on the instance.
	 *
	 * Only non-empty values are applied; keys that are absent leave the
	 * current value untouched.
	 *
	 * @param array $params may contain 'appid', 'appkey' and 'token'
	 */
	public function initApi($params) {
		foreach (['appid', 'appkey', 'token'] as $key) {
			if (!empty($params[$key])) {
				$this->$key = $params[$key];
			}
		}
	}

	/**
	 * Submits URLs to the search engine's push API.
	 *
	 * The site domain is taken from the first URL; the token is the key
	 * obtained from the Baidu webmaster platform (https://ziyuan.baidu.com/)
	 * and set through initApi().
	 *
	 * @param array|string $urls complete links including http:// or https://,
	 *                           e.g. ['https://example.com/a', 'https://example.com/b']
	 * @return string|false raw API response, or false when there is nothing valid
	 *                      to submit or the request failed
	 */
	public function submitWeb($urls=[]) {
		if (!is_array($urls)) {
			$urls = [$urls];
		}
		// Drop empty entries and reindex so that element 0 is guaranteed to exist.
		$urls = array_values(array_filter($urls));
		if (!$urls) {
			return false;
		}
		$domain = parse_url($urls[0],PHP_URL_HOST);
		if (!$domain) {
			return false;
		}
		// The query string needs "site=<domain>"; without the "=" the API gets no site parameter.
		$api = 'http://data.zz.baidu.com/urls?site='.rawurlencode($domain).'&token='.rawurlencode((string)$this->token);
		$ch = curl_init();
		if ($ch === false) {
			return false;
		}
		curl_setopt_array($ch, [
			CURLOPT_URL => $api,
			CURLOPT_POST => true,
			CURLOPT_RETURNTRANSFER => true,
			// The API expects one URL per line in a plain-text body.
			CURLOPT_POSTFIELDS => implode("\n", $urls),
			CURLOPT_HTTPHEADER => ['Content-Type: text/plain'],
		]);
		$result = curl_exec($ch);
		curl_close($ch);
		return $result;
	}
}

使用

// Page whose domain should be looked up.
$url = 'https://blog.nango.top/admin/article/index.html';
// Ask Baidu how many pages of that domain it reports as indexed.
$baidu = new BaiduSpider();
$data = $baidu->getCollectNum($url);


如果觉得麻烦,也可以使用现成的 API 接口,功能丰富:

UomgAPI


返回顶部