The code for this page can be downloaded here.
First we need to get the cookies, because the number of result pages you can see differs depending on whether or not you are logged in.
Grab the request headers from the browser's developer tools.
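As a small convenience (this helper is not part of the project code), the raw header block copied from the Network panel can be converted into the dict format used further down; a quick sketch, with placeholder cookie values:

raw_headers = """\
Host: weixin.sogou.com
Cookie: SUID=xxx; SNUID=xxx
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"""

# Each line is "Name: value"; split at the first ': ' to build the dict
headers = dict(line.split(': ', 1) for line in raw_headers.splitlines())
print(headers['Host'])  # -> weixin.sogou.com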

This time I did not use the proxy pool from the previous section: I found free proxies to be very inefficient, so I switched to a paid proxy. It is not expensive, and the provider offers a free 3-hour trial.
We first build a request queue. After the article links are scraped from an index page, new requests are constructed for them and added to the queue; the crawling part simply pops requests from the queue and executes them.
Page handling is then split into index parsing and article parsing, one for each kind of page.
For the request part, instead of a proxy pool, the crawler calls the provider's proxy API directly to obtain a proxy and make the request.
Free proxies also work, but extremely poorly: access is frequently refused, or some error raises an exception and kills the program. The free-proxy variant is in SoGouSpiderFree on GitHub.
(That link really exists.)
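The RedisQueue imported in the code below is not shown in this excerpt. Here is a minimal sketch of what SoGouSpider/db.py might look like, assuming requests are pickled into a Redis list (the host, port, and key name are placeholders):

import pickle
from redis import StrictRedis

class RedisQueue():
    def __init__(self):
        self.db = StrictRedis(host='localhost', port=6379)

    def add(self, request):
        # serialize the whole WeixinRequest (callback included) onto the list tail
        return self.db.rpush('weixin_request', pickle.dumps(request))

    def pop(self):
        # FIFO: take the oldest request from the head of the list
        if self.db.llen('weixin_request'):
            return pickle.loads(self.db.lpop('weixin_request'))
        return False

    def empty(self):
        return self.db.llen('weixin_request') == 0

Note that pickle also serializes the request's callback, which is why the queue can hand back a ready-to-run WeixinRequest.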

Partial code (see the link above for the full code):
# -*- coding:utf-8 -*-
import requests
from requests import Session
from requests import ReadTimeout, ConnectionError
from SoGouSpider.db import RedisQueue
from SoGouSpider.config import *
from SoGouSpider.request import WeixinRequest
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from MongoDB import Mongo


class Spider():
    base_url = 'http://weixin.sogou.com/weixin'
    key_word = 'RNG'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        # the standard header name is 'Cookie', not 'cookies'
        'Cookie': 'CXID=0DDCDF3AB524336D02F12EE46CD334E4; SUID=D4C85D7C3965860A5AA39D940000145A; '
                  'IPLOC=CN2102; SUV=1531959792494460; '
                  'sct=5; SNUID=8E67BCEBD1D5A2CE38998221D186284B; '
                  'ld=wyllllllll2bFdo5lllllVH5TZGlllllnLLdflllllwlllll9Zlll5@@@@@@@@@@; '
                  'LSTMV=347%2C155; LCLKINT=4472; ABTEST=0|1531961096|v1; weixinIndexVisited=1; '
                  'JSESSIONID=aaa4Ta7_4rS8e9Jqz3Hsw; '
                  'ppinf=5|1532852075|1534061675|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTclOUYlQjMlRTUlQ'
                  'TQlQjR8Y3J0OjEwOjE1MzI4NTIwNzV8cmVmbmljazoxODolRTclOUYlQjMlRTUlQTQlQjR8dXNlcmlkOjQ0Om85dDJsdUI5VE8'
                  '5TE1CemdVRDd5dC10RjI4MzBAd2VpeGluLnNvaHUuY29tfA; '
                  'pprdig=Lk7HiV8rT2LS8uZh0riBcnZ8cokN-aN-Yv5OjbnX3qmZS4SYIg7PnnZqXWsxfPwNF1M-YxeT9PZQxGVw7qc6d15IjwIg'
                  '_2E9537JOqzdHQL34_9ntlXJ_gYE7RCQ-Nt_piMGk9cvi5Ll9oRWWsdK2dUqWTbDnESGbkA07hWhO9E; '
                  'sgid=06-34211517-AVtdd2sgghdVYlHDIz6Ug1U; ppmdig=153286085500000088c26503f3219f8b3403a4a6915fc676',
        'Host': 'weixin.sogou.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
    }
    session = Session()
    queue = RedisQueue()
    mongo = Mongo()

    def get_proxy(self):
        """
        Get a proxy from the proxy pool
        :return: proxy string, or None
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """
        Initialization
        """
        # Update the session headers globally
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.key_word, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # Schedule the first request
        print('start:' + str(weixin_request.need_proxy))
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """
        Parse an index page
        :param response: response
        :return: new requests
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # 'next_page' instead of 'next', which would shadow the builtin
        next_page = doc('#sogou_next').attr('href')
        if next_page:
            url = self.base_url + str(next_page)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """
        Parse a detail page
        :param response: response
        :return: WeChat official account article
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """
        Execute a request
        :param weixin_request: request
        :return: response, or False on failure
        """
        try:
            print(weixin_request.need_proxy)
            if weixin_request.need_proxy:
                # Proxy server
                proxyHost = "http-dyn.abuyun.com"
                proxyPort = "9020"
                # Proxy tunnel credentials
                proxyUser = "H44849JU5O0CUXJD"
                proxyPass = "9D6530023F397B62"
                proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
                    "host": proxyHost,
                    "port": proxyPort,
                    "user": proxyUser,
                    "pass": proxyPass,
                }
                print('proxy:' + str(proxy))
                if proxy:
                    proxies = {
                        'http': proxy,
                        'https': proxy
                    }
                    # Send the logged-in headers along, otherwise Sogou treats
                    # us as logged out; keep redirects off so anti-spider
                    # redirects show up as non-200 statuses
                    return requests.get(weixin_request.url, headers=self.headers,
                                        proxies=proxies, timeout=weixin_request.timeout,
                                        allow_redirects=False)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout, allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Error handling
        :param weixin_request: failed request
        :return:
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Schedule requests
        :return:
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            # weixin_request.need_proxy = True
            print('schedule:' + str(weixin_request.need_proxy))
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            # Only touch status_code after making sure we actually got a
            # response; request() returns False on failure
            if response and response.status_code in VALID_STATUSES:
                print('Status Code ' + str(response.status_code))
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mongo.insert(result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """
        Entry point
        :return:
        """
        self.start()
        self.schedule()


if __name__ == '__main__':
    spider = Spider()
    spider.run()
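The remaining imports (SoGouSpider.config, SoGouSpider.request, and MongoDB) are also not part of this excerpt. A minimal sketch of what they plausibly contain, assuming WeixinRequest extends requests.Request with the scheduling fields used above and Mongo is a thin pymongo wrapper; all concrete values (pool URL, limits, database and collection names) are placeholders:

from pymongo import MongoClient
from requests import Request

# SoGouSpider/config.py -- constants used by the spider (placeholder values)
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'  # proxy-pool API, only used by get_proxy()
MAX_FAILED_TIME = 20    # give up on a request after this many failures
VALID_STATUSES = [200]  # any other status is treated as a block and retried

# SoGouSpider/request.py -- a Request that carries its own scheduling metadata
class WeixinRequest(Request):
    def __init__(self, url, callback, method='GET', headers=None,
                 need_proxy=False, fail_time=0, timeout=10):
        Request.__init__(self, method, url, headers=headers)
        self.callback = callback      # parser that schedule() calls on the response
        self.need_proxy = need_proxy  # whether to go through the paid proxy
        self.fail_time = fail_time    # failure counter read by error()
        self.timeout = timeout

# MongoDB.py -- a thin wrapper around pymongo
class Mongo():
    def __init__(self):
        self.db = MongoClient('localhost', 27017)['weixin']

    def insert(self, data):
        # upsert keyed on the title so re-crawled articles are not duplicated
        self.db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True)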
The crawl succeeds.
[Screenshots omitted: the console output, the records in the database viewer, an article's details, and a side-by-side comparison with the original article.]
