The code for this page can be downloaded here.
First we need to obtain cookies, because the number of result pages you can see differs between logged-in and anonymous visitors.
Grab the request headers from the browser's developer tools.
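
To confirm that the copied cookie is actually being used, you can request a result page that anonymous visitors normally cannot reach, with and without it. A minimal sketch, assuming Sogou's page query parameter and reusing the list selector from the spider below:

import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

headers = {
    # paste the Cookie value copied from the developer tools here
    'Cookie': 'CXID=...; SUID=...; SNUID=...',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
}
# deep result pages are normally only served to logged-in visitors
url = 'http://weixin.sogou.com/weixin?' + urlencode({'query': 'RNG', 'type': 2, 'page': 11})
response = requests.get(url, headers=headers)
print(response.status_code, len(pq(response.text)('.news-box .news-list li')))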

Here I did not use the proxy pool from the previous section: I found the free proxies to be far too unreliable, so I went with a paid proxy instead. It is not expensive, and it offers a free 3-hour trial.
We first build a request queue: the article links scraped from the index pages are wrapped into new requests and pushed onto the queue, and the crawling part simply pops requests from the queue and executes them.
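
The RedisQueue it uses lives in SoGouSpider/db.py and is not reproduced in the excerpt below. A minimal sketch, assuming a Redis list holding pickle-serialized requests (the key name and connection settings are placeholders):

import pickle
from redis import StrictRedis

class RedisQueue():
    def __init__(self):
        # connection settings are placeholders; point them at your own Redis instance
        self.db = StrictRedis(host='localhost', port=6379, password=None)
        self.key = 'weixin_request'  # name of the Redis list backing the queue

    def add(self, request):
        # serialize the whole request object (url, callback, need_proxy, ...) and push it
        return self.db.rpush(self.key, pickle.dumps(request))

    def pop(self):
        # pop from the opposite end so the queue is FIFO
        if self.db.llen(self.key):
            return pickle.loads(self.db.lpop(self.key))
        return False

    def empty(self):
        return self.db.llen(self.key) == 0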
Page handling is then split into index-page parsing and article-page parsing, one for each kind of page.
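
Both parsers yield WeixinRequest objects (SoGouSpider/request.py), which are also not shown in the excerpt. A minimal sketch, assuming the class simply extends requests.Request with some scheduling metadata (the default timeout value is a placeholder):

from requests import Request

TIMEOUT = 10  # placeholder default timeout in seconds

class WeixinRequest(Request):
    def __init__(self, url, callback, method='GET', headers=None,
                 need_proxy=False, fail_time=0, timeout=TIMEOUT):
        Request.__init__(self, method, url, headers=headers)
        self.callback = callback      # which parse_* method handles the response
        self.need_proxy = need_proxy  # whether the request should go through a proxy
        self.fail_time = fail_time    # how many times the request has failed so far
        self.timeout = timeout        # per-request timeout used by session.send()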
For the request part, no proxy pool is used; instead a proxy is obtained directly from the provider's proxy API and used for the request.
Free proxies also work, but the efficiency is terrible: requests are frequently rejected, or the program dies with an exception for one reason or another. The free-proxy variant is in SoGouSpiderFree on GitHub (that link really does exist).

 
Partial code (see the link above for the full code):

# -*- coding:utf-8 -*-
import requests
from requests import Session
from SoGouSpider.db import RedisQueue
from SoGouSpider.config import *
from SoGouSpider.request import WeixinRequest
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from requests import ReadTimeout, ConnectionError
from MongoDB import Mongo
import re
class Spider():
    base_url = 'http://weixin.sogou.com/weixin'
    key_word = 'RNG'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Cookie': 'CXID=0DDCDF3AB524336D02F12EE46CD334E4; SUID=D4C85D7C3965860A5AA39D940000145A; '
                   'IPLOC=CN2102; SUV=1531959792494460; '
                   'sct=5; SNUID=8E67BCEBD1D5A2CE38998221D186284B; '
                   'ld=wyllllllll2bFdo5lllllVH5TZGlllllnLLdflllllwlllll9Zlll5@@@@@@@@@@; '
                   'LSTMV=347%2C155; LCLKINT=4472; ABTEST=0|1531961096|v1; weixinIndexVisited=1; '
                   'JSESSIONID=aaa4Ta7_4rS8e9Jqz3Hsw; '
                   'ppinf=5|1532852075|1534061675|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTclOUYlQjMlRTUlQ'
                   'TQlQjR8Y3J0OjEwOjE1MzI4NTIwNzV8cmVmbmljazoxODolRTclOUYlQjMlRTUlQTQlQjR8dXNlcmlkOjQ0Om85dDJsdUI5VE8'
                   '5TE1CemdVRDd5dC10RjI4MzBAd2VpeGluLnNvaHUuY29tfA; '
                   'pprdig=Lk7HiV8rT2LS8uZh0riBcnZ8cokN-aN-Yv5OjbnX3qmZS4SYIg7PnnZqXWsxfPwNF1M-YxeT9PZQxGVw7qc6d15IjwIg'
                   '_2E9537JOqzdHQL34_9ntlXJ_gYE7RCQ-Nt_piMGk9cvi5Ll9oRWWsdK2dUqWTbDnESGbkA07hWhO9E; '
                   'sgid=06-34211517-AVtdd2sgghdVYlHDIz6Ug1U; ppmdig=153286085500000088c26503f3219f8b3403a4a6915fc676',
        'Host': 'weixin.sogou.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
    }
    session = Session()
    queue = RedisQueue()
    mongo = Mongo()
    def get_proxy(self):
        """
        Fetch a proxy from the proxy pool
        :return:
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None
    def start(self):
        """
        Initialization
        """
        # Update the session headers globally
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.key_word, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # Schedule the first request
        print('start:' + str(weixin_request.need_proxy))
        self.queue.add(weixin_request)
    def parse_index(self, response):
        """
        Parse an index page
        :param response: response
        :return: new requests
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        next = doc('#sogou_next').attr('href')
        if next:
            url = self.base_url + str(next)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)
            yield weixin_request
    def parse_detail(self, response):
        """
        Parse a detail page
        :param response: response
        :return: WeChat official account article
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data
    def request(self, weixin_request):
        """
        Execute a request
        :param weixin_request: request
        :return: response
        """
        try:
            print(weixin_request.need_proxy)
            if weixin_request.need_proxy:
                # Proxy server
                proxyHost = "http-dyn.abuyun.com"
                proxyPort = "9020"
                # Proxy tunnel authentication
                proxyUser = "H44849JU5O0CUXJD"
                proxyPass = "9D6530023F397B62"
                proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
                    "host": proxyHost,
                    "port": proxyPort,
                    "user": proxyUser,
                    "pass": proxyPass,
                }
                print('proxy:' + str(proxy))
                if proxy:
                    proxies = {
                        'http': proxy,
                        'https': proxy
                    }
                    # also send the cookie/UA headers and keep the same timeout and redirect behaviour
                    return requests.get(weixin_request.url, headers=self.headers, proxies=proxies,
                                        timeout=weixin_request.timeout, allow_redirects=False)
            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout, allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False
    def error(self, weixin_request):
        """
        Error handling
        :param weixin_request: request
        :return:
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)
    def schedule(self):
        """
        Schedule requests
        :return:
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            # weixin_request.need_proxy = True
            print('schedule:' + str(weixin_request.need_proxy))
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                print('Status Code', response.status_code)
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mongo.insert(result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)
    def run(self):
        """
        Entry point
        :return:
        """
        self.start()
        self.schedule()
if __name__ == '__main__':
    spider = Spider()
    spider.run()
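
The constants imported from SoGouSpider.config (PROXY_POOL_URL, VALID_STATUSES, MAX_FAILED_TIME) and the Mongo wrapper are also only in the full project. A minimal sketch of both, where every concrete value is an assumption:

# SoGouSpider/config.py -- sketch; adjust the values to your setup
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'  # only needed by the free-proxy variant
VALID_STATUSES = [200]                           # status codes treated as a successful response
MAX_FAILED_TIME = 20                             # drop a request after this many failures

# MongoDB.py -- sketch of the wrapper used to store parsed articles
import pymongo

class Mongo():
    def __init__(self):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.db = self.client['weixin']

    def insert(self, data):
        # upsert on title so re-crawled articles are updated instead of duplicated
        self.db['articles'].update_one({'title': data.get('title')}, {'$set': data}, upsert=True)

Both Redis and MongoDB need to be running before spider.run() is called.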

 
The crawl succeeded. (Screenshots: console output; the records in the database viewer; an article's detail view; a comparison with the original article.)
