{"id":1096,"date":"2018-07-29T19:28:00","date_gmt":"2018-07-29T11:28:00","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=1096"},"modified":"2018-07-29T19:28:00","modified_gmt":"2018-07-29T11:28:00","slug":"%e7%ac%ac%e5%8d%81%e5%9b%9b%e8%8a%82%ef%bc%9a%e7%88%ac%e5%8f%96%e6%90%9c%e7%8b%97%e5%be%ae%e4%bf%a1","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/carwler\/1096\/","title":{"rendered":"\u7b2c\u5341\u56db\u8282\uff1a\u722c\u53d6\u641c\u72d7\u5fae\u4fe1"},"content":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728<a href=\"https:\/\/github.com\/Sniper970119\/Spider\/tree\/master\/20180721\" target=\"_blank\" rel=\"noopener\" data-slimstat=\"5\">\u8fd9\u91cc<\/a>\u4e0b\u8f7d\u3002<br \/>\n\u9996\u5148\u6211\u4eec\u9700\u8981\u83b7\u53d6cookies\uff0c\u56e0\u4e3a\u767b\u5f55\u548c\u4e0d\u767b\u5f55\u6240\u80fd\u770b\u5230\u9875\u9762\u6570\u91cf\u662f\u4e0d\u4e00\u6837\u7684\u3002<br \/>\n\u5728\u5f00\u53d1\u8005\u5de5\u5177\u4e2d\u83b7\u53d6\u8bf7\u6c42\u5934\u3002<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/86.png\" alt=\"\" class=\"alignnone size-full wp-image-1098\" width=\"541\" height=\"91\" \/> <img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/87.png\" alt=\"\" class=\"alignnone size-full wp-image-1099\" width=\"641\" height=\"60\" \/><br \/>\n\u8fd9\u91cc\u6211\u6ca1\u6709\u4f7f\u7528\u4e0a\u4e00\u8282\u4f7f\u7528\u7684\u4ee3\u7406\u6c60\uff0c\u56e0\u4e3a\u6211\u53d1\u73b0\u514d\u8d39\u7684\u4ee3\u7406\u6548\u7387\u5f88\u5dee\uff0c\u6240\u4ee5\u8fd9\u91cc\u91c7\u7528\u4ed8\u8d39\u4ee3\u7406\uff0c\u4e0d\u8fc7\u4e0d\u662f\u5f88\u8d35\uff0c\u800c\u4e14\u5141\u8bb8\u4f60\u514d\u8d39\u8bd5\u75283\u5c0f\u65f6\u3002<br \/>\n\u6211\u4eec\u9996\u5148\u6784\u9020\u4e00\u4e2a\u8bf7\u6c42\u961f\u5217\uff0c\u5c06\u76ee\u5f55\u9875\u7684\u6587\u7ae0\u94fe\u63a5\u6293\u53d6\u4e0b\u6765\u4ee5\u540e\u8bbe\u7f6e\u65b0\u7684\u8bf7\u6c42\u6dfb\u52a0\u5230\u961f\u5217\u4e2d\uff0c\u722c\u53d6\u90e8\u5206\u76f4\u63a5\u53bb\u961f\u5217\u4e2d\u53d6\u8bf7\u6c42\u7136\u540e\u6267\u884c\u5373\u53ef\u3002<br \/>\n\u7136\u540e\u9875\u9762\u5904\u7406\u90e8\u5206\u5206\u4e3a\u76ee\u5f55\u5904\u7406\u548c\u6587\u7ae0\u5904\u7406\uff0c\u5bf9\u5e94\u4e0d\u540c\u7684\u9875\u9762\u3002<br \/>\n\u8bf7\u6c42\u90e8\u5206\uff0c\u6ca1\u6709\u4f7f\u7528\u4ee3\u7406\u6c60\uff0c\u800c\u662f\u76f4\u63a5\u4f7f\u7528\u8fd0\u8425\u5546\u63d0\u4f9b\u7684\u4ee3\u7406api\u83b7\u53d6\u4ee3\u7406\u5e76\u8bbf\u95ee\u3002<br \/>\n\u4f7f\u7528\u514d\u8d39\u4ee3\u7406\u4e5f\u662f\u53ef\u4ee5\u7684\uff0c\u4e0d\u8fc7\u6548\u7387\u6781\u5dee\uff0c\u7ecf\u5e38\u6027\u88ab\u62d2\u7edd\u8bbf\u95ee\u6216\u8005\u56e0\u4e3a\u67d0\u4e9b\u539f\u56e0\u76f4\u63a5\u629b\u51fa\u5f02\u5e38\u7ec8\u6b62\u7a0b\u5e8f\uff0c\u514d\u8d39\u4ee3\u7406\u90e8\u5206\u5728github\u7684SoGouSpiderFree\u91cc\u3002<br \/>\n\uff08\u8fd9\u4e2a\u94fe\u63a5\u771f\u5b9e\u5b58\u5728\uff09<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/94.png\" alt=\"\" class=\"alignnone size-full wp-image-1109\" width=\"1789\" height=\"520\" \/><br \/>\n&nbsp;<br \/>\n\u90e8\u5206\u4ee3\u7801\uff08\u5168\u90e8\u4ee3\u7801\u70b9\u4e0a\u9762\u94fe\u63a5\uff09\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># -*- coding:utf-8 -*-\nimport requests\nfrom requests import Session\nfrom SoGouSpider.db import RedisQueue\nfrom SoGouSpider.config import *\nfrom SoGouSpider.request import WeixinRequest\nfrom urllib.parse import urlencode\nfrom pyquery import PyQuery as pq\nfrom requests import ReadTimeout\nfrom MongoDB import Mongo\nimport re\nclass Spider():\n    base_url = 'http:\/\/weixin.sogou.com\/weixin'\n    key_word = 'RNG'\n    headers = {\n        'accept': '*\/*',\n        'accept-encoding': 'gzip, deflate',\n        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',\n        'Connection\t': 'keep-alive',\n        'Cache-Control': 'max-age=0',\n        'cookies': 'CXID=0DDCDF3AB524336D02F12EE46CD334E4; SUID=D4C85D7C3965860A5AA39D940000145A; '\n                   'IPLOC=CN2102; SUV=1531959792494460; '\n                   'sct=5; SNUID=8E67BCEBD1D5A2CE38998221D186284B; '\n                   'ld=wyllllllll2bFdo5lllllVH5TZGlllllnLLdflllllwlllll9Zlll5@@@@@@@@@@; '\n                   'LSTMV=347%2C155; LCLKINT=4472; ABTEST=0|1531961096|v1; weixinIndexVisited=1; '\n                   'JSESSIONID=aaa4Ta7_4rS8e9Jqz3Hsw; '\n                   'ppinf=5|1532852075|1534061675|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTclOUYlQjMlRTUlQ'\n                   'TQlQjR8Y3J0OjEwOjE1MzI4NTIwNzV8cmVmbmljazoxODolRTclOUYlQjMlRTUlQTQlQjR8dXNlcmlkOjQ0Om85dDJsdUI5VE8'\n                   '5TE1CemdVRDd5dC10RjI4MzBAd2VpeGluLnNvaHUuY29tfA; '\n                   'pprdig=Lk7HiV8rT2LS8uZh0riBcnZ8cokN-aN-Yv5OjbnX3qmZS4SYIg7PnnZqXWsxfPwNF1M-YxeT9PZQxGVw7qc6d15IjwIg'\n                   '_2E9537JOqzdHQL34_9ntlXJ_gYE7RCQ-Nt_piMGk9cvi5Ll9oRWWsdK2dUqWTbDnESGbkA07hWhO9E; '\n                   'sgid=06-34211517-AVtdd2sgghdVYlHDIz6Ug1U; ppmdig=153286085500000088c26503f3219f8b3403a4a6915fc676',\n        'Host': 'weixin.sogou.com',\n        'user-agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko\/20100101 Firefox\/61.0'\n    }\n    session = Session()\n    queue = RedisQueue()\n    mongo = Mongo()\n    def get_proxy(self):\n        \"\"\"\n        \u4ece\u4ee3\u7406\u6c60\u83b7\u53d6\u4ee3\u7406\n        :return:\n        \"\"\"\n        try:\n            response = requests.get(PROXY_POOL_URL)\n            if response.status_code == 200:\n                print('Get Proxy', response.text)\n                return response.text\n            return None\n        except requests.ConnectionError:\n            return None\n    def start(self):\n        \"\"\"\n        \u521d\u59cb\u5316\u5de5\u4f5c\n        \"\"\"\n        # \u5168\u5c40\u66f4\u65b0Headers\n        self.session.headers.update(self.headers)\n        start_url = self.base_url + '?' + urlencode({'query': self.key_word, 'type': 2})\n        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)\n        # \u8c03\u5ea6\u7b2c\u4e00\u4e2a\u8bf7\u6c42\n        print('start:' + str(weixin_request.need_proxy))\n        self.queue.add(weixin_request)\n    def parse_index(self, response):\n        \"\"\"\n        \u89e3\u6790\u7d22\u5f15\u9875\n        :param response: \u54cd\u5e94\n        :return: \u65b0\u7684\u54cd\u5e94\n        \"\"\"\n        doc = pq(response.text)\n        items = doc('.news-box .news-list li .txt-box h3 a').items()\n        for item in items:\n            url = item.attr('href')\n            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)\n            yield weixin_request\n        next = doc('#sogou_next').attr('href')\n        if next:\n            url = self.base_url + str(next)\n            weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)\n            yield weixin_request\n    def parse_detail(self, response):\n        \"\"\"\n        \u89e3\u6790\u8be6\u60c5\u9875\n        :param response: \u54cd\u5e94\n        :return: \u5fae\u4fe1\u516c\u4f17\u53f7\u6587\u7ae0\n        \"\"\"\n        doc = pq(response.text)\n        data = {\n            'title': doc('.rich_media_title').text(),\n            'content': doc('.rich_media_content').text(),\n            'date': doc('#publish_time').text(),\n            'nickname': doc('#js_profile_qrcode &gt; div &gt; strong').text(),\n            'wechat': doc('#js_profile_qrcode &gt; div &gt; p:nth-child(3) &gt; span').text()\n        }\n        yield data\n    def request(self, weixin_request):\n        \"\"\"\n        \u6267\u884c\u8bf7\u6c42\n        :param weixin_request: \u8bf7\u6c42\n        :return: \u54cd\u5e94\n        \"\"\"\n        try:\n            print(weixin_request.need_proxy)\n            if weixin_request.need_proxy:\n                # \u4ee3\u7406\u670d\u52a1\u5668\n                proxyHost = \"http-dyn.abuyun.com\"\n                proxyPort = \"9020\"\n                # \u4ee3\u7406\u96a7\u9053\u9a8c\u8bc1\u4fe1\u606f\n                proxyUser = \"H44849JU5O0CUXJD\"\n                proxyPass = \"9D6530023F397B62\"\n                proxy = \"http:\/\/%(user)s:%(pass)s@%(host)s:%(port)s\" % {\n                    \"host\": proxyHost,\n                    \"port\": proxyPort,\n                    \"user\": proxyUser,\n                    \"pass\": proxyPass,\n                }\n                print('proxy:' + str(proxy))\n                if proxy:\n                    proxies = {\n                        'http': proxy,\n                        'https': proxy\n                    }\n                    return requests.get(weixin_request.url, proxies=proxies)\n            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout, allow_redirects=False)\n        except (ConnectionError, ReadTimeout) as e:\n            print(e.args)\n            return False\n    def error(self, weixin_request):\n        \"\"\"\n        \u9519\u8bef\u5904\u7406\n        :param weixin_request: \u8bf7\u6c42\n        :return:\n        \"\"\"\n        weixin_request.fail_time = weixin_request.fail_time + 1\n        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)\n        if weixin_request.fail_time &lt; MAX_FAILED_TIME:\n            self.queue.add(weixin_request)\n    def schedule(self):\n        \"\"\"\n        \u8c03\u5ea6\u8bf7\u6c42\n        :return:\n        \"\"\"\n        while not self.queue.empty():\n            weixin_request = self.queue.pop()\n            # weixin_request.need_proxy = True\n            print('schedule:' + str(weixin_request.need_proxy))\n            callback = weixin_request.callback\n            print('Schedule', weixin_request.url)\n            response = self.request(weixin_request)\n            print('\u72b6\u6001\u7801' + str(response.status_code))\n            if response and response.status_code in VALID_STATUSES:\n                results = list(callback(response))\n                if results:\n                    for result in results:\n                        print('New Result', type(result))\n                        if isinstance(result, WeixinRequest):\n                            self.queue.add(result)\n                        if isinstance(result, dict):\n                            self.mongo.insert(result)\n                else:\n                    self.error(weixin_request)\n            else:\n                self.error(weixin_request)\n    def run(self):\n        \"\"\"\n        \u5165\u53e3\n        :return:\n        \"\"\"\n        self.start()\n        self.schedule()\nif __name__ == '__main__':\n    spider = Spider()\n    spider.run()\n<\/pre>\n<p>&nbsp;<br \/>\n\u722c\u53d6\u6210\u529f\uff0c<br \/>\nconsole\u8f93\u51fa\uff1a<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/88.png\" alt=\"\" class=\"alignnone size-full wp-image-1100\" width=\"1319\" height=\"723\" \/><br \/>\n&nbsp;<br \/>\n\u53ef\u89c6\u5316\u6570\u636e\u5e93\u4e2d\uff1a<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/89.png\" alt=\"\" class=\"alignnone size-full wp-image-1101\" width=\"1288\" height=\"517\" \/><br \/>\n&nbsp;<br \/>\n\u67e5\u770b\u8be6\u60c5\uff1a<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/91.png\" alt=\"\" class=\"alignnone size-full wp-image-1102\" width=\"1264\" height=\"879\" \/><br \/>\n\u548c\u539f\u6587\u505a\u5bf9\u6bd4\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/92.png\" alt=\"\" class=\"alignnone size-full wp-image-1103\" width=\"792\" height=\"888\" \/><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728\u8fd9\u91cc\u4e0b\u8f7d\u3002 \u9996\u5148\u6211\u4eec\u9700\u8981\u83b7\u53d6cookies\uff0c\u56e0\u4e3a\u767b\u5f55\u548c\u4e0d\u767b\u5f55\u6240\u80fd\u770b\u5230\u9875\u9762\u6570\u91cf\u662f\u4e0d\u4e00\u6837\u7684 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[12],"tags":[],"views":4591,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1096"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=1096"}],"version-history":[{"count":0,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1096\/revisions"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=1096"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/categories?post=1096"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/tags?post=1096"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}