{"id":1054,"date":"2018-07-26T17:17:31","date_gmt":"2018-07-26T09:17:31","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=1054"},"modified":"2018-07-26T17:17:31","modified_gmt":"2018-07-26T09:17:31","slug":"%e7%ac%ac%e4%b9%9d%e8%8a%82%ef%bc%9a%e7%88%ac%e5%8f%96%e4%bb%8a%e6%97%a5%e5%a4%b4%e6%9d%a1","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/carwler\/1054\/","title":{"rendered":"\u7b2c\u4e5d\u8282\uff1a\u722c\u53d6\u4eca\u65e5\u5934\u6761"},"content":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728<a href=\"https:\/\/github.com\/Sniper970119\/Spider\/tree\/master\/20180721\" target=\"_blank\" rel=\"noopener\" data-slimstat=\"5\">\u8fd9\u91cc<\/a>\u4e0b\u8f7d\u3002<\/p>\n<h4>1.\u5206\u6790\u7f51\u7ad9<\/h4>\n<p>\u9996\u5148\u5206\u6790\u8bf7\u6c42\u62a5\u6587\uff0c\u53d6\u51fa\u6bd4\u8f83\u5173\u952e\u7684\u6d88\u606f\u5934\u5c5e\u6027\u3002<\/p>\n<h4><img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/69.png\" alt=\"\" class=\"alignnone size-full wp-image-1056\" width=\"558\" height=\"326\" \/><\/h4>\n<p>\u53e6\u5916\u901a\u8fc7\u5206\u6790\u8bf7\u6c42URL\u6211\u4eec\u53ef\u4ee5\u5f97\u51fa\u8bf7\u6c42\u5730\u5740\uff0c<\/p>\n<pre data-enlighter-language=\"python\" class=\"EnlighterJSRAW\">base_url = <span>'https:\/\/www.toutiao.com\/api\/pc\/feed\/?'<\/span><\/pre>\n<p>\u5c5e\u6027\u6709\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"null\">attrs = {\n    'category': 'news_tech',\n    'utm_source': 'toutiao',\n    'widen': '1',\n    'max_behot_time': '0',\n    'max_behot_time_tmp': '0',\n    'tadrequire': 'true',\n    'as': 'A1353B4569B7E17',\n    'cp': '5B59A71E724A1E1',\n    '_signature': 'g9UFGQAA2Jkp-kWZLlPu9YPVBQ'\n}\n<\/pre>\n<p>\u5176\u4e2dmax_behot_time_tmp\u662f\u54cd\u5e94\u7b2c\u4e00\u4e2a\u6587\u7ae0\u7684id\uff0c\u7c7b\u4f3c\u4e8eoffset\uff0c\u5f53\u4e3a0\u65f6\uff0c\u670d\u52a1\u5668\u4f1a\u968f\u673a\u53d1\u9001\u3002<br 
\/>\n\u7136\u540e\u6211\u4eec\u67e5\u770bAjax\uff0c\u89c2\u5bdf\u54cd\u5e94\u6d88\u606f\u683c\u5f0f\u3002<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/68.png\" alt=\"\" class=\"alignnone size-full wp-image-1055\" width=\"1368\" height=\"904\" \/> \u786e\u5b9a\u8981\u6293\u53d6\u7684\u5c5e\u6027\uff0c\u5176\u4e2dgroup_id\u5c31\u662f\u6587\u7ae0\u7684\u552f\u4e00\u6807\u8bc6\u3002<\/p>\n<h4><img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/70.png\" alt=\"\" class=\"alignnone size-full wp-image-1057\" width=\"1206\" height=\"527\" \/><\/h4>\n<h4><img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/71.png\" alt=\"\" class=\"alignnone size-full wp-image-1058\" width=\"1043\" height=\"860\" \/><\/h4>\n<p>&nbsp;<br \/>\n\u9996\u5148\u83b7\u53d6\u7f51\u9875json\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># get page\ndef get_page():\n    url = base_url + urlencode(attrs)\n    try:\n        responce = requests.get(url=url, headers=headers)\n        if responce.status_code == 200:\n            return responce.json()\n    except requests.ConnectionError as e:\n        print('Error', e.args)\n<\/pre>\n<p>\u968f\u540e\u5904\u7406json\uff0c\u83b7\u53d6\u6211\u4eec\u9700\u8981\u7684\u4fe1\u606f\uff0c\u8fd9\u91cc\u6709\u4e00\u4e9b\u6570\u636e\u6709\u4e9b\u5c5e\u6027\u6ca1\u6709\uff0c\u6240\u4ee5\u6211\u4eec\u8981\u8fdb\u884c\u5224\u65ad\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># parse the json\ndef parse_page(json):\n    if json:\n        items = json.get('data')\n        for item in items:\n            toutiao = {}\n            toutiao['id'] = item['group_id']\n            toutiao['title'] = item['title']\n            if 'chinese_tag' in item:\n                toutiao['chinese_tag'] = item['chinese_tag']\n            else:\n                
toutiao['chinese_tag'] = 'NULL'\n            if 'comments_count' in item:\n                toutiao['comments_count'] = item['comments_count']\n            else:\n                toutiao['comments_count'] = 'NULL'\n            if 'image_url' in item:\n                toutiao['image_url'] = item['image_url']\n            else:\n                toutiao['image_url'] = 'NULL'\n            toutiao['is_feed_ad'] = item['is_feed_ad']\n            toutiao['source'] = item['source']\n            toutiao['source_url'] = item['source_url']\n            yield toutiao<\/pre>\n<p>\u5b58\u5165\u6570\u636e\u5e93\uff0c\u5b58\u5165\u4e4b\u524d\u4f9d\u9760\u6587\u7ae0id\u8fdb\u884c\u91cd\u590d\u6027\u68c0\u67e5\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># save in mongo\ndef save_to_mongo(result):\n    id = result['id']\n    client = pymongo.MongoClient(host='localhost', port=27017)\n    db = client.Test\n    collection = db.TouTiao\n    if_have = collection.find_one({'id': id})\n    if if_have is None:\n        collection.insert(result)<\/pre>\n<p>\u4e3b\u51fd\u6570\uff0c\u904d\u5386100\u6b21\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># main\nif __name__ == '__main__':\n    for i in range(100):\n        json = get_page()\n        results = parse_page(json)\n        for result in results:\n            save_to_mongo(result)\n<\/pre>\n<p>\u722c\u53d6\u6210\u529f\uff1a<\/p>\n<h4><img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/72.png\" alt=\"\" class=\"alignnone size-full wp-image-1059\" width=\"1136\" height=\"774\" \/><\/h4>\n<p>\u53ef\u4ee5\u770b\u5230\u5176\u4e2d\u90e8\u5206\u6570\u636e\u67d0\u4e9b\u5c5e\u6027\u662f\u4e3a\u7a7a\u7684\uff1a<\/p>\n<h4><img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/74.png\" alt=\"\" class=\"alignnone size-full wp-image-1061\" width=\"1202\" height=\"564\" 
\/><\/h4>\n<p>\u67e5\u770b\u53ef\u89c6\u5316\u754c\u9762\uff0c\u4e00\u5171\u83b7\u5f97\u4e86917\u6761\u6570\u636e\u3002<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/73.png\" alt=\"\" class=\"alignnone size-full wp-image-1060\" width=\"761\" height=\"410\" \/> \u5b8c\u6574\u4ee3\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># -*- coding:utf-8 -*-\nfrom urllib.parse import urlencode\nimport requests\nimport pymongo\nbase_url = 'https:\/\/www.toutiao.com\/api\/pc\/feed\/?'\nheaders = {\n    'referer': 'https:\/\/www.toutiao.com\/ch\/news_tech\/',\n    'user-agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko)'\n                  ' Chrome\/67.0.3396.99 Safari\/537.36',\n    'cookie': 'tt_webid=6582430453025900040; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=164d592b6b577c-'\n              '01bbb425c262e8-47e1039-1fa400-164d592b6b6193; CNZZDATA1259612802=1935731616-1532588410-'\n              'https%253A%252F%252Fwww.baidu.com%252F%7C1532588410; tt_webid=6582430453025900040; __'\n              'tasessionId=fm5a1x2sm1532591519911; csrftoken=4298418f9ec85b57aa4d9e5b781bed87;'\n              ' uuid=\"w:a033d21de37d453780932620f5d81416\"',\n    'x-requested-with': 'XMLHttpRequest'\n}\nattrs = {\n    'category': 'news_tech',\n    'utm_source': 'toutiao',\n    'widen': '1',\n    'max_behot_time': '0',\n    'max_behot_time_tmp': '0',\n    'tadrequire': 'true',\n    'as': 'A1353B4569B7E17',\n    'cp': '5B59A71E724A1E1',\n    '_signature': 'g9UFGQAA2Jkp-kWZLlPu9YPVBQ'\n}\n# get page\ndef get_page():\n    url = base_url + urlencode(attrs)\n    try:\n        responce = requests.get(url=url, headers=headers)\n        if responce.status_code == 200:\n            return responce.json()\n    except requests.ConnectionError as e:\n        print('Error', e.args)\n# parse the json\ndef parse_page(json):\n    if json:\n        items = 
json.get('data')\n        for item in items:\n            toutiao = {}\n            toutiao['id'] = item['group_id']\n            toutiao['title'] = item['title']\n            if 'chinese_tag' in item:\n                toutiao['chinese_tag'] = item['chinese_tag']\n            else:\n                toutiao['chinese_tag'] = 'NULL'\n            if 'comments_count' in item:\n                toutiao['comments_count'] = item['comments_count']\n            else:\n                toutiao['comments_count'] = 'NULL'\n            if 'image_url' in item:\n                toutiao['image_url'] = item['image_url']\n            else:\n                toutiao['image_url'] = 'NULL'\n            toutiao['is_feed_ad'] = item['is_feed_ad']\n            toutiao['source'] = item['source']\n            toutiao['source_url'] = item['source_url']\n            yield toutiao\n# save in mongo\ndef save_to_mongo(result):\n    id = result['id']\n    client = pymongo.MongoClient(host='localhost', port=27017)\n    db = client.Test\n    collection = db.TouTiao\n    if_have = collection.find_one({'id': id})\n    if if_have is None:\n        collection.insert(result)\n# main\nif __name__ == '__main__':\n    for i in range(100):\n        json = get_page()\n        results = parse_page(json)\n        for result in results:\n            save_to_mongo(result)\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728\u8fd9\u91cc\u4e0b\u8f7d\u3002 1.\u5206\u6790\u7f51\u7ad9 \u9996\u5148\u5206\u6790\u8bf7\u6c42\u62a5\u6587\uff0c\u53d6\u51fa\u6bd4\u8f83\u5173\u952e\u7684\u6d88\u606f\u5934\u5c5e\u6027\u3002 \u53e6\u5916\u901a\u8fc7\u5206\u6790\u8bf7 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[12],"tags":[],"views":2911,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1054"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=1054"}],"version-history":[{"count":0,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1054\/revisions"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=1054"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/categories?post=1054"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/tags?post=1054"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}