{"id":988,"date":"2018-07-24T22:52:34","date_gmt":"2018-07-24T14:52:34","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=988"},"modified":"2018-07-24T22:52:34","modified_gmt":"2018-07-24T14:52:34","slug":"%e7%ac%ac%e5%9b%9b%e8%8a%82%ef%bc%9a%e7%88%ac%e5%8f%96%e7%8c%ab%e7%9c%bc%e7%94%b5%e5%bd%b1%e6%8e%92%e8%a1%8c%e6%a6%9c","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/carwler\/988\/","title":{"rendered":"\u7b2c\u56db\u8282\uff1a\u722c\u53d6\u732b\u773c\u7535\u5f71\u6392\u884c\u699c"},"content":{"rendered":"<p>\u672c\u9875\u6e90\u7801\u53ef\u4ee5\u5728<a href=\"https:\/\/github.com\/Sniper970119\/Spider\/tree\/master\/20180721\" target=\"_blank\" rel=\"noopener\">\u8fd9\u91cc<\/a>\u67e5\u770b\u6216\u4e0b\u8f7d\u3002<br \/>\n&nbsp;<br \/>\n\u83b7\u53d6\u9875\u9762\u6e90\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># get page code\ndef get_one_page(url):\n    headers = {\n        'User-agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/67.0.339'\n                      '6.99 Safari\/537.36'\n    }\n    try:\n        responce = requests.get(url=url, headers=headers, timeout=5)\n        if responce.status_code == 200:\n            return responce.text\n        else:\n            return None\n    except RequestException:\n        return None<\/pre>\n<p><img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/32.png\" alt=\"\" class=\"alignnone size-full wp-image-997\" width=\"1033\" height=\"258\" \/><br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/31.png\" alt=\"\" class=\"alignnone size-full wp-image-996\" width=\"1149\" height=\"514\" \/><br \/>\n\u5206\u6790\u6e90\u7801\uff0c\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u5206\u5272\u6e90\u7801\uff0c\u5e76\u4f7f\u7528<a href=\"http:\/\/www.sniper97.cn\/index.php\/category\/note\/python\/yield\" target=\"_blank\" rel=\"noopener\">yield\u5173\u952e\u5b57<\/a>\u5904\u7406\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># parse the message with regex\ndef parse_one_page(html):\n    pattern = re.compile('&lt;dd&gt;.*?board-index.*?&gt;(.*?)&lt;\/i&gt;.*?title=\"(.*?)\".*?data-src=\"(.*?)\".*?class=\"star\"&gt;'\n                         '(.*?)&lt;\/p&gt;.*?releasetime\"&gt;(.*?)&lt;\/p&gt;.*?integer\"&gt;(.*?)&lt;\/i&gt;.*?fraction\"&gt;(.*?)&lt;\/i&gt;', re.S)\n    items = re.findall(pattern=pattern, string=html)\n    for item in items:\n        yield {\n            'index': item[0],\n            'title': item[1].strip(),\n            'image': item[2],\n            'actor': item[3].strip()[3:],\n            'time': item[4].strip()[5:],\n            'score': item[5].strip() + item[6].strip()\n        }\n    return items\n<\/pre>\n<p>\u5c06\u6574\u7406\u540e\u7684\u6570\u636e\u5199\u5165\u6587\u4ef6\uff0c\u901a\u8fc7json\u7684dumps\uff08\uff09\u5b9e\u73b0\u5b57\u5178\u7684\u5e8f\u5217\u5316\uff0c\u5e76\u6307\u5b9aensure_ascii\u53c2\u6570\u4e3aFalse\uff0c\u8fd9\u6837\u53ef\u4ee5\u4fdd\u8bc1\u4e2d\u6587\u5f62\u5f0f\u800c\u4e0d\u662fUnicode\u7f16\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># write to file\ndef write_to_file(content):\n    with open('MaoYanSpider.txt', 'a', encoding='utf-8') as f:\n        f.write(json.dumps(content, ensure_ascii=False) + '\\n')<\/pre>\n<p>\u5904\u7406\u56fe\u7247URL\u5e76\u5c06\u56fe\u7247\u5b58\u50a8\uff0c\u4f7f\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u628a\u539f\u94fe\u63a5\uff08\u5c0f\u56fe\uff09\u540e\u53bb\u6389\uff0c\u53d8\u6210\u5927\u56fe\uff0c\u7136\u540e\u4f7f\u7528\u6587\u4ef6\u4fdd\u5b58\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># get picture\ndef get_picture(content):\n    headers = {\n        'User-agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/67.0.339'\n                      '6.99 Safari\/537.36'\n    }\n    url = content['image']\n    url = re.sub('(@.*)', \"\", url)\n    title = content['title']\n    responce = requests.get(url=url, headers=headers, timeout=5)\n    with open(\".\/image\/\" + title + \".jpg\", 'wb') as f:\n        f.write(responce.content)\n<\/pre>\n<p>&nbsp;<br \/>\n\u4e3b\u51fd\u6570\uff0c\u4f7f\u7528\u5faa\u73af\u904d\u5386\u504f\u79fb\u91cf\uff0c\u5e76\u4f9d\u6b21\u8c03\u7528\u4e0a\u8ff0\u65b9\u6cd5\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># main\nif __name__ == '__main__':\n    offset = 0\n    for i in range(10):\n        url = 'http:\/\/www.maoyan.com\/board\/4?offset=' + str(offset)\n        html = get_one_page(url)\n        # f = open('maoyan.txt', 'w')  # print page code in file\n        # f.write(html)\n        # print(html)                  # print page code in console\n        for item in parse_one_page(html):  # print the handled message\n            # print(item)\n            # write_to_file(item)\n            get_picture(item)\n        offset = offset + 10\n<\/pre>\n<p>\u722c\u53d6\u7ed3\u679c\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/33-e1532445311956.png\" alt=\"\" class=\"alignnone wp-image-998 size-full\" width=\"1013\" height=\"612\" \/><br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/36.png\" alt=\"\" class=\"alignnone size-full wp-image-1004\" width=\"1603\" height=\"854\" \/><br \/>\n&nbsp;<br \/>\n\u5b8c\u6574\u4ee3\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># -*- coding:utf-8 -*-\nimport json\nimport requests\nimport re\nfrom requests.exceptions import RequestException\n# get page code\ndef get_one_page(url):\n    headers = {\n        'User-agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/67.0.339'\n                      '6.99 Safari\/537.36'\n    }\n    try:\n        responce = requests.get(url=url, headers=headers, timeout=5)\n        if responce.status_code == 200:\n            return responce.text\n        else:\n            return None\n    except RequestException:\n        return None\n# parse the message with regex\ndef parse_one_page(html):\n    pattern = re.compile('&lt;dd&gt;.*?board-index.*?&gt;(.*?)&lt;\/i&gt;.*?title=\"(.*?)\".*?data-src=\"(.*?)\".*?class=\"star\"&gt;'\n                         '(.*?)&lt;\/p&gt;.*?releasetime\"&gt;(.*?)&lt;\/p&gt;.*?integer\"&gt;(.*?)&lt;\/i&gt;.*?fraction\"&gt;(.*?)&lt;\/i&gt;', re.S)\n    items = re.findall(pattern=pattern, string=html)\n    for item in items:\n        yield {\n            'index': item[0],\n            'title': item[1].strip(),\n            'image': item[2],\n            'actor': item[3].strip()[3:],\n            'time': item[4].strip()[5:],\n            'score': item[5].strip() + item[6].strip()\n        }\n    return items\n# write to file\ndef write_to_file(content):\n    with open('MaoYanSpider.txt', 'a', encoding='utf-8') as f:\n        f.write(json.dumps(content, ensure_ascii=False) + '\\n')\n# get picture\ndef get_picture(content):\n    headers = {\n        'User-agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/67.0.339'\n                      '6.99 Safari\/537.36'\n    }\n    url = content['image']\n    url = re.sub('(@.*)', \"\", url)\n    title = content['title']\n    responce = requests.get(url=url, headers=headers, timeout=5)\n    with open(\".\/image\/\" + title + \".jpg\", 'wb') as f:\n        f.write(responce.content)\n# main\nif __name__ == '__main__':\n    offset = 0\n    for i in range(10):\n        url = 'http:\/\/www.maoyan.com\/board\/4?offset=' + str(offset)\n        html = get_one_page(url)\n        # f = open('maoyan.txt', 'w')  # print page code in file\n        # f.write(html)\n        # print(html)                  # print page code in console\n        for item in parse_one_page(html):  # print the handled message\n            # print(item)\n            # write_to_file(item)\n            get_picture(item)\n        offset = offset + 10\n<\/pre>\n<p>.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u9875\u6e90\u7801\u53ef\u4ee5\u5728\u8fd9\u91cc\u67e5\u770b\u6216\u4e0b\u8f7d\u3002 &nbsp; \u83b7\u53d6\u9875\u9762\u6e90\u7801\uff1a # get page code def [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[12],"tags":[],"views":3417,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/988"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=988"}],"version-history":[{"count":0,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/988\/revisions"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=988"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/categories?post=988"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/tags?post=988"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}