{"id":1071,"date":"2018-07-27T09:20:15","date_gmt":"2018-07-27T01:20:15","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=1071"},"modified":"2018-07-27T09:20:15","modified_gmt":"2018-07-27T01:20:15","slug":"%e7%ac%ac%e5%8d%81%e4%b8%80%e8%8a%82%ef%bc%9a%e7%88%ac%e5%8f%96%e6%b7%98%e5%ae%9d%e5%95%86%e5%93%81","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/carwler\/1071\/","title":{"rendered":"\u7b2c\u5341\u4e00\u8282\uff1a\u722c\u53d6\u6dd8\u5b9d\u5546\u54c1"},"content":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728<a href=\"https:\/\/github.com\/Sniper970119\/Spider\/tree\/master\/20180721\" target=\"_blank\" rel=\"noopener\" data-slimstat=\"5\">\u8fd9\u91cc<\/a>\u4e0b\u8f7d\u3002<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/76.png\" alt=\"\" class=\"alignnone size-full wp-image-1073\" width=\"1072\" height=\"539\" \/><br \/>\n\u89c2\u5bdf\u8f93\u5165\u6570\u5b57\u7684\u8282\u70b9\u548c\u786e\u5b9a\u7684\u8282\u70b9\u3002\u7136\u540e\u6a21\u62df\u70b9\u51fb\u6362\u9875\uff0c\u4e0d\u8fc7\u6dd8\u5b9d\u6700\u8fd1\u5df2\u7ecf\u53ef\u4ee5\u5728URL\u4e2d\u52a0\u4e86\u4e00\u4e2aselenium\u53c2\u6570\uff0c\u5df2\u7ecf\u53ef\u4ee5\u5bf9selenium\u8fdb\u884c\u8bc6\u522b\uff0c\u6240\u4ee5\u8fd9\u79cd\u65b9\u6cd5\u6682\u65f6\u5931\u6548\uff0c\u53ea\u80fd\u722c\u51fa\u524d\u51e0\u9875\u7684\u5546\u54c1\u3002<br \/>\n&nbsp;<br \/>\n\u4ee3\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># -*- coding:utf-8 -*-\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.support.ui import WebDriverWait\nfrom selenium.webdriver.support import expected_conditions as EC\nfrom selenium.common.exceptions import TimeoutException\nfrom pyquery import PyQuery as pq\nimport pymongo\nimport re\nchrome_options = webdriver.ChromeOptions()\nchrome_options.add_argument('--headless')\nbrowser = 
webdriver.Chrome(chrome_options=chrome_options)\nwait = WebDriverWait(browser, 10)\nKEY_WORD = 'iPhone'\nbase_url = 'https:\/\/s.taobao.com\/search?q='\ndef get_page(page):\n    try:\n        print('******\u83b7\u53d6\u9875\u9762\u6e90\u7801******')\n        url = base_url + KEY_WORD\n        browser.get(url)\n        if page &gt; 1:\n            input = wait.until(\n                EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form &gt; input')))\n            submit = wait.until(\n                EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form &gt; span.btn.J_Submit')))\n            input.clear()\n            input.send_keys(page)\n            submit.click()\n        wait.until(\n            EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active &gt; span'), str(page)))\n        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))\n        print('******\u83b7\u53d6\u6e90\u7801\u6210\u529f******')\n        # time.sleep(5)\n        print(browser.current_url)\n        parse_page()\n    except TimeoutException:\n        print('Error:********\u83b7\u53d6\u6e90\u7801\u8d85\u65f6********')\ndef parse_page():\n    print('******\u622a\u53d6\u5546\u54c1\u4fe1\u606f******')\n    html_code = browser.page_source\n    doc = pq(html_code)\n    items = doc('#mainsrp-itemlist .items .item').items()\n    for item in items:\n        product = {\n            'good id': item.find('.pic .pic-link').attr('data-nid'),\n            'image': item.find('.pic .img').attr('data-src'),\n            'price': re.sub('\\s+', '', item.find('.g_price-highlight').text()),\n            'deal': item.find('.deal-cnt').text(),\n            'shop': item.find('.ww-light.ww-small').attr('data-nick'),\n            'shop id': item.find('.shopname').attr('data-userid'),\n            'location': item.find('.location').text(),\n        }\n        save_to_mongo(product)\ndef save_to_mongo(product):\n    
client = pymongo.MongoClient(host='localhost', port=27017)\n    db = client.Test\n    collection = db.TaoBao\n    id = product['good id']\n    result = collection.find_one({'good id': id})\n    if result is None:\n        collection.insert(product)\n        print('******\u4fdd\u5b58\u5230\u6570\u636e\u5e93' + str(id) + '******')\n    else:\n        print(str(result) + \"      \" + str(id))\nif __name__ == '__main__':\n    for page in range(1, 100):\n        print('\\n*******\u7b2c' + str(page) + '\u6b21\u722c\u53d6*******')\n        html = get_page(page)\n        # products = parse_page(html)\n        # for product in products:\n        #     save_to_mongo(product)\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728\u8fd9\u91cc\u4e0b\u8f7d\u3002 \u89c2\u5bdf\u8f93\u5165\u6570\u5b57\u7684\u8282\u70b9\u548c\u786e\u5b9a\u7684\u8282\u70b9\u3002\u7136\u540e\u6a21\u62df\u70b9\u51fb\u6362\u9875\uff0c\u4e0d\u8fc7\u6dd8\u5b9d\u6700\u8fd1\u5df2\u7ecf\u53ef\u4ee5\u5728U 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[12],"tags":[],"views":4001,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1071"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=1071"}],"version-history":[{"count":0,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1071\/revisions"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=1071"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/categories?post=1071"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/tags?post=1071"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}