{"id":1000,"date":"2018-07-25T19:13:13","date_gmt":"2018-07-25T11:13:13","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=1000"},"modified":"2018-07-25T19:13:13","modified_gmt":"2018-07-25T11:13:13","slug":"%e7%ac%ac%e4%ba%94%e8%8a%82%ef%bc%9a%e8%a7%a3%e6%9e%90%e5%ba%93%e7%9a%84%e4%bd%bf%e7%94%a8","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/carwler\/1000\/","title":{"rendered":"\u7b2c\u4e94\u8282\uff1a\u89e3\u6790\u5e93\u7684\u4f7f\u7528"},"content":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728<a href=\"https:\/\/github.com\/Sniper970119\/Spider\/tree\/master\/20180721\" target=\"_blank\" rel=\"noopener\">\u8fd9\u91cc<\/a>\u4e0b\u8f7d\u3002<\/p>\n<h3>1.\u4f7f\u7528XPath<\/h3>\n<p>\u5168\u79f0 XML Path Language\uff0c\u5373XML\u8bed\u8a00\u8def\u5f84\u3002<br \/>\n\u5e38\u7528\u89c4\u5219\uff1a<br \/>\nnodename\u00a0\u00a0\u00a0\u00a0\u00a0 \u9009\u53d6\u6b64\u8282\u70b9\u7684\u6240\u6709\u5b50\u8282\u70b9<br \/>\n\/\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u4ece\u5f53\u524d\u8282\u70b9\u9009\u53d6\u76f4\u63a5\u5b50\u8282\u70b9<br \/>\n\/\/\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u4ece\u5f53\u524d\u7ed3\u70b9\u9009\u53d6\u5b50\u5b59\u8282\u70b9<br \/>\n.\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u9009\u53d6\u5f53\u524d\u8282\u70b9<br \/>\n..\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u9009\u53d6\u5f53\u524d\u7ed3\u70b9\u7684\u7236\u8282\u70b9<br \/>\n@\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 \u9009\u53d6\u5c5e\u6027<br \/>\nxml\u6587\u4ef6\uff1a<br \/>\n<img loading=\"lazy\" 
decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/35.png\" alt=\"\" class=\"alignnone size-full wp-image-1002\" width=\"734\" height=\"264\" \/><br \/>\n\u4ee3\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">html = etree.parse('test.xml', etree.HTMLParser())\nresult = etree.tostring(html)\nprint(result.decode('utf-8'))<\/pre>\n<p>\u7ed3\u679c\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/34.png\" alt=\"\" class=\"alignnone size-full wp-image-1001\" width=\"755\" height=\"261\" \/><br \/>\n\u6211\u4eec\u770b\u5230\uff0c\u5373\u4f7f\u7f3a\u5c11\u6807\u7b7e\uff0c\u4e5f\u53ef\u4ee5\u81ea\u52a8\u8865\u5168\u3002<\/p>\n<h5>\u6240\u6709\u7ed3\u70b9\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; all\nhtml = etree.parse('test.xml', etree.HTMLParser())\nresult = html.xpath('\/\/*')\nprint(result)\n<\/pre>\n<p>\u7ed3\u679c\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"null\">[\n&lt;Element html at 0x1d19a6d9a48&gt;,\n&lt;Element body at 0x1d19a6d9b48&gt;,\n&lt;Element div at 0x1d19a6d9b88&gt;,\n&lt;Element ul at 0x1d19a6d9bc8&gt;,\n&lt;Element li at 0x1d19a6d9c08&gt;,\n&lt;Element a at 0x1d19a6d9c88&gt;,\n&lt;Element li at 0x1d19a6d9cc8&gt;,\n&lt;Element a at 0x1d19a6d9d08&gt;,\n&lt;Element li at 0x1d19a6d9d48&gt;,\n&lt;Element a at 0x1d19a6d9c48&gt;,\n&lt;Element li at 0x1d19a6d9d88&gt;,\n&lt;Element a at 0x1d19a6d9dc8&gt;,\n&lt;Element li at 0x1d19a6d9e08&gt;,\n&lt;Element a at 0x1d19a6d9e48&gt;\n]\n<\/pre>\n<h5>\u5b50\u8282\u70b9\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; son\nhtml = etree.parse('test.xml', etree.HTMLParser())\nresult = html.xpath('\/\/li\/a')\nprint(result)<\/pre>\n<p>\u7ed3\u679c\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"null\">[\n&lt;Element a at 
0x26e0af09b48&gt;,\n&lt;Element a at 0x26e0af09b88&gt;,\n&lt;Element a at 0x26e0af09bc8&gt;,\n&lt;Element a at 0x26e0af09c08&gt;,\n&lt;Element a at 0x26e0af09c48&gt;\n]<\/pre>\n<p>\u8f93\u51fa\u4e86li\u4e0b\u6240\u6709\u5b50\u8282\u70b9a\u3002<\/p>\n<h5>\u7236\u8282\u70b9\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; father\nhtml = etree.parse('test.xml', etree.HTMLParser())\nresult = html.xpath('\/\/a[@href=\"link4.html\"]\/..\/@class')\nprint(result)<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n[&#8216;item-0&#8217;]<\/p>\n<h5>\u5c5e\u6027\u83b7\u53d6\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; get attribute\nhtml = etree.parse('test.xml', etree.HTMLParser())\nresult = html.xpath('\/\/li\/a\/@href')\nprint(result)\n<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n[&#8216;link1.html&#8217;, &#8216;link2.html&#8217;, &#8216;link3.html&#8217;, &#8216;link4.html&#8217;, &#8216;link5.html&#8217;]<\/p>\n<h5>\u5c5e\u6027\u591a\u503c\u83b7\u53d6\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; get attribute which have more than one values\ntext = '''\n&lt;li class=\"li li-first\"&gt;&lt;a href=\"link.html\"&gt;first item&lt;\/a&gt;&lt;\/li&gt;\n'''\nhtml = etree.HTML(text)\nresult = html.xpath('\/\/li[contains(@class, \"li\")]\/a\/text()')\nprint(result)<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n[&#8216;first item&#8217;]<\/p>\n<h5>\u591a\u5c5e\u6027\u503c\u5339\u914d\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; match message by more than one attribute\ntext = '''\n&lt;li class=\"li li-first\" name=\"item\"&gt;&lt;a href=\"link.html\"&gt;first item&lt;\/a&gt;&lt;\/li&gt;\n'''\nhtml = etree.HTML(text)\nresult = html.xpath('\/\/li[contains(@class, \"li\")and @name=\"item\"]\/a\/text()')\nprint(result)<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n[&#8216;first 
item&#8217;]<\/p>\n<h5>\u6309\u5e8f\u8f93\u51fa\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; output by order\ntext = '''\n&lt;li class=\"li li-first\" name=\"item\"&gt;&lt;a href=\"link.html\"&gt;first item&lt;\/a&gt;&lt;\/li&gt;\n'''\nhtml = etree.HTML(text)\nresult = html.xpath('\/\/li[1]\/a\/text()')         # print the first node\nprint(result)\nresult = html.xpath('\/\/li[last()]\/a\/text()')    # print the last node\nprint(result)\nresult = html.xpath('\/\/li[position()&lt;3]\/a\/text()')    # print the nodes whose position is smaller than 3\nprint(result)\nresult = html.xpath('\/\/li[last()-2]\/a\/text()')  # print the antepenultimate node\nprint(result)<\/pre>\n<h5>\u8282\u70b9\u8f74\u9009\u62e9\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># xml test -&gt; node axis\ntext = '''\n&lt;li class=\"li li-first\" name=\"item\"&gt;&lt;a href=\"link.html\"&gt;first item&lt;\/a&gt;&lt;\/li&gt;\n'''\nhtml = etree.HTML(text)\nresult = html.xpath('\/\/li[1]\/ancestor::*')\nprint(result)\nresult = html.xpath('\/\/li[1]\/ancestor::div')\nprint(result)\nresult = html.xpath('\/\/li[1]\/attribute::*')\nprint(result)\nresult = html.xpath('\/\/li[1]\/child::a[@href=\"link1.html\"]')\nprint(result)\nresult = html.xpath('\/\/li[1]\/descendant::span')\nprint(result)\nresult = html.xpath('\/\/li[1]\/following::*[2]')\nprint(result)\nresult = html.xpath('\/\/li[1]\/following-sibling::*')\nprint(result)<\/pre>\n<p>\u7b2c\u4e00\u6b21\u9009\u62e9\uff1a\u4f7f\u7528ancestor\u8f74\uff0c\u53ef\u4ee5\u83b7\u53d6\u6240\u6709\u7956\u5148\u8282\u70b9\u3002<br \/>\n\u7b2c\u4e8c\u6b21\u9009\u62e9\uff1a\u53ea\u8fd4\u56dediv\u8fd9\u4e2a\u7956\u5148\u8282\u70b9\u3002<br \/>\n\u7b2c\u4e09\u6b21\u9009\u62e9\uff1a\u8c03\u7528attribute\u8f74\uff0c\u53ef\u4ee5\u83b7\u53d6\u6240\u6709\u5c5e\u6027\u503c\u3002<br 
\/>\n\u7b2c\u56db\u6b21\u9009\u62e9\uff1a\u8c03\u7528child\u8f74\uff0c\u53ef\u4ee5\u83b7\u53d6\u6240\u6709\u76f4\u63a5\u5b50\u8282\u70b9\uff08\u8fd9\u91cc\u52a0\u4e86\u9650\u5236\u6761\u4ef6\uff09\u3002<br \/>\n\u7b2c\u4e94\u6b21\u9009\u62e9\uff1a\u8c03\u7528descendant\u8f74\uff0c\u53ef\u4ee5\u83b7\u53d6\u6240\u6709\u5b50\u5b59\u8282\u70b9\u3002<br \/>\n\u7b2c\u516d\u6b21\u9009\u62e9\uff1a\u8c03\u7528following\u8f74\uff0c\u53ef\u4ee5\u83b7\u53d6\u5f53\u524d\u7ed3\u70b9\u4e4b\u540e\u7684\u6240\u6709\u8282\u70b9\u3002<br \/>\n\u7b2c\u4e03\u6b21\u9009\u62e9\uff1a\u8c03\u7528following-sibling\u8f74\uff0c\u53ef\u4ee5\u83b7\u5f97\u5f53\u524d\u8282\u70b9\u4e4b\u540e\u7684\u6240\u6709\u540c\u7ea7\u8282\u70b9\u3002<\/p>\n<h3>2.\u4f7f\u7528Beautiful Soup<\/h3>\n<p>\u662f\u4e00\u4e2aPython\u7684HTML\u3001XML\u89e3\u6790\u5e93\uff0c\u7528\u5b83\u53ef\u4ee5\u65b9\u4fbf\u7684\u4ece\u7f51\u9875\u4e2d\u67d0\u4e2a\u5143\u7d20\u4e2d\u63d0\u53d6\u6570\u636e\u3002<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/37.png\" alt=\"\" class=\"alignnone size-full wp-image-1007\" width=\"981\" height=\"166\" \/><br \/>\n\u57fa\u672c\u7528\u6cd5\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># beautiful soup test\nhtml = '''\n&lt;html&gt;\n    &lt;head&gt;\n        &lt;title&gt;The Dormouse's story&lt;\/title&gt;\n    &lt;\/head&gt;\n    &lt;body&gt;\n        &lt;p class=\"title\" name=\"dromouse\"&gt;&lt;b&gt;The Dormouse's story&lt;\/b&gt;&lt;\/p&gt;\n        &lt;p class=\"story\"&gt;Once upon a time there were three little sisters;and their names were\n            &lt;a href=\"http:\/\/www.baidu.com\/01\" class=\"sister\" id=\"link1\"&gt;&lt;!--Elsie--&gt;&lt;\/a&gt;\n            &lt;a href=\"http:\/\/www.baidu.com\/02\" class=\"sister\" id=\"link2\"&gt;Lacie&lt;\/a&gt; and\n            &lt;a href=\"http:\/\/www.baidu.com\/03\" class=\"sister\" id=\"link3\"&gt;Tillie&lt;\/a&gt;;\n            and 
they lived at the bottom of a well.\n        &lt;\/p&gt;\n        &lt;p class=\"story\"&gt;...&lt;\/p&gt;\n'''\nsoup = BeautifulSoup(html, 'lxml')\nprint(soup.prettify())\nprint(soup.title.string)<\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/38.png\" alt=\"\" class=\"alignnone size-full wp-image-1008\" width=\"769\" height=\"787\" \/><br \/>\n\u6211\u4eec\u53ef\u4ee5\u770b\u5230\uff0c\u4ed6\u4f1a\u81ea\u52a8\u628a\u6211\u4eec\u6ca1\u6709\u5173\u95ed\u7684\u6807\u7b7e\u5173\u95ed\u5e76\u4ee5\u6807\u51c6\u7684\u7f29\u8fdb\u5f62\u5f0f\u8f93\u51fa\uff08\u8fd9\u4e00\u6b65\u5728\u751f\u6210beautiful soup\u5bf9\u8c61\u7684\u65f6\u5019\u5c31\u5b8c\u6210\u4e86\uff09\uff0c\u7136\u540e\u8c03\u7528soup.title.string\uff08\u8282\u70b9\u9009\u62e9\u5668\uff09\uff0c\u5b9e\u9645\u4e0a\u662f\u8f93\u51faHTML\u6587\u672c\u4e2dtitle\u8282\u70b9\u7684\u6587\u672c\u5185\u5bb9\u3002<\/p>\n<h5>\u8282\u70b9\u9009\u62e9\u5668\uff1a<\/h5>\n<p>\u9009\u62e9\u5143\u7d20\uff1asoup.title.string<br \/>\n\u83b7\u53d6\u5c5e\u6027\uff1asoup.p.attrs\u00a0\u00a0\u00a0\u00a0\u00a0 soup.p.attrs[&#8216;name&#8217;]<br \/>\n\u83b7\u53d6\u5185\u5bb9\uff1asoup.p.string<br \/>\n&nbsp;<\/p>\n<h5>\u65b9\u6cd5\u9009\u62e9\u5668\uff1a<\/h5>\n<p>\uff081\uff09find_all\uff08\uff09\uff1a<br \/>\n\u67e5\u8be2\u6240\u6709\u7b26\u5408\u6761\u4ef6\u7684\u5143\u7d20\u3002\u7ed9\u5b83\u4f20\u5165\u4e00\u4e9b\u5c5e\u6027\u6216\u6587\u672c\uff0c\u5c31\u53ef\u4ee5\u5f97\u5230\u7b26\u5408\u6761\u4ef6\u7684\u5143\u7d20\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">find_all(name, attrs, recursive, text, **kwargs)<\/pre>\n<p>&nbsp;<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># beautiful soup test -&gt; find_all\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div 
class=\"panel-body\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\nsoup = BeautifulSoup(html, 'lxml')\nprint(soup.find_all(name='ul'))\nprint()\nfor ul in soup.find_all(name='ul'):\n    print(ul.find_all(name='li'))\nprint()\nprint(soup.find_all(attrs={'id': 'list-1'}))\nprint()\nprint(soup.find_all(id='list-1'))\nprint()\nprint(soup.find_all(text=re.compile('Foo')))<\/pre>\n<p>\u8fd0\u884c\u7ed3\u679c\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/39.png\" alt=\"\" class=\"alignnone size-full wp-image-1009\" width=\"780\" height=\"581\" \/><br \/>\n\uff082\uff09find\uff08\uff09\uff1a<br \/>\n\u548cfind_all()\u5dee\u4e0d\u591a\uff0c\u53ea\u4e0d\u8fc7\u524d\u8005\u8fd4\u56de\u6240\u6709\u5339\u914d\u5143\u7d20\u7ec4\u6210\u7684\u5217\u8868\uff0c\u540e\u8005\u8fd4\u56de\u5355\u4e2a\u5143\u7d20\uff0c\u4e5f\u5c31\u662f\u7b2c\u4e00\u4e2a\u5339\u914d\u7684\u5143\u7d20\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># beautiful soup test -&gt; find_all\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" 
id=\"list-2\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\nsoup = BeautifulSoup(html, 'lxml')\nprint(soup.find(name='ul'))\n<\/pre>\n<p>\u6267\u884c\u7ed3\u679c\uff0c\u53ea\u8fd4\u56de\u7b2c\u4e00\u6b21\u5339\u914d\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/40-1.png\" alt=\"\" class=\"alignnone size-full wp-image-1010\" width=\"424\" height=\"129\" \/><\/p>\n<h5>CSS\u9009\u62e9\u5668\uff1a<\/h5>\n<p>\u4f7f\u7528select\uff08\uff09\u9009\u62e9\u7ed3\u70b9\u3002<br \/>\n\u4f7f\u7528[ ]\u6216\u8005attrs[ ]\u83b7\u53d6\u5c5e\u6027\u3002<br \/>\n\u4f7f\u7528.get_text()\u6216\u8005.string\u83b7\u53d6\u6587\u672c\u3002<br \/>\n\u4ee3\u7801\uff1a<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># beautiful soup test -&gt; select\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\nsoup = BeautifulSoup(html, 'lxml')\nprint(soup.select('.panel .panel-heading'))\nprint()\nprint(soup.select('ul li'))\nprint()\nprint(soup.select('#list-2 .element'))\nprint()\nfor ul in soup.select('li'):        # attribute\n    print(ul['class'])\n    print(ul.attrs['class'])\n    print('Get Text:', ul.get_text())\n    print('String:', 
ul.string)<\/pre>\n<p>\u7ed3\u679c\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/41-1.png\" alt=\"\" class=\"alignnone size-full wp-image-1011\" width=\"1252\" height=\"648\" \/><br \/>\n&nbsp;<\/p>\n<h3>3.\u4f7f\u7528pyquery<\/h3>\n<p>\u6bd4\u8d77\u524d\u4e24\u79cd\uff0cpyquery\u53ef\u4ee5\u4eceURL\u548c\u6587\u4ef6\u8fdb\u884c\u521d\u59cb\u5316\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc = pq(html)\nprint(doc('li'))\nprint()\ndoc = pq(url='http:\/\/www.sniper97.cn')  # use url\nprint(doc('title'))\nprint()\ndoc = pq(filename='test.xml')           # use file\nprint(doc('li'))\nprint()\n<\/pre>\n<p>\u7ed3\u679c\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/42-1.png\" alt=\"\" class=\"alignnone size-full wp-image-1012\" width=\"668\" height=\"398\" \/><\/p>\n<h5>\u57fa\u672cCSS\u9009\u62e9\u5668\uff1a<\/h5>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test -&gt; css\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\" id=\"list-0\"&gt;\n        &lt;ul class=\"list\" 
id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc = pq(html)\nprint(doc('#list-0 .list li'))\n<\/pre>\n<p>\u5176\u4e2d\uff0clist-0 \u662f\u9009\u53d6id\u4e3alist-0\u7684\u8282\u70b9\uff0c\u7136\u540e\u518d\u9009\u53d6\u5176\u5185\u90e8\u7684class\u4e3alist\u7684\u8282\u70b9\u5185\u90e8\u6240\u5728\u7684li\u8282\u70b9\u3002<br \/>\n\u8f93\u51fa\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/43-1.png\" alt=\"\" class=\"alignnone size-full wp-image-1013\" width=\"415\" height=\"134\" \/><\/p>\n<h5>\u67e5\u627e\u8282\u70b9\uff1a<\/h5>\n<p>\u5b50\u5b59\u8282\u70b9\uff1afind(\uff09<br \/>\n\u7236\u8282\u70b9\uff1aparent\uff08\uff09\u3001parents\uff08\uff09<br \/>\n\u5144\u5f1f\u8282\u70b9\uff1asiblings\uff08\uff09<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test -&gt; find node\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\" id=\"list-0\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc 
= pq(html)\nitem = doc('ul')\nprint(item)\nprint()\nlis = item.find('li')   # son node\nprint(lis)\nprint()\npar = item.parents()     # parents node\nprint(par)\nprint()\npar = item.parents('.panel-body')     # parent node , only one point\nprint(par)\nprint()\nnode = doc('li')\nprint(node.siblings('.element'))       # find brother node<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/44-1.png\" alt=\"\" class=\"alignnone size-full wp-image-1014\" width=\"501\" height=\"771\" \/> <img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/45-1.png\" alt=\"\" class=\"alignnone size-full wp-image-1015\" width=\"504\" height=\"821\" \/><\/p>\n<h4>\u83b7\u53d6\u4fe1\u606f\uff1a<\/h4>\n<h5>\u83b7\u53d6\u5c5e\u6027\uff1a<\/h5>\n<p>\u5bf9\u4e8e\u591a\u7ec4\u6570\u636e\u7684\u9700\u8981\u4f7f\u7528\u8fed\u4ee3\u5668\u624d\u80fd\u6b63\u786e\u8f93\u51fa\uff0c.text\uff08\uff09\u4ec5\u80fd\u8f93\u51fa\u6587\u672c\uff0c\u800c.html\uff08\uff09\u53ef\u4ee5\u8f93\u51fahtml\u4ee3\u7801\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test -&gt; find node\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\" id=\"list-0\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element1\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element2\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element3\"&gt;&lt;a href=\"www.123.com\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element4\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element5\"&gt;&lt;a 
href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc = pq(html)\nitem = doc('ul')\nprint(item.attr('class'))\nprint(item.attr.id)\nitem = doc('li')\nprint(item.attr('class'))       # can not output attr\nfor i in item.items():\n    print(i.attr('class'))      # can output attr\n    print(i.text())              # can output text\n    print(i.html())              # can output html<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/46.png\" alt=\"\" class=\"alignnone size-full wp-image-1017\" width=\"366\" height=\"468\" \/><\/p>\n<h5>\u8282\u70b9\u64cd\u4f5c\uff1a<\/h5>\n<p>\u63d0\u4f9b\u65b9\u6cd5\u8fdb\u884c\u52a8\u6001\u64cd\u4f5c\uff0c\u5141\u8bb8\u4e3a\u67d0\u4e2a\u8282\u70b9\u6dfb\u52a0\u4e00\u4e2aclass\uff0c\u79fb\u9664\u67d0\u4e2a\u70b9\u7b49\u3002<br \/>\naddClass\u548cremoveClass<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test -&gt; node handle\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\" id=\"list-0\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element1\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element2\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element3\"&gt;&lt;a href=\"www.123.com\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element4\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element5\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc = pq(html)\nul = 
doc('ul')\nprint(ul)\nprint()\nul.add_class('action')\nprint(ul)\nprint()\nul.remove_class('action')\nprint(ul)\nprint()\n<\/pre>\n<p>.\u8f93\u51fa\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/47.png\" alt=\"\" class=\"alignnone size-full wp-image-1019\" width=\"653\" height=\"754\" \/><br \/>\nattr\u3001text\u3001html<br \/>\n\u9664\u4e86\u53ef\u4ee5\u5bf9class\u5c5e\u6027\u8fdb\u884c\u64cd\u4f5c\u4e4b\u5916\uff0c\u8fd8\u53ef\u4ee5\u4f7f\u7528attr\u5bf9\u5c5e\u6027\u8fdb\u884c\u64cd\u4f5c\uff0c\u4f7f\u7528text\u548chtml\u5bf9\u6587\u672c\u4ee5\u53cahtml\u90e8\u5206\u8fdb\u884c\u64cd\u4f5c\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test -&gt; attribute handle\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\" id=\"list-0\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element1\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element2\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element3\"&gt;&lt;a href=\"www.123.com\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element4\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element5\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc = pq(html)\nul = doc('ul')\nprint(ul)\nprint()\nul.attr('name', 'sniper')\nprint(ul)\nprint()\nul.text('change item')\nprint(ul)\nprint()\nul.html('&lt;a 
href=\"www.123.com\"&gt;')\nprint(ul)\nprint()<\/pre>\n<p>\u8f93\u51fa\uff0c\u53ef\u4ee5\u770b\u5230name\u5c5e\u6027\u88ab\u6dfb\u52a0\uff0ctext\u88ab\u66ff\u6362\uff0chtml\u88ab\u66ff\u6362\u3002\uff08\u540e\u9762\u7684\u5b50\u8282\u70b9\u4e5f\u88ab\u66ff\u6362\u4e86\uff09\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/48.png\" alt=\"\" class=\"alignnone size-full wp-image-1021\" width=\"1282\" height=\"650\" \/><br \/>\nremove\uff08\uff09\uff1a<br \/>\n\u53ef\u4ee5\u79fb\u9664\u4e00\u4e2a\u8282\u70b9\uff1a.find(&#8216; xxx&#8217;).remove(\u00a0 )<br \/>\n\u4f2a\u7c7b\u9009\u62e9\u5668\uff1a<br \/>\n\u53ef\u4ee5\u9009\u62e9\u7b2c\u4e00\u4e2a\u8282\u70b9\u3001\u6700\u540e\u4e00\u4e2a\u8282\u70b9\u3001\u5947\u5076\u6570\u7ed3\u70b9\u3001\u5305\u542b\u67d0\u4e00\u6587\u672c\u7684\u8282\u70b9\u7b49\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># pyquery test -&gt; attribute handle\nhtml = '''\n&lt;div class=\"panel\"&gt;\n    &lt;div class=\"panel-heading\"&gt;\n        &lt;h4&gt;Hello&lt;\/h4&gt;\n    &lt;\/div&gt;\n    &lt;div class=\"panel-body\" id=\"list-0\"&gt;\n        &lt;ul class=\"list\" id=\"list-1\" name=\"elements\"&gt;\n            &lt;li class=\"element1\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element2\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n            &lt;li class=\"element3\"&gt;&lt;a href=\"www.123.com\"&gt;Jay&lt;\/li&gt;\n        &lt;\/ul&gt;\n        &lt;ul class=\"list list-small\" id=\"list-2\"&gt;\n            &lt;li class=\"element4\"&gt;&lt;a href=\"www.123.com\"&gt;Foo&lt;\/li&gt;\n            &lt;li class=\"element5\"&gt;&lt;a href=\"www.123.com\"&gt;Bar&lt;\/li&gt;\n        &lt;\/ul&gt;\n    &lt;\/div&gt;\n&lt;\/div&gt;\n'''\ndoc = pq(html)\nli = doc('li:first-child')\nprint(li)\nprint()\nli = doc('li:last-child')\nprint(li)\nprint()\nli = doc('li:nth-child(2)')\nprint(li)\nprint()\nli = 
doc('li:gt(2)')\nprint(li)\nprint()\nli = doc('li:nth-child(2n)')\nprint(li)\nprint()\nli = doc('li:contains(Bar)')\nprint(li)\nprint()\n<\/pre>\n<p>\u8f93\u51fa\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/49.png\" alt=\"\" class=\"alignnone size-full wp-image-1022\" width=\"679\" height=\"546\" \/><br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728\u8fd9\u91cc\u4e0b\u8f7d\u3002 1.\u4f7f\u7528XPath \u5168\u79f0 XML Path Language\uff0c\u5373XML\u8bed\u8a00 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[12],"tags":[],"views":2235,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1000"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=1000"}],"version-history":[{"count":0,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1000\/revisions"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=1000"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/categories?post=1000"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-jso
n\/wp\/v2\/tags?post=1000"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}