{"id":1087,"date":"2018-07-29T12:53:02","date_gmt":"2018-07-29T04:53:02","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=1087"},"modified":"2018-07-29T12:53:02","modified_gmt":"2018-07-29T04:53:02","slug":"%e7%ac%ac%e5%8d%81%e4%ba%8c%e8%8a%82%ef%bc%9a%e4%bb%a3%e7%90%86%e7%9a%84%e4%bd%bf%e7%94%a8","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/carwler\/1087\/","title":{"rendered":"\u7b2c\u5341\u4e09\u8282\uff1a\u4ee3\u7406\u7684\u4f7f\u7528"},"content":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728<a href=\"https:\/\/github.com\/Sniper970119\/Spider\/tree\/master\/20180721\" target=\"_blank\" rel=\"noopener\" data-slimstat=\"5\">\u8fd9\u91cc<\/a>\u4e0b\u8f7d\u3002<\/p>\n<h3>1.\u8bbe\u7f6e\u4ee3\u7406<\/h3>\n<h4>urllib \u4e2d\u4f7f\u7528\u4ee3\u7406\uff1a<\/h4>\n<p>\u4ee3\u7801\uff1a\u5176\u4e2dproxy\u662f\u4ee3\u7406ip\u548c\u7aef\u53e3\u53f7\uff0c\u9a8c\u8bc1\u4ee3\u7406\u53ea\u9700\u8981\u5728url\u4e2d\u52a0\u5165\u9a8c\u8bc1\u53c2\u6570\u5373\u53ef\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># -*- coding:utf-8 -*-\nfrom urllib.error import URLError\nfrom urllib.request import ProxyHandler, build_opener\n# urllib agent\nproxy = '127.0.0.1:9743'\nproxy_handler = ProxyHandler({\n    'http': 'http:\/\/' + proxy,\n    'https': 'https:\/\/' + proxy\n})\nopener = build_opener(proxy_handler)\ntry:\n    responce = opener.open('http:\/\/httpbin.org\/get')\n    print(responce.read().decode('utf-8'))\nexcept URLError as e:\n    print(e.reason)\n<\/pre>\n<h4>requests \u4e2d\u4f7f\u7528\u4ee3\u7406\uff1a<\/h4>\n<p>\u5bf9\u4e8erequests\u6765\u8bf4\uff0c\u4ee3\u7406\u66f4\u52a0\u7b80\u5355\uff0c\u53ea\u9700\u8981\u8bbe\u7f6eproxies\u53c2\u6570\u5373\u53ef\uff0c\u9a8c\u8bc1\u4ee3\u7406\u4e5f\u548curllib\u4e00\u6837\uff0c\u52a0\u5165url\u53c2\u6570\u5373\u53ef\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># requests agent\nproxy = '127.0.0.1:9743'\nproxies = {\n    
'http': 'http:\/\/' + proxy,\n    'https': 'https:\/\/' + proxy\n}\ntry:\n    responce = requests.get('http:\/\/http:httpbin.org\/get', proxies = proxies)\n    print(responce.text)\nexcept requests.exceptions.ConnectionError as e:\n    print('Error', e.args)\n<\/pre>\n<p>&nbsp;<\/p>\n<h5>Selenium \u4e2d\u4f7f\u7528\u4ee3\u7406\uff1a<\/h5>\n<p>\u5728chrome\u4e2d\uff0c\u7528Selenium\u8bbe\u7f6e\u4ee3\u7406\u7684\u65b9\u6cd5\u4e5f\u975e\u5e38\u7b80\u5355\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># selenium agent\nproxy = '127.0.0.1:9743'\nchrome_options = webdriver.ChromeOptions()\nchrome_options.add_argument('--proxy-server=http:\/\/' + proxy)\nbrowser = webdriver.Chrome(chrome_options=chrome_options)\nbrowser.get('http:\/\/httpbin.org\/get')<\/pre>\n<p>\u5982\u679c\u662f\u9a8c\u8bc1\u4ee3\u7406\uff0c\u8fd9\u9700\u8981\u5728\u672c\u5730\u521b\u5efa\u4e00\u4e2amanifest.json \u914d\u7f6e\u6587\u4ef6\u548c background.js \u811a\u672c\u6765\u8bbe\u7f6e\u8ba4\u8bc1\u4ee3\u7406\u3002\u8fd0\u884c\u4ee3\u7801\u4e4b\u540e\u672c\u5730\u4f1a\u751f\u6210\u4e00\u4e2a proxy_auth_plugin.zip \u6587\u4ef6\u6765\u4fdd\u5b58\u5f53\u524d\u914d\u7f6e\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\"># selenium agent with auth\nip = '127.0.0.1'\nport = 9743\nusername = 'username'\npassword = 'password'\nmanifest_json = \"\"\"\n{\n    \"version\": \"1.0.0\",\n    \"manifest_version\": 2,\n    \"name\": \"Chrome Proxy\",\n    \"permissions\": [\n        \"proxy\",\n        \"tabs\",\n        \"unlimitedStorage\",\n        \"storage\",\n        \"&lt;all_urls&gt;\",\n        \"webRequest\",\n        \"webRequestBlocking\"\n    ],\n    \"background\": {\n        \"scripts\": [\"background.js\"]\n    }\n}\n\"\"\"\nbackground_js = \"\"\"\nvar config = {\n        mode: \"fixed_servers\",\n        rules: {\n          singleProxy: {\n            scheme: \"http\",\n            host: \"%(ip)s\",\n            port: %(port)s\n          }\n  
      }\n      }\nchrome.proxy.settings.set({value: config, scope: \"regular\"}, function() {});\nfunction callbackFn(details) {\n    return {\n        authCredentials: {\n            username: \"%(username)s\",\n            password: \"%(password)s\"\n        }\n    }\n}\nchrome.webRequest.onAuthRequired.addListener(\n            callbackFn,\n            {urls: [\"&lt;all_urls&gt;\"]},\n            ['blocking']\n)\n\"\"\" % {'ip': ip, 'port': port, 'username': username, 'password': password}\nplugin_file = 'proxy_auth_plugin.zip'\nwith zipfile.ZipFile(plugin_file, 'w') as zp:\n    zp.writestr(\"manifest.json\", manifest_json)\n    zp.writestr(\"background.js\", background_js)\nchrome_options = Options()\nchrome_options.add_argument(\"--start-maximized\")\nchrome_options.add_extension(plugin_file)\nbrowser = webdriver.Chrome(chrome_options=chrome_options)\nbrowser.get('http:\/\/httpbin.org\/get')<\/pre>\n<p>&nbsp;<\/p>\n<h3>2.\u4ee3\u7406\u6c60\u7ef4\u62a4<\/h3>\n<p>\u6211\u4eec\u5229\u7528\u4ee3\u7406\u53ef\u4ee5\u89e3\u51b3\u76ee\u6807\u7f51\u7ad9\u5c01ip\u7684\u95ee\u9898\u3002\u4f46\u662f\u65e0\u8bba\u662f\u6536\u8d39\u4ee3\u7406\u8fd8\u662f\u514d\u8d39\u4ee3\u7406\u90fd\u5b58\u5728\u4ee3\u7406\u4e0d\u53ef\u7528\u7684\u60c5\u51b5\uff0c\u6240\u4ee5\u6211\u4eec\u9700\u8981\u5efa\u7acb\u4e00\u4e2a\u9ad8\u6548\u6613\u7528\u7684\u4ee3\u7406\u6c60\u3002<br \/>\n\u5b58\u50a8\u6a21\u5757\uff1a\u8d1f\u8d23\u5b58\u50a8\u6293\u53d6\u4e0b\u6765\u7684\u4ee3\u7406<br \/>\n\u83b7\u53d6\u6a21\u5757\uff1a\u9700\u8981\u5b9a\u65f6\u5728\u5404\u5927\u4ee3\u7406\u7f51\u7ad9\u6293\u53d6\u4ee3\u7406\u3002<br \/>\n\u68c0\u6d4b\u6a21\u5757\uff1a\u9700\u8981\u5b9a\u65f6\u68c0\u6d4b\u6570\u636e\u5e93\u4e2d\u7684\u4ee3\u7406\u3002<br 
\/>\n\u63a5\u53e3\u6a21\u5757\uff1a\u9700\u8981\u7528API\u6765\u63d0\u4f9b\u5bf9\u5916\u670d\u52a1\u7684\u63a5\u53e3\u3002<\/p>\n<h4>\u5b58\u50a8\u6a21\u5757\uff1a<\/h4>\n<p>\u91c7\u7528\u5206\u6570\u5236\uff0c\u5206\u6570100\u4e3a\u53ef\u7528\uff0c\u68c0\u6d4b\u5668\u5b9a\u65f6\u5faa\u73af\u68c0\u6d4b\u6bcf\u4e2a\u4ee3\u7406\u7684\u4f7f\u7528\u60c5\u51b5\uff0c\u68c0\u6d4b\u5230\u4e0d\u53ef\u7528\u5206\u6570\u51cf1\uff0c\u5206\u6570\u51cf\u81f30\u540e\u4ee3\u7406\u88ab\u79fb\u9664\u3002<br \/>\n\u65b0\u83b7\u53d6\u7684\u4ee3\u7406\u7684\u5206\u6570\u4e3a10\uff0c\u5982\u679c\u6d4b\u8bd5\u53ef\u884c\uff0c\u5219\u5206\u6570\u7acb\u5373\u7f6e\u4e3a100\uff0c\u4e0d\u53ef\u884c\u5219\u5206\u6570\u51cf1\uff0c\u5206\u6570\u51cf\u81f30\u540e\u4ee3\u7406\u79fb\u9664\u3002<br \/>\n__init__\uff08\uff09\uff1a\u5bf9Redis\u8fdb\u884c\u521d\u59cb\u5316\uff0c\u5efa\u7acbRedis\u8fde\u63a5\u3002<br \/>\nadd\uff08\uff09\uff1a\u5411\u6570\u636e\u5e93\u4e2d\u6dfb\u52a0\u4ee3\u7406\u8bbe\u7f6e\u5206\u6570\u3002<br \/>\nrandom\uff08\uff09\uff1a\u968f\u673a\u83b7\u53d6\u4ee3\u7406\u7684\u65b9\u6cd5\uff0c\u9996\u5148\u5c1d\u8bd5\u83b7\u53d6100\u5206\u7684\u4ee3\u7406\uff0c\u5982\u679c\u6ca1\u6709\u5219\u6309\u7167\u6392\u540d\u83b7\u53d6\u3002<br \/>\ndecrease\uff08\uff09\uff1a\u5728\u4ee3\u7406\u68c0\u6d4b\u65e0\u6548\u65f6\u5206\u6570-1\uff0c\u5206\u6570\u8fbe\u5230\u6700\u4f4e\u503c\uff0c\u79fb\u9664\u4ee3\u7406\u3002<br \/>\nexists\uff08\uff09\uff1a\u5224\u65ad\u4ee3\u7406\u662f\u5426\u5b58\u5728\u3002<br \/>\nmax\uff08\uff09\uff1a\u5c06\u4ee3\u7406\u7684\u5206\u6570\u8bbe\u7f6e\u4e3a\u6700\u5927\u3002<br \/>\ncount\uff08\uff09\uff1a\u8fd4\u56de\u96c6\u5408\u4e2a\u6570\u3002<br \/>\nall\uff08\uff09\uff1a\u8fd4\u56de\u6240\u6709\u4ee3\u7406\u5217\u8868\uff0c\u4ee5\u4f9b\u68c0\u67e5\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">class RedisClient(object):\n    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):\n        print('''\n        \u521d\u59cb\u5316 
\u5730\u5740\u3001\u7aef\u53e3\u3001\u5bc6\u7801\n        ''')\n        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)\n    def add(self, proxy, score=INITIAL_SCORE):\n        print('''\n        \u6dfb\u52a0\u4ee3\u7406\uff0c\u8bbe\u7f6e\u6700\u9ad8\u5206\u6570''')\n        if not self.db.zscore(REDIS_KEY, proxy):\n            return self.db.zadd(REDIS_KEY, proxy)\n    def random(self):\n        print('''\n        \u968f\u673a\u83b7\u53d6\u6709\u6548\u4ee3\u7406''')\n        result = self.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)\n        if len(result):\n            return choice(result)\n        else:\n            result = self.db.zrevrange(REDIS_KEY, 0, 100)\n            if len(result):\n                return choice(result)\n            else:\n                print('\u4ee3\u7406\u6c60\u4e3a\u7a7a')\n    def decrease(self, proxy):\n        print('''\n        \u4ee3\u7406\u503c\u51cf\u4e00\u5206''')\n        score = self.db.zscore(REDIS_KEY, proxy)\n        if score and score &gt; MIN_SCORE:\n            print('\u4ee3\u7406', proxy, '\u5f53\u524d\u5206\u6570', score, '\u51cf1')\n            return self.db.zincrby(REDIS_KEY, proxy, -1)\n        else:\n            print('\u4ee3\u7406', proxy, '\u5f53\u524d\u5206\u6570', score, '\u79fb\u9664')\n            return self.db.zrem(REDIS_KEY, proxy)\n    def exists(self, proxy):\n        print('''\n        \u5224\u65ad\u662f\u5426\u5b58\u5728''')\n        return not self.db.zscore(REDIS_KEY, proxy) == None\n    def max(self, proxy):\n        print('''\n        \u5c06\u4ee3\u7406\u8bbe\u7f6e\u4e3a\u6700\u5927\u503c''')\n        print('\u4ee3\u7406', proxy, '\u53ef\u7528\uff0c\u8bbe\u7f6e\u4e3a', MAX_SCORE)\n        return self.db.zadd(REDIS_KEY, MAX_SCORE, proxy)\n    def count(self):\n        print('''\n        \u83b7\u53d6\u6570\u91cf''')\n        return self.db.zcard(REDIS_KEY)\n    def all(self):\n        print('''\n        \u83b7\u53d6\u5168\u90e8\u4ee3\u7406''')\n  
      return self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)\n<\/pre>\n<p>&nbsp;<\/p>\n<h4>\u83b7\u53d6\u6a21\u5757\uff1a<\/h4>\n<p>\u9996\u5148\u8bbe\u7f6e\u5143\u7c7b\uff08ProxyMetaclass\uff09\uff0c\u8be5\u7c7b\u4e2d\u5b9e\u73b0\u4e86__new__()\u65b9\u6cd5\uff0c\u8fd9\u4e2a\u65b9\u6cd5\u56fa\u6709\u4e86\u51e0\u4e2a\u53c2\u6570\uff0c\u5176\u4e2d\u7b2c\u56db\u4e2aattrs\u4e2d\u5305\u542b\u4e86\u7c7b\u7684\u4e00\u4e9b\u5c5e\u6027\uff0c\u6211\u4eec\u53ef\u4ee5\u904d\u5386attrs\u6765\u83b7\u53d6\u6240\u6709\u7684\u7c7b\u4fe1\u606f\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">class ProxyMetaclass(type):\n    def __new__(cls, name, bases, attrs):\n        count = 0\n        attrs['__CrawlFunc__'] = []\n        for k, v in attrs.items():\n            if 'crawl_' in k:\n                attrs['__CrawlFunc__'].append(k)\n                count += 1\n        attrs['__CrawlFuncCount__'] = count\n        return type.__new__(cls, name, bases, attrs)\nclass Crawler(object, metaclass=ProxyMetaclass):\n    def get_proxies(self, callback):\n        proxies = []\n        for proxy in eval(\"self.{}()\".format(callback)):\n            print('\u6210\u529f\u83b7\u53d6\u5230\u4ee3\u7406', proxy)\n            proxies.append(proxy)\n        return proxies\n    def crawl_daili66(self, page_count=4):\n        \"\"\"\n        \u83b7\u53d6\u4ee3\u740666\n        :param page_count: \u9875\u7801\n        :return: \u4ee3\u7406\n        \"\"\"\n        start_url = 'http:\/\/www.66ip.cn\/{}.html'\n        urls = [start_url.format(page) for page in range(1, page_count + 1)]\n        for url in urls:\n            print('Crawling', url)\n            html = get_page(url)\n            if html:\n                doc = pq(html)\n                trs = doc('.containerbox table tr:gt(0)').items()\n                for tr in trs:\n                    ip = tr.find('td:nth-child(1)').text()\n                    port = tr.find('td:nth-child(2)').text()\n                    
yield ':'.join([ip, port])\n    def crawl_proxy360(self):\n        \"\"\"\n        \u83b7\u53d6Proxy360\n        :return: \u4ee3\u7406\n        \"\"\"\n        start_url = 'http:\/\/www.proxy360.cn\/Region\/China'\n        print('Crawling', start_url)\n        html = get_page(start_url)\n        if html:\n            doc = pq(html)\n            lines = doc('div[name=\"list_proxy_ip\"]').items()\n            for line in lines:\n                ip = line.find('.tbBottomLine:nth-child(1)').text()\n                port = line.find('.tbBottomLine:nth-child(2)').text()\n                yield ':'.join([ip, port])\n    def crawl_goubanjia(self):\n        \"\"\"\n        \u83b7\u53d6Goubanjia\n        :return: \u4ee3\u7406\n        \"\"\"\n        start_url = 'http:\/\/www.goubanjia.com\/free\/gngn\/index.shtml'\n        html = get_page(start_url)\n        if html:\n            doc = pq(html)\n            tds = doc('td.ip').items()\n            for td in tds:\n                td.find('p').remove()\n                yield td.text().replace(' ', '')\n    def crawl_ip181(self):\n        start_url = 'http:\/\/www.ip181.com\/'\n        html = get_page(start_url)\n        ip_address = re.compile('&lt;tr.*?&gt;\\s*&lt;td&gt;(.*?)&lt;\/td&gt;\\s*&lt;td&gt;(.*?)&lt;\/td&gt;')\n        # \\s* \u5339\u914d\u7a7a\u683c\uff0c\u8d77\u5230\u6362\u884c\u4f5c\u7528\n        re_ip_address = ip_address.findall(html)\n        for address, port in re_ip_address:\n            result = address + ':' + port\n            yield result.replace(' ', '')\n    def crawl_ip3366(self):\n        for page in range(1, 4):\n            start_url = 'http:\/\/www.ip3366.net\/free\/?stype=1&amp;page={}'.format(page)\n            html = get_page(start_url)\n            ip_address = re.compile('&lt;tr&gt;\\s*&lt;td&gt;(.*?)&lt;\/td&gt;\\s*&lt;td&gt;(.*?)&lt;\/td&gt;')\n            # \\s * \u5339\u914d\u7a7a\u683c\uff0c\u8d77\u5230\u6362\u884c\u4f5c\u7528\n            re_ip_address = ip_address.findall(html)\n       
     for address, port in re_ip_address:\n                result = address + ':' + port\n                yield result.replace(' ', '')\n    def crawl_kxdaili(self):\n        for i in range(1, 11):\n            start_url = 'http:\/\/www.kxdaili.com\/ipList\/{}.html#ip'.format(i)\n            html = get_page(start_url)\n            ip_address = re.compile('&lt;tr.*?&gt;\\s*&lt;td&gt;(.*?)&lt;\/td&gt;\\s*&lt;td&gt;(.*?)&lt;\/td&gt;')\n            # \\s* \u5339\u914d\u7a7a\u683c\uff0c\u8d77\u5230\u6362\u884c\u4f5c\u7528\n            re_ip_address = ip_address.findall(html)\n            for address, port in re_ip_address:\n                result = address + ':' + port\n                yield result.replace(' ', '')\n    def crawl_premproxy(self):\n        for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:\n            start_url = 'https:\/\/premproxy.com\/proxy-by-country\/{}.htm'.format(i)\n            html = get_page(start_url)\n            if html:\n                ip_address = re.compile('&lt;td data-label=\"IP:port \"&gt;(.*?)&lt;\/td&gt;')\n                re_ip_address = ip_address.findall(html)\n                for address_port in re_ip_address:\n                    yield address_port.replace(' ', '')\n    def crawl_xroxy(self):\n        for i in ['CN', 'TW']:\n            start_url = 'http:\/\/www.xroxy.com\/proxylist.php?country={}'.format(i)\n            html = get_page(start_url)\n            if html:\n                ip_address1 = re.compile(\"title='View this Proxy details'&gt;\\s*(.*).*\")\n                re_ip_address1 = ip_address1.findall(html)\n                ip_address2 = re.compile(\"title='Select proxies with port number .*'&gt;(.*)&lt;\/a&gt;\")\n                re_ip_address2 = ip_address2.findall(html)\n                for address, port in zip(re_ip_address1, re_ip_address2):\n                    address_port = address + ':' + port\n                    yield address_port.replace(' ', '')\n    def crawl_kuaidaili(self):\n    
    for i in range(1, 4):\n            start_url = 'http:\/\/www.kuaidaili.com\/free\/inha\/{}\/'.format(i)\n            html = get_page(start_url)\n            if html:\n                ip_address = re.compile('&lt;td data-title=\"IP\"&gt;(.*?)&lt;\/td&gt;')\n                re_ip_address = ip_address.findall(html)\n                port = re.compile('&lt;td data-title=\"PORT\"&gt;(.*?)&lt;\/td&gt;')\n                re_port = port.findall(html)\n                for address, port in zip(re_ip_address, re_port):\n                    address_port = address + ':' + port\n                    yield address_port.replace(' ', '')\n    def crawl_xicidaili(self):\n        for i in range(1, 3):\n            start_url = 'http:\/\/www.xicidaili.com\/nn\/{}'.format(i)\n            headers = {\n                'Accept': 'text\/html,application\/xhtml+xml,application\/xml;q=0.9,image\/webp,image\/apng,*\/*;q=0.8',\n                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',\n                'Host': 'www.xicidaili.com',\n                'Referer': 'http:\/\/www.xicidaili.com\/nn\/3',\n                'Upgrade-Insecure-Requests': '1',\n            }\n            html = get_page(start_url, options=headers)\n            if html:\n                find_trs = re.compile('&lt;tr class.*?&gt;(.*?)&lt;\/tr&gt;', re.S)\n                trs = find_trs.findall(html)\n                for tr in trs:\n                    find_ip = re.compile('&lt;td&gt;(\\d+\\.\\d+\\.\\d+\\.\\d+)&lt;\/td&gt;')\n                    re_ip_address = find_ip.findall(tr)\n                    find_port = re.compile('&lt;td&gt;(\\d+)&lt;\/td&gt;')\n                    re_port = find_port.findall(tr)\n                    for address, port in zip(re_ip_address, re_port):\n                        address_port = 
address + ':' + port\n                        yield address_port.replace(' ', '')\n    def crawl_ip3366(self):\n        for i in range(1, 4):\n            start_url = 'http:\/\/www.ip3366.net\/?stype=1&amp;page={}'.format(i)\n            html = get_page(start_url)\n            if html:\n                find_tr = re.compile('&lt;tr&gt;(.*?)&lt;\/tr&gt;', re.S)\n                trs = find_tr.findall(html)\n                for s in range(1, len(trs)):\n                    find_ip = re.compile('&lt;td&gt;(\\d+\\.\\d+\\.\\d+\\.\\d+)&lt;\/td&gt;')\n                    re_ip_address = find_ip.findall(trs[s])\n                    find_port = re.compile('&lt;td&gt;(\\d+)&lt;\/td&gt;')\n                    re_port = find_port.findall(trs[s])\n                    for address, port in zip(re_ip_address, re_port):\n                        address_port = address + ':' + port\n                        yield address_port.replace(' ', '')\n    def crawl_iphai(self):\n        start_url = 'http:\/\/www.iphai.com\/'\n        html = get_page(start_url)\n        if html:\n            find_tr = re.compile('&lt;tr&gt;(.*?)&lt;\/tr&gt;', re.S)\n            trs = find_tr.findall(html)\n            for s in range(1, len(trs)):\n                find_ip = re.compile('&lt;td&gt;\\s+(\\d+\\.\\d+\\.\\d+\\.\\d+)\\s+&lt;\/td&gt;', re.S)\n                re_ip_address = find_ip.findall(trs[s])\n                find_port = re.compile('&lt;td&gt;\\s+(\\d+)\\s+&lt;\/td&gt;', re.S)\n                re_port = find_port.findall(trs[s])\n                for address, port in zip(re_ip_address, re_port):\n                    address_port = address + ':' + port\n                    yield address_port.replace(' ', '')\n    def crawl_89ip(self):\n        start_url = 'http:\/\/www.89ip.cn\/apijk\/?&amp;tqsl=1000&amp;sxa=&amp;sxb=&amp;tta=&amp;ports=&amp;ktip=&amp;cf=1'\n        html = get_page(start_url)\n        if html:\n            find_ips = re.compile('(\\d+\\.\\d+\\.\\d+\\.\\d+:\\d+)', re.S)\n            
ip_ports = find_ips.findall(html)\n            for address_port in ip_ports:\n                yield address_port\n    def crawl_data5u(self):\n        start_url = 'http:\/\/www.data5u.com\/free\/gngn\/index.shtml'\n        headers = {\n            'Accept': 'text\/html,application\/xhtml+xml,application\/xml;q=0.9,image\/webp,image\/apng,*\/*;q=0.8',\n            'Accept-Encoding': 'gzip, deflate',\n            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',\n            'Cache-Control': 'max-age=0',\n            'Connection': 'keep-alive',\n            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',\n            'Host': 'www.data5u.com',\n            'Referer': 'http:\/\/www.data5u.com\/free\/index.shtml',\n            'Upgrade-Insecure-Requests': '1',\n            'User-Agent': 'Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/63.0.3239.108 Safari\/537.36',\n        }\n        html = get_page(start_url, options=headers)\n        if html:\n            ip_address = re.compile('&lt;span&gt;&lt;li&gt;(\\d+\\.\\d+\\.\\d+\\.\\d+)&lt;\/li&gt;.*?&lt;li class=\\\"port.*?&gt;(\\d+)&lt;\/li&gt;', re.S)\n            re_ip_address = ip_address.findall(html)\n            for address, port in re_ip_address:\n                result = address + ':' + port\n                yield result.replace(' ', '')\n<\/pre>\n<p>&nbsp;<\/p>\n<h4>\u68c0\u6d4b\u6a21\u5757\uff1a<\/h4>\n<p>\u6211\u4eec\u4f7f\u7528\u5f02\u6b65\u8bf7\u6c42\u5e93aiohttp\u6765\u8fdb\u884c\u68c0\u6d4b\u3002<br 
\/>\nrequests\u4f5c\u4e3a\u4e00\u4e2a\u540c\u6b65\u8bf7\u6c42\u5e93\uff0c\u6211\u4eec\u53d1\u51fa\u4e00\u4e2a\u8bf7\u6c42\u4e4b\u540e\uff0c\u7a0b\u5e8f\u9700\u8981\u7b49\u5f85\u7f51\u9875\u52a0\u8f7d\u5b8c\u6210\u4e4b\u540e\u624d\u80fd\u7ee7\u7eed\u6267\u884c\uff0c\u5982\u679c\u670d\u52a1\u5668\u54cd\u5e94\u7f13\u6162\uff0c\u90a3\u4f1a\u5341\u5206\u5f71\u54cd\u6548\u7387\uff0c\u6240\u4ee5\u6211\u4eec\u4f7f\u7528\u5f02\u6b65\u8bf7\u6c42\u5e93\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">class Tester(object):\n    def __init__(self):\n        self.redis = RedisClient()\n    async def test_single_proxy(self, proxy):\n        \"\"\"\n        \u6d4b\u8bd5\u5355\u4e2a\u4ee3\u7406\n        :param proxy:\n        :return:\n        \"\"\"\n        conn = aiohttp.TCPConnector(verify_ssl=False)\n        async with aiohttp.ClientSession(connector=conn) as session:\n            try:\n                if isinstance(proxy, bytes):\n                    proxy = proxy.decode('utf-8')\n                real_proxy = 'http:\/\/' + proxy\n                print('\u6b63\u5728\u6d4b\u8bd5', proxy)\n                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response:\n                    if response.status in VALID_STATUS_CODES:\n                        self.redis.max(proxy)\n                        print('\u4ee3\u7406\u53ef\u7528', proxy)\n                    else:\n                        self.redis.decrease(proxy)\n                        print('\u8bf7\u6c42\u54cd\u5e94\u7801\u4e0d\u5408\u6cd5 ', response.status, 'IP', proxy)\n            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):\n                self.redis.decrease(proxy)\n                print('\u4ee3\u7406\u8bf7\u6c42\u5931\u8d25', proxy)\n    def run(self):\n        \"\"\"\n        \u6d4b\u8bd5\u4e3b\u51fd\u6570\n        :return:\n        \"\"\"\n       
 print('\u6d4b\u8bd5\u5668\u5f00\u59cb\u8fd0\u884c')\n        try:\n            count = self.redis.count()\n            print('\u5f53\u524d\u5269\u4f59', count, '\u4e2a\u4ee3\u7406')\n            for i in range(0, count, BATCH_TEST_SIZE):\n                start = i\n                stop = min(i + BATCH_TEST_SIZE, count)\n                print('\u6b63\u5728\u6d4b\u8bd5\u7b2c', start + 1, '-', stop, '\u4e2a\u4ee3\u7406')\n                test_proxies = self.redis.batch(start, stop)\n                loop = asyncio.get_event_loop()\n                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]\n                loop.run_until_complete(asyncio.wait(tasks))\n                sys.stdout.flush()\n                time.sleep(5)\n        except Exception as e:\n            print('\u6d4b\u8bd5\u5668\u53d1\u751f\u9519\u8bef', e.args)\n<\/pre>\n<p>&nbsp;<\/p>\n<h4>\u63a5\u53e3\u6a21\u5757\uff1a<\/h4>\n<p>\u6211\u4eec\u4f7f\u7528\u4e00\u4e2a\u8f7b\u91cf\u7ea7\u7684Flask\u5e93\u6765\u5b9e\u73b0\u8fd9\u4e2a\u63a5\u53e3\u6a21\u5757\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">def get_conn():\n    if not hasattr(g, 'redis'):\n        g.redis = RedisClient()\n    return g.redis\n@app.route('\/')\ndef index():\n    return '&lt;h2&gt;Welcome to Proxy Pool System&lt;\/h2&gt;'\n@app.route('\/random')\ndef get_proxy():\n    \"\"\"\n    Get a proxy\n    :return: \u968f\u673a\u4ee3\u7406\n    \"\"\"\n    conn = get_conn()\n    return conn.random()\n@app.route('\/count')\ndef get_counts():\n    \"\"\"\n    Get the count of proxies\n    :return: \u4ee3\u7406\u6c60\u603b\u91cf\n    \"\"\"\n    conn = get_conn()\n    return str(conn.count())\nif __name__ == '__main__':\n    app.run()\n<\/pre>\n<p>\u6211\u4eec\u58f0\u660e\u4e86Flask\u5bf9\u8c61\uff0c\u5b9a\u4e49\u4e863\u4e2a\u63a5\u53e3\uff0c\u5206\u522b\u662f\u9996\u9875\u3001\u968f\u673a\u4ee3\u7406\u9875\u3001\u83b7\u53d6\u6570\u91cf\u9875\u3002<br 
\/>\n&nbsp;<\/p>\n<h4>\u8c03\u5ea6\u6a21\u5757\uff1a<\/h4>\n<p>\u8c03\u7528\u4ee5\u4e0a3\u4e2a\u6a21\u5757\uff0c\u5c06\u8fd9\u4e09\u4e2a\u6a21\u5757\u901a\u8fc7\u591a\u8fdb\u7a0b\u7684\u5f62\u5f0f\u8fd0\u884c\u8d77\u6765\u3002<\/p>\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\">class Scheduler():\n    def schedule_tester(self, cycle=TESTER_CYCLE):\n        \"\"\"\n        \u5b9a\u65f6\u6d4b\u8bd5\u4ee3\u7406\n        \"\"\"\n        tester = Tester()\n        while True:\n            print('\u6d4b\u8bd5\u5668\u5f00\u59cb\u8fd0\u884c')\n            tester.run()\n            time.sleep(cycle)\n    def schedule_getter(self, cycle=GETTER_CYCLE):\n        \"\"\"\n        \u5b9a\u65f6\u83b7\u53d6\u4ee3\u7406\n        \"\"\"\n        getter = Getter()\n        while True:\n            print('\u5f00\u59cb\u6293\u53d6\u4ee3\u7406')\n            getter.run()\n            time.sleep(cycle)\n    def schedule_api(self):\n        \"\"\"\n        \u5f00\u542fAPI\n        \"\"\"\n        app.run(API_HOST, API_PORT)\n    def run(self):\n        print('\u4ee3\u7406\u6c60\u5f00\u59cb\u8fd0\u884c')\n        if TESTER_ENABLED:\n            tester_process = Process(target=self.schedule_tester)\n            tester_process.start()\n        if GETTER_ENABLED:\n            getter_process = Process(target=self.schedule_getter)\n            getter_process.start()\n        if API_ENABLED:\n            api_process = Process(target=self.schedule_api)\n            api_process.start()\n<\/pre>\n<h4>\u8fd0\u884c\uff1a<\/h4>\n<p>\u8bbf\u95eeAPI\u83b7\u5f97\u968f\u673a\u4ee3\u7406\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/83.png\" alt=\"\" class=\"alignnone size-full wp-image-1092\" width=\"399\" height=\"160\" \/><br \/>\n\u6d4b\u8bd5\u4ee3\u7406\u5e76\u8bbe\u7f6e\u5206\u6570\uff1a<br \/>\n<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/84.png\" 
alt=\"\" class=\"alignnone size-full wp-image-1093\" width=\"993\" height=\"519\" \/><br \/>\n\u67e5\u770bRedis Desktop\u4e2d\u7684\u6570\u636e\uff1a<img loading=\"lazy\" decoding=\"async\" src=\"http:\/\/www.sniper97.cn\/wp-content\/uploads\/2018\/07\/85.png\" alt=\"\" class=\"alignnone size-full wp-image-1094\" width=\"741\" height=\"252\" \/><br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n&nbsp;<br \/>\n.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u9875\u4ee3\u7801\u53ef\u4ee5\u5728\u8fd9\u91cc\u4e0b\u8f7d\u3002 1.\u8bbe\u7f6e\u4ee3\u7406 urllib \u4e2d\u4f7f\u7528\u4ee3\u7406\uff1a \u4ee3\u7801\uff1a\u5176\u4e2dproxy\u662f\u4ee3\u7406ip\u548c [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[12],"tags":[],"views":4656,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1087"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=1087"}],"version-history":[{"count":0,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/1087\/revisions"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=1087"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97
.cn\/index.php\/wp-json\/wp\/v2\/categories?post=1087"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/tags?post=1087"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}