本页源码可以在这里查看或下载。
获取页面源码:
# get page code
def get_one_page(url):
    """Fetch *url* and return its HTML text.

    Returns None when the request fails (any RequestException) or the
    server answers with a non-200 status code.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                      '6.99 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=5)
    except RequestException:
        return None
    # Guard clause instead of if/else: anything but 200 means no usable page.
    if response.status_code != 200:
        return None
    return response.text
分析源码,使用正则表达式从源码中提取所需字段,并使用yield关键字逐条生成结果:
# parse the message with regex
def parse_one_page(html):
    """Yield one dict per movie entry found in a Maoyan board page.

    Each yielded dict has the keys: index, title, image, actor, time, score.
    The actor/time fields have a fixed-width label prefix sliced off
    (3 chars for the star line, 5 chars for the release-time line —
    presumably '主演:' and '上映时间:'; verify against the page markup).

    Fix: the original ended with ``return items`` inside the generator —
    that value is only reachable via StopIteration and misleadingly
    suggests a list return, so it has been removed.
    """
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?data-src="(.*?)".*?class="star">'
                         '(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)
    for index, title, image, star, release, integer, fraction in re.findall(pattern=pattern, string=html):
        yield {
            'index': index,
            'title': title.strip(),
            'image': image,
            'actor': star.strip()[3:],       # drop the leading 3-char label
            'time': release.strip()[5:],     # drop the leading 5-char label
            'score': integer.strip() + fraction.strip()  # e.g. '9.' + '5' -> '9.5'
        }
将整理后的数据写入文件,通过json的dumps()实现字典的序列化,并指定ensure_ascii参数为False,这样可以保证中文形式而不是Unicode编码:
# write to file
def write_to_file(content):
    """Append *content* to MaoYanSpider.txt as one JSON line (UTF-8).

    ensure_ascii=False keeps Chinese characters readable instead of
    \\uXXXX escapes.
    """
    line = json.dumps(content, ensure_ascii=False)
    with open('MaoYanSpider.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
处理图片URL并将图片存储,使用正则表达式把原链接(小图)的@后缀去掉,得到大图链接,然后写入文件保存:
# get picture
def get_picture(content):
    """Download the poster of one parsed movie record into ./image/<title>.jpg.

    The image URL carries an '@...' resize suffix (thumbnail); stripping it
    yields the full-size picture.

    Fix: the original crashed with FileNotFoundError when ./image did not
    exist — the directory is now created on demand.
    """
    import os  # local import keeps this snippet self-contained

    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                      '6.99 Safari/537.36'
    }
    url = re.sub('(@.*)', "", content['image'])  # drop the thumbnail suffix
    title = content['title']
    os.makedirs('./image', exist_ok=True)  # ensure target directory exists
    response = requests.get(url=url, headers=headers, timeout=5)
    with open("./image/" + title + ".jpg", 'wb') as f:
        f.write(response.content)
主函数,使用循环遍历偏移量,并依次调用上述方法:
# main
if __name__ == '__main__':
    # The board paginates 10 movies per page: offsets 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        page_url = 'http://www.maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(page_url)
        for item in parse_one_page(html):
            get_picture(item)
爬取结果:
完整代码:
# -*- coding:utf-8 -*-
"""Maoyan top-100 movie-board spider.

Fetches each board page, extracts the movie records with a regex,
can dump them as JSON lines, and downloads the poster images.
"""
import json
import os
import re

import requests
from requests.exceptions import RequestException

# Shared request headers (was duplicated in two functions).
HEADERS = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                  '6.99 Safari/537.36'
}

# Compiled once at module level: one capture group per field of a <dd> entry.
MOVIE_PATTERN = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?data-src="(.*?)".*?class="star">'
                           '(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)


# get page code
def get_one_page(url):
    """Return the HTML of *url*, or None on any request error / non-200 status."""
    try:
        response = requests.get(url=url, headers=HEADERS, timeout=5)
    except RequestException:
        return None
    return response.text if response.status_code == 200 else None


# parse the message with regex
def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    Fix: removed the misleading ``return items`` that followed the yield
    loop — inside a generator that value is unreachable to normal callers.
    """
    for index, title, image, star, release, integer, fraction in re.findall(pattern=MOVIE_PATTERN, string=html):
        yield {
            'index': index,
            'title': title.strip(),
            'image': image,
            'actor': star.strip()[3:],       # drop the leading 3-char label
            'time': release.strip()[5:],     # drop the leading 5-char label
            'score': integer.strip() + fraction.strip()
        }


# write to file
def write_to_file(content):
    """Append *content* to MaoYanSpider.txt as one JSON line.

    ensure_ascii=False keeps Chinese text readable instead of \\uXXXX escapes.
    """
    with open('MaoYanSpider.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


# get picture
def get_picture(content):
    """Download the full-size poster of one movie record into ./image/.

    Fix: create the target directory on demand — the original raised
    FileNotFoundError when ./image did not already exist.
    """
    url = re.sub('(@.*)', "", content['image'])  # strip the '@...' thumbnail suffix
    os.makedirs('./image', exist_ok=True)
    response = requests.get(url=url, headers=HEADERS, timeout=5)
    with open("./image/" + content['title'] + ".jpg", 'wb') as f:
        f.write(response.content)


# main
if __name__ == '__main__':
    # 10 movies per page: offsets 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        url = 'http://www.maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        for item in parse_one_page(html):
            # write_to_file(item)
            get_picture(item)
.
0 条评论