本页源码可以在这里查看或下载。
 
获取页面源码:

# get page code
def get_one_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser ``User-agent`` header and a
    5 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when ``requests`` raises ``RequestException``.
    """
    browser_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                     '6.99 Safari/537.36')
    try:
        reply = requests.get(url=url, headers={'User-agent': browser_agent}, timeout=5)
        # Only a plain 200 counts as success; every other status maps to None.
        return reply.text if reply.status_code == 200 else None
    except RequestException:
        return None



解析源码:使用正则表达式从源码中提取每部电影的各项信息,并使用yield关键字逐条返回:

# parse the message with regex
def parse_one_page(html):
    """Extract the movie entries from the HTML of one board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) produces no entries instead
            of raising ``TypeError`` inside the regex engine.

    Yields:
        dict: One dictionary per matched ``<dd>`` entry with the string
        keys ``index``, ``title``, ``image``, ``actor``, ``time`` and
        ``score``.
    """
    if not html:
        return

    # re.S lets '.' cross line breaks, since one entry spans several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?data-src="(.*?)".*?class="star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)

    for index, title, image, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'title': title.strip(),
            'image': image,
            # Drops the first 3 characters of the stripped text -- presumably
            # the leading "starring" label on the page; verify against the site.
            'actor': star.strip()[3:],
            # Drops the first 5 characters -- presumably the "release time"
            # label; verify against the site.
            'time': release.strip()[5:],
            # The score is split across two tags (integer and fractional part).
            'score': integer.strip() + fraction.strip(),
        }

将整理后的数据写入文件:通过json的dumps()实现字典的序列化,并指定ensure_ascii参数为False,这样可以保证写入文件的是中文字符本身,而不是Unicode转义序列:

# write to file
def write_to_file(content):
    """Append ``content`` to ``MaoYanSpider.txt`` as one JSON line.

    Args:
        content: JSON-serialisable object (the script passes one movie
            dictionary per call).

    The file is opened in append mode with UTF-8 encoding, and
    ``ensure_ascii=False`` keeps non-ASCII characters readable in the
    output instead of turning them into ``\\uXXXX`` escapes.
    """
    serialized = json.dumps(content, ensure_ascii=False)
    with open('MaoYanSpider.txt', 'a', encoding='utf-8') as out:
        out.write(serialized)
        out.write('\n')

处理图片URL并保存图片:使用正则表达式把原链接(小图)中"@"及其后面的尺寸参数去掉,得到大图链接,然后将下载到的图片写入文件保存:

# get picture
def get_picture(content):
    """Download the poster of one movie and save it under ``./image/``.

    Args:
        content: Mapping with an ``'image'`` entry (poster URL) and a
            ``'title'`` entry (used as the file name, with ``.jpg`` added).

    The ``./image`` directory is created when missing. Responses whose
    status code is not 200 are skipped so that an error page is never
    stored as a ``.jpg``. Exceptions raised by ``requests.get`` are not
    caught here and propagate to the caller.
    """
    import os  # function-scope import keeps this block self-contained

    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                      '6.99 Safari/537.36'
    }
    # Everything from the first '@' onward is removed from the URL --
    # presumably the thumbnail size suffix, leaving the full-size image.
    url = re.sub('(@.*)', "", content['image'])
    # A path separator inside the title would change the target directory.
    title = re.sub(r'[\\/]', '_', content['title'])

    response = requests.get(url=url, headers=headers, timeout=5)
    if response.status_code != 200:
        return

    os.makedirs("./image", exist_ok=True)
    with open("./image/" + title + ".jpg", 'wb') as f:
        f.write(response.content)

 
主函数,使用循环遍历偏移量,并依次调用上述方法:

# main
if __name__ == '__main__':
    # Ten requests, advancing the ``offset`` query parameter by 10 each
    # time (0, 10, ..., 90).
    for offset in range(0, 100, 10):
        url = 'http://www.maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page signals a failed download with None; skip that
            # page instead of handing None to the regex parser.
            continue
        for item in parse_one_page(html):
            # To inspect or store the text fields instead of (or as well as)
            # the posters, call print(item) or write_to_file(item) here.
            get_picture(item)

爬取结果:


 
完整代码:

# -*- coding:utf-8 -*-
import json
import requests
import re
from requests.exceptions import RequestException
# get page code
def get_one_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser ``User-agent`` header and a
    5 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when ``requests`` raises ``RequestException``.
    """
    browser_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                     '6.99 Safari/537.36')
    try:
        reply = requests.get(url=url, headers={'User-agent': browser_agent}, timeout=5)
        # Only a plain 200 counts as success; every other status maps to None.
        return reply.text if reply.status_code == 200 else None
    except RequestException:
        return None
# parse the message with regex
def parse_one_page(html):
    """Extract the movie entries from the HTML of one board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) produces no entries instead
            of raising ``TypeError`` inside the regex engine.

    Yields:
        dict: One dictionary per matched ``<dd>`` entry with the string
        keys ``index``, ``title``, ``image``, ``actor``, ``time`` and
        ``score``.
    """
    if not html:
        return

    # re.S lets '.' cross line breaks, since one entry spans several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?data-src="(.*?)".*?class="star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)

    for index, title, image, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'title': title.strip(),
            'image': image,
            # Drops the first 3 characters of the stripped text -- presumably
            # the leading "starring" label on the page; verify against the site.
            'actor': star.strip()[3:],
            # Drops the first 5 characters -- presumably the "release time"
            # label; verify against the site.
            'time': release.strip()[5:],
            # The score is split across two tags (integer and fractional part).
            'score': integer.strip() + fraction.strip(),
        }
# write to file
def write_to_file(content):
    """Append ``content`` to ``MaoYanSpider.txt`` as one JSON line.

    Args:
        content: JSON-serialisable object (the script passes one movie
            dictionary per call).

    The file is opened in append mode with UTF-8 encoding, and
    ``ensure_ascii=False`` keeps non-ASCII characters readable in the
    output instead of turning them into ``\\uXXXX`` escapes.
    """
    serialized = json.dumps(content, ensure_ascii=False)
    with open('MaoYanSpider.txt', 'a', encoding='utf-8') as out:
        out.write(serialized)
        out.write('\n')
# get picture
def get_picture(content):
    """Download the poster of one movie and save it under ``./image/``.

    Args:
        content: Mapping with an ``'image'`` entry (poster URL) and a
            ``'title'`` entry (used as the file name, with ``.jpg`` added).

    The ``./image`` directory is created when missing. Responses whose
    status code is not 200 are skipped so that an error page is never
    stored as a ``.jpg``. Exceptions raised by ``requests.get`` are not
    caught here and propagate to the caller.
    """
    import os  # function-scope import keeps this block self-contained

    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.339'
                      '6.99 Safari/537.36'
    }
    # Everything from the first '@' onward is removed from the URL --
    # presumably the thumbnail size suffix, leaving the full-size image.
    url = re.sub('(@.*)', "", content['image'])
    # A path separator inside the title would change the target directory.
    title = re.sub(r'[\\/]', '_', content['title'])

    response = requests.get(url=url, headers=headers, timeout=5)
    if response.status_code != 200:
        return

    os.makedirs("./image", exist_ok=True)
    with open("./image/" + title + ".jpg", 'wb') as f:
        f.write(response.content)
# main
if __name__ == '__main__':
    # Ten requests, advancing the ``offset`` query parameter by 10 each
    # time (0, 10, ..., 90).
    for offset in range(0, 100, 10):
        url = 'http://www.maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page signals a failed download with None; skip that
            # page instead of handing None to the regex parser.
            continue
        for item in parse_one_page(html):
            # To inspect or store the text fields instead of (or as well as)
            # the posters, call print(item) or write_to_file(item) here.
            get_picture(item)

.


0 条评论

发表回复

Avatar placeholder

您的电子邮箱地址不会被公开。 必填项已用*标注