本页代码可以在这里下载。
1.分析请求
打开新浪微博,过滤Ajax请求并查看。
我们可以看到URL中有4个参数,分别是type、value、containerid和page;其中value是用户的uid,containerid就是107603加上uid,page则是随翻页变化的页码。
2.分析响应
我们发现响应数据都在data下,其中cardlistInfo存储了微博总数等属性,cards则包含了10条微博的信息。
然后我们观察请求头部分,并找出最小有效子集(一般是cookies、ua、host、referer)
然后自定义头。
# Request headers sent with every call to the m.weibo.cn Ajax endpoint.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2656260571',
    # Adjacent literals are joined at compile time into one UA string.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/67.0.3396.99 Safari/537.36'
    ),
    # Marks the request as an XMLHttpRequest (Ajax) call.
    'X-Requested-With': 'XMLHttpRequest',
}
首先获取页面json:
# get page code
def get_page(page):
    """Fetch one page of the user's weibo list from the Ajax endpoint.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200 and
        the body is valid JSON, otherwise ``None``.
    """
    params = {
        'type': 'uid',
        'value': '2656260571',
        'containerid': '1076032656260571',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        # requests waits indefinitely unless a timeout is given, so a
        # stalled connection would otherwise freeze the whole crawl.
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Covers connection errors as well as timeouts.
        print('Error', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error page).
        print('Error', e.args)
        return None
随后处理这些json,提取出需要的信息:
# parse the code
def parse_page(json):
    """Yield a trimmed-down dict for every weibo found in a response.

    Args:
        json: Decoded response body (a dict), or a falsy value such as
            ``None`` when the page could not be fetched.

    Yields:
        One dict per card that carries an ``mblog`` entry, with the keys
        ``id``, ``text``, ``attitudes``, ``comment`` and ``reposts``.

    Raises:
        KeyError: If an ``mblog`` entry lacks one of the copied fields.
    """
    if not json:
        return
    # 'data' or 'cards' may be absent or None; treat that as an empty page
    # instead of failing with AttributeError/TypeError.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # A card may carry no 'mblog' entry; indexing None would raise
            # TypeError, so such cards are skipped.
            continue
        yield {
            'id': mblog['id'],
            'text': mblog['text'],
            'attitudes': mblog['attitudes_count'],
            'comment': mblog['comments_count'],
            'reposts': mblog['reposts_count'],
        }
存入数据库:
# save to database
def save_to_mongo(result):
    """Store *result* in the ``WeiBo`` collection of the ``WeiBoTest`` database.

    Args:
        result: A single document (dict) or a list of documents.
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    try:
        collection = client.WeiBoTest.WeiBo
        # Collection.insert() is deprecated and no longer exists in
        # PyMongo 4; insert_one/insert_many are its replacements.
        if isinstance(result, list):
            collection.insert_many(result)
        else:
            collection.insert_one(result)
    finally:
        # Each call opens its own client, so release its connections here.
        client.close()
主函数调用:
# main
if __name__ == '__main__':
    # Walk pages 1 through 14, echoing and storing every parsed weibo.
    for page in range(1, 15):
        for weibo in parse_page(get_page(page)):
            print(weibo)
            print()
            save_to_mongo(weibo)
运行结果:
console中:
Robo 3T中:
完整代码:
# -*- coding:utf-8 -*-
"""Crawl a user's weibo list through the m.weibo.cn Ajax API into MongoDB."""
from urllib.parse import urlencode

import pymongo
import requests

base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Request headers sent with every call to the Ajax endpoint.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2656260571',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '67.0.3396.99 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


# get page code
def get_page(page):
    """Fetch one page of the user's weibo list from the Ajax endpoint.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200 and
        the body is valid JSON, otherwise ``None``.
    """
    params = {
        'type': 'uid',
        'value': '2656260571',
        'containerid': '1076032656260571',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        # requests waits indefinitely unless a timeout is given, so a
        # stalled connection would otherwise freeze the whole crawl.
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Covers connection errors as well as timeouts.
        print('Error', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error page).
        print('Error', e.args)
        return None


# parse the code
def parse_page(json):
    """Yield a trimmed-down dict for every weibo found in a response.

    Args:
        json: Decoded response body (a dict), or a falsy value such as
            ``None`` when the page could not be fetched.

    Yields:
        One dict per card that carries an ``mblog`` entry, with the keys
        ``id``, ``text``, ``attitudes``, ``comment`` and ``reposts``.

    Raises:
        KeyError: If an ``mblog`` entry lacks one of the copied fields.
    """
    if not json:
        return
    # 'data' or 'cards' may be absent or None; treat that as an empty page
    # instead of failing with AttributeError/TypeError.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # A card may carry no 'mblog' entry; indexing None would raise
            # TypeError, so such cards are skipped.
            continue
        yield {
            'id': mblog['id'],
            'text': mblog['text'],
            'attitudes': mblog['attitudes_count'],
            'comment': mblog['comments_count'],
            'reposts': mblog['reposts_count'],
        }


# save to database
def save_to_mongo(result):
    """Store *result* in the ``WeiBo`` collection of the ``WeiBoTest`` database.

    Args:
        result: A single document (dict) or a list of documents.
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    try:
        collection = client.WeiBoTest.WeiBo
        # Collection.insert() is deprecated and no longer exists in
        # PyMongo 4; insert_one/insert_many are its replacements.
        if isinstance(result, list):
            collection.insert_many(result)
        else:
            collection.insert_one(result)
    finally:
        # Each call opens its own client, so release its connections here.
        client.close()


# main
if __name__ == '__main__':
    # Walk pages 1 through 14, echoing and storing every parsed weibo.
    for page in range(1, 15):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            print(result)
            print()
            save_to_mongo(result)
.
0 条评论