# time : 2020/6/1 11:37
# file : requests_weibo.py
# Software: PyCharm
# python_version: 3.6
# function: fetch all of Dilraba Dilmurat's Weibo posts and the detail URLs
#           needed to later scrape the fan comments of each post.
import time

import requests

# Module-level accumulators -- one entry per scraped post.
time_ = []        # publication time of the post ("created_at")
comment_url = []  # detail-page URL ("scheme") of the post
text = []         # text body of the post
comment_num = []  # comment count ("100万+" sentinel is mapped to 1000000)
zan_num = []      # like count ("attitudes_count")
ids = []          # unique post id (renamed from `id` to stop shadowing the
                  # builtin); needed to build the fan-comment URL, e.g.
                  # https://m.weibo.cn/comments/hotflow?id=<id>&mid=<id>&max_id_type=0


def spider(a):
    """Scrape one page of the user's Weibo feed into the module accumulators.

    a -- index of the feed page to fetch (passed through as the `page`
         query parameter).
    """
    # m.weibo.cn is the mobile site; its JSON API expects a mobile UA.
    url = "https://m.weibo.cn/api/container/getIndex"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) "
            "AppleWebKit/604.1.34 (KHTML, like Gecko) "
            "Version/11.0 Mobile/15A5341f Safari/604.1"
        )
    }
    # Query parameters copied from the XHR request the mobile site issues.
    data = {
        "uid": "1669879400",
        "t": "0",
        "luicode": "10000011",
        "lfid": "100103type=1&q=迪丽热巴",
        "type": "uid",
        "value": "1669879400",
        "containerid": "1076031669879400",
        "page": str(a),
    }
    # BUGFIX: Response.json() takes no `encoding` kwarg -- it forwards kwargs
    # to json.loads(), which removed `encoding` in Python 3.9, so the original
    # `.json(encoding="utf-8")` raises TypeError there. requests decodes the
    # body itself.
    data_html = requests.get(url=url, params=data, headers=headers).json()

    cards = data_html["data"]["cards"]
    # Index 0 is skipped, matching the original scraper (the first card is
    # presumably not a real post) -- TODO confirm against a live response.
    for num in range(1, len(cards)):
        card = cards[num]
        if card["card_type"] != 9:  # card_type 9 == an actual Weibo post
            continue
        mblog = card["mblog"]
        time_.append(mblog["created_at"])
        text.append(mblog["text"])
        ids.append(mblog["id"])
        comment_url.append(card["scheme"])
        zan_num.append(mblog["attitudes_count"])
        if mblog["comments_count"] == "100万+":
            # The API caps very large counts with this string; store a number.
            comment_num.append(1000000)
        else:
            comment_num.append(mblog["comments_count"])


def run_spider():
    """Crawl every feed page, pausing between requests to avoid throttling."""
    for i in range(0, 144):  # 144 feed pages in total -- TODO confirm count
        print("--正在抓取第【{}】页--".format(i))
        time.sleep(5)  # be polite: one request every 5 seconds
        spider(i)


def save_data():
    """Dump the accumulated columns to an Excel workbook via pandas."""
    import pandas as pd  # local import: pandas only needed when saving

    V = list(zip(ids, time_, text, comment_url, zan_num, comment_num))
    df = pd.DataFrame(
        V,
        columns=["id", "time", "text", "comment_url", "zan_num", "comments_num"],
    )
    print(df)
    df.to_excel("迪丽热巴所有的微博.xlsx")


if __name__ == "__main__":
    run_spider()
    save_data()
# Source / tutorial reference: https://xpanx.com/  (评论 = "comments")