1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
# Purpose: a simple Weibo backup tool that saves a given user's posts to a txt file.
# Learning goals (translated from the original Chinese notes):
# 1. Practice Python basics.
# 2. Fetch web pages with Python (HTTP calls, simple network requests, third-party libs).
# 3. Scrape the Weibo mobile API (browser devtools capture, HTTP basics).
# 4. Store the scraped posts in a local database (SQLite setup + basic SQL CRUD).
# 5. Read the posts back from the database and write them out as a txt file.
import requests
import json
import time
import sqlite3
from bs4 import BeautifulSoup  # NOTE(review): imported but never used in this script


def get_page(url):
    """Fetch *url* with mobile-Weibo browser headers and return the Response."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://m.weibo.cn/u/6587600437'
    }
    res = requests.get(url, headers=headers)
    return res


def get_since_id(res):
    """Extract the pagination cursor (since_id) from an API response."""
    weibo_json = json.loads(res.text)
    since_id = int(weibo_json['data']['cardlistInfo']['since_id'])
    return since_id


def get_weibo(res):
    """Parse one API response into a list of post tuples.

    Each tuple is (id, created_at, text, reposts_count, comments_count,
    attitudes_count), matching the column order of the weibo_person table.
    """
    weibo_json = json.loads(res.text)
    weibo_cards = weibo_json["data"]["cards"]
    result = []
    for each in weibo_cards:
        # Robustness fix: card lists can contain non-post entries without an
        # "mblog" key (e.g. recommendations); the original raised KeyError here.
        if "mblog" not in each:
            continue
        mblog = each["mblog"]
        result.append((
            mblog["id"],
            mblog["created_at"],
            mblog["text"],
            mblog["reposts_count"],
            mblog["comments_count"],
            mblog["attitudes_count"],
        ))
    return result


# Module-level SQLite connection shared by create_table / query_and_output / main.
conn = sqlite3.connect('weibo.db')
c = conn.cursor()


def create_table():
    """Create the weibo_person table if it does not exist yet."""
    c.execute("""CREATE TABLE IF NOT EXISTS weibo_person(
        weibo_id int,
        created_at text,
        weibo_text text,
        reposts_count int,
        comments_count int,
        attitudes_count int
    )""")


def query_and_output(output_path='/Users/yao/www/python/微博/leftchenn.txt'):
    """Dump every stored post to *output_path* as plain text, then close the DB.

    The path is now a parameter (defaulting to the original hard-coded
    location) so the tool can write elsewhere without editing the source.
    Note: this closes the module-level cursor and connection, so it must be
    the last database operation of the run.
    """
    c.execute('SELECT * FROM weibo_person')
    data = c.fetchall()
    c.close()
    conn.close()
    with open(output_path, 'w', encoding='utf-8') as file:
        for item in data:
            # BUG FIX: item[5] is attitudes_count (likes), so the label is
            # 点赞数 — the original repeated 转发数 (reposts) by mistake.
            string = f'发布时间:{item[1]} \n 微博内容:{item[2]} \n 转发数:{item[3]} 评论数:{item[4]} 点赞数:{item[5]} \n\n\n '
            file.write(string)


def main():
    """Scrape the first page plus 10 follow-up pages, store them, dump to txt."""
    target_url = ('https://m.weibo.cn/api/container/getIndex?uid=1768825052'
                  '&luicode=10000011&lfid=1076031768825052&type=uid'
                  '&value=1768825052&containerid=1076031768825052')
    res = get_page(target_url)
    result = get_weibo(res)
    since_id = get_since_id(res)
    for i in range(10):
        print('page {} scraping...'.format(i))
        # Each follow-up page is the same query plus the pagination cursor.
        new_url = f'{target_url}&since_id={since_id}'
        time.sleep(1)  # be polite to the server between page fetches
        res = get_page(new_url)
        result.extend(get_weibo(res))
        since_id = get_since_id(res)
        print('page {} scrap done...'.format(i))
    create_table()
    c.executemany('INSERT INTO weibo_person VALUES(?,?,?,?,?,?)', result)
    conn.commit()
    print('insert successfully...')
    query_and_output()
    print('output successfully...')


if __name__ == "__main__":
    main()
|