1.抓取高分番剧
原始网页

https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=4&st=1&sort=0&page=1
python 爬虫代码(储存到数据库)
主文件 <niche_gems.py>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
import requests
from bs4 import BeautifulSoup
import json
import time


def getReviewTimes(media_link):
    """Scrape one bangumi media page and return its review-count text.

    Returns the text of the last ``media-info-review-times`` div on the
    page, or None when the element is absent.  (The original raised
    UnboundLocalError when no such div existed.)
    """
    res = requests.get(media_link, timeout=10)  # timeout: don't hang forever
    soup = BeautifulSoup(res.text, 'html.parser')
    review_time = None
    for node in soup.find_all('div', class_='media-info-review-times'):
        review_time = node.text  # keep the last match, as the original did
    return review_time


def getAnime(url):
    """Fetch one page of the bilibili anime index API and build DB rows.

    Each element of the returned list is a 5-tuple
    ``(media_id, m_order, title, review_time, media_link)``, matching the
    column order of the animeByScore / animeByPlaycount tables.
    """
    res = requests.get(url, timeout=10)
    graded_data = json.loads(res.text)
    animes_list = graded_data['data']['list']
    animes_result = []
    for each in animes_list:
        media_id = each['media_id']
        media_link = f'https://www.bilibili.com/bangumi/media/md{media_id}'
        # One extra page fetch per anime to get its review count.
        review_time = getReviewTimes(media_link)
        # 'order' arrives as display text (e.g. a score string); strip
        # surrounding whitespace only, as before.
        m_order = each['order'].strip()
        animes_result.append(
            (media_id, m_order, each['title'], review_time, media_link))
    return animes_result
|
按照评分 by score
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
import niche_gems
import sqlite3
import csv
import time

conn = sqlite3.connect('anime.db')
c = conn.cursor()


def createTable():
    """Create the score-ordered anime table if it does not exist yet."""
    c.execute("""CREATE TABLE IF NOT EXISTS animeByScore(
        media_id INTEGER PRIMARY KEY,
        m_order REAL,
        title TEXT,
        review_time INTEGER,
        media_link TEXT)""")


def main():
    """Scrape index pages 1-89 ordered by score and store them in SQLite."""
    createTable()
    for i in range(1, 90):
        print(f'page {i} scraping...')
        # BUG FIX: the original URL contained '©right=-1' — a mangled
        # '&copyright=-1' (the '&copy' part was turned into the © entity).
        url_score = (
            'https://api.bilibili.com/pgc/season/index/result?'
            'season_version=-1&area=-1&is_finish=-1&copyright=-1'
            '&season_status=-1&season_month=-1&year=-1&style_id=-1'
            f'&order=4&st=1&sort=0&page={i}'
            '&season_type=1&pagesize=20&type=1'
        )
        time.sleep(1)  # be polite to the API between pages
        # The original accumulated into a list that was re-created every
        # iteration; insert the page's rows directly instead.
        rows = niche_gems.getAnime(url_score)
        c.executemany(
            'INSERT OR IGNORE INTO animeByScore VALUES(?,?,?,?,?)', rows)
        conn.commit()
        print(f'page {i} animeByScore insert successfully...')
        print(f'page {i} done...')
    c.close()
    conn.close()
    print('执行完毕,数据库已关闭!')


if __name__ == "__main__":
    main()
|
按照播放量 by play count
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
import niche_gems
import sqlite3

conn = sqlite3.connect('anime.db')
c = conn.cursor()


def createTable():
    """Create the play-count-ordered anime table if it does not exist."""
    # BUG FIX: the column was named 'order', an SQL keyword, so the
    # original CREATE TABLE failed with a syntax error.  Renamed to
    # m_order, matching the animeByScore table in the sibling script.
    c.execute("""CREATE TABLE IF NOT EXISTS animeByPlaycount(
        media_id INTEGER PRIMARY KEY,
        m_order REAL,
        title TEXT,
        review_time INTEGER,
        media_link TEXT)""")


def query_and_output():
    """Dump the whole table, sorted by score, to a text file."""
    # BUG FIX: 'SORT BY' is not SQL — the clause is ORDER BY.
    c.execute('SELECT * FROM animeByPlaycount ORDER BY m_order')
    data = c.fetchall()
    c.close()
    conn.close()
    file_path = '/Users/yao/www/python//bilibili/anime/animeByPlaycount.txt'
    with open(file_path, 'w', encoding='utf-8') as file:
        for row in data:
            # BUG FIX: file.write() was called with no argument (TypeError).
            file.write(f'{row}\n')


def main():
    """Scrape page 1 ordered by play count, store it, and export it."""
    # BUG FIX: '©right=-1' in the URL was a mangled '&copyright=-1'.
    url_play_count = (
        'https://api.bilibili.com/pgc/season/index/result?'
        'season_version=-1&area=-1&is_finish=-1&copyright=-1'
        '&season_status=-1&season_month=-1&year=-1&style_id=-1'
        '&order=2&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1'
    )
    animes_result_by_playcount = niche_gems.getAnime(url_play_count)
    createTable()
    # BUG FIX: executemany was missing both the VALUES placeholders and
    # the data argument.
    c.executemany(
        'INSERT OR IGNORE INTO animeByPlaycount VALUES(?,?,?,?,?)',
        animes_result_by_playcount)
    conn.commit()
    print('animeByPlaycount insert successfully...')
    query_and_output()
    print('animeByPlaycount file saved successfully...')


if __name__ == "__main__":
    main()
|
数据库内容

从数据库中取出并展示在 html 中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
import sqlite3
import csv
import pandas as pd
from prettytable import PrettyTable

# Pull every anime scoring above 9.7 out of the scraped database.
conn = sqlite3.connect('anime.db')
c = conn.cursor()
c.execute('SELECT * from animeByScore WHERE m_order > 9.7')
data = c.fetchall()
c.close()
conn.close()

html_uri = '/Users/yao/www/python/bilibili/anime/animeByScore.html'
title = "评分列表"

# Build the <tbody> with a single join instead of the original quadratic
# string += loop.
tbody_content = "".join(
    f"""
    <tr>
        <td>{record[0]}</td>
        <td>{record[1]}</td>
        <td>{record[2]}</td>
        <td>{record[3]}</td>
        <td>{record[4]}</td>
    </tr>
    """
    for record in data
)

content = f"""
<table>
    <thead>
        <th>anime_id</th>
        <th>m_order</th>
        <th>title</th>
        <th>review_time</th>
        <th>anime_link</th>
    </thead>
    <tbody>
        {tbody_content}
    </tbody>
</table>
"""

html = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <title>{title}</title>
</head>
<body>
    {content}
</body>
</html>
"""

with open(html_uri, 'w', newline='', encoding='utf-8') as f:
    f.write(html)
# BUG FIX: the original message claimed 'csv file saved' but this script
# writes an HTML file.
print('html file saved (^-^)V')
|
html 展示效果

小技巧:如何把 CSV 转换成 HTML
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
from prettytable import PrettyTable

# Convert the scraped CSV into an HTML table via PrettyTable.
file_path = '/Users/yao/www/python/bilibili/anime/animeByScore.csv'
# BUG FIX: the original left both file handles open; use context managers.
with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# First CSV line is the header; strip the trailing newline the original
# left on the last column name.
tb = PrettyTable([name.strip() for name in lines[0].split(',')])

# BUG FIX: range(1, 100) raised IndexError for files with fewer than 100
# lines (and silently truncated longer ones); iterate the real rows.
for line in lines[1:]:
    tb.add_row([cell.strip() for cell in line.split(',')])

html_path = '/Users/yao/www/python/bilibili/anime/html_file.html'
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(tb.get_html_string())
|
2.抓取视频弹幕并制作图云
python 爬虫弹幕
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
import requests
import json
import re


def get_cid(url):
    """Resolve a video page URL (containing a BV id) to the cid of part 1.

    NOTE: multi-part videos return several entries; this takes data[0]
    only, which is enough for a single-part helper tool.
    """
    # URL shape: https://www.bilibili.com/video/BVxxxx?p=1 — path segment 4
    # is the BV id, with any query string cut off.
    bvid = url.split('/')[4].split('?')[0]
    danmu_page_link = (
        f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp')
    res = requests.get(danmu_page_link, timeout=10)  # timeout added
    cid_page = json.loads(res.text)
    return cid_page['data'][0]['cid']


def get_danmu(cid):
    """Download the danmu XML for *cid* and return the comment texts."""
    danmu_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    res = requests.get(danmu_url, timeout=10)
    res_xml = res.content.decode('utf-8')
    # Each danmu is the text content of a <d ...>...</d> element.
    return re.findall('<d.*?>(.*?)</d>', res_xml)


def save_file(danmu_list):
    """Write one danmu per line to the output text file."""
    file_path = '/Users/yao/www/python/bilibili/video/danmu_file.txt'
    with open(file_path, 'w', encoding='utf-8') as file:
        # writelines: one call instead of two write() calls per danmu
        file.writelines(f'{item}\n' for item in danmu_list)


def main():
    source = 'https://www.bilibili.com/video/BV1xs411Q799?p=1'
    cid = get_cid(source)
    save_file(get_danmu(cid))
    print('file saved successfully...')


if __name__ == '__main__':
    main()
|
得到的弹幕文本

弹幕文本图云分析
分词代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
import jieba
from wordcloud import WordCloud

# Read the scraped danmu text.
file_path = '/Users/yao/www/python/bilibili/video/danmu_file.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text_str = file.read()

# Segment the Chinese text; WordCloud expects space-separated tokens.
word_str = " ".join(jieba.cut(text_str))

# PingFang (bundled with macOS) renders CJK glyphs — without a CJK font
# the cloud shows boxes instead of characters.
font_path = "/System/Library/Fonts/PingFang.ttc"

# Fixed typo ('wc_settintg') and reuse font_path instead of duplicating
# the literal — the original defined the variable but never used it.
wc_settings = {
    'font_path': font_path,
    'background_color': 'white',
    'width': 1000,
    'height': 860,
    'margin': 2,
}

wc = WordCloud(**wc_settings).generate(word_str)
wc.to_file('/Users/yao/www/python/bilibili/video/xiaojiayu_python_p1.png')
print('wordcloud done')
|
参考视频:https://www.bilibili.com/video/BV1g7411e7m4
得到的图云
(B 站的好多弹幕真是越来越不能看了……)

遇到的问题:生成的图片 中文乱码
查了一下这是因为中文字体包不适配,然后发现我们用 mac 自带的苹方字体就能非常简单的搞定。做词云的时候用的 Mac,所以 Windows 的解决方法没有尝试。
本地路径是:
font_path = "/System/Library/Fonts/PingFang.ttc"

在电脑中查看:
- 启动台——其他——打开字体册

- 如果你之前没有改过系统字体的话,默认选中苹方简体,右键选择在 Finder 中显示


这就是我们需要的字体包。
如果不需要其他参数的话,字体路径直接传入 WordCloud()参数就可以了。
wc = WordCloud(font_path).generate(word_str)
词云官方文档地址:https://github.com/amueller/word_cloud