"""Scrape the Douban Movie Top 250, store the results in SQLite, and export them to a text file."""
import sqlite3
import time

import requests
from bs4 import BeautifulSoup


def open_url(url):
    # The Cookie below is the author's own logged-in Douban session;
    # replace it with your own if requests start getting blocked.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie': 'bid=U5nCZ8p0dQU; gr_user_id=cb5f0143-dc7b-4558-8ca8-645601af3a4c; _vwo_uuid_v2=D8FFB9019ECC277DA812225E3B2109D06|9ff6373ff5de826c9b373e0499c2fd95; __gads=ID=fca53c413c37e6b3:T=1583725810:S=ALNI_Ma0O2oj2sLthssNt4FqeaZYp7EL_Q; ll="108288"; douban-fav-remind=1; __yadk_uid=UtObTTkrwA7qukDAgdd2INMv9LH52PEA; viewed="26829016_26264642_26277694_4315606_1955110_1400498_4065258_1094802_26939853_12411215"; __utmc=30149280; __utmc=223695111; dbcl2="33304907:UiHXJr1eiGI"; ck=PYWO; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1591627160%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fmovie.douban.com%252Ftop250%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.542734861.1583725808.1591614666.1591627161.51; __utmb=30149280.0.10.1591627161; __utmz=30149280.1591627161.51.46.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utma=223695111.1397865339.1583730183.1591614666.1591627161.10; __utmb=223695111.0.10.1591627161; __utmz=223695111.1591627161.10.8.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=6441328ab4c5f495.1583730182.10.1591627294.1591614962.',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }
    res = requests.get(url, headers=headers)
    return res


def find_movies(res):
    """Parse one Top 250 page into (rank, title, profile) tuples."""
    soup = BeautifulSoup(res.text, 'html.parser')

    # Movie titles
    movies = []
    targets = soup.find_all('div', class_='hd')
    for each in targets:
        movies.append(each.a.span.text)

    # Ranking numbers: each film's Top 250 position sits in an <em> tag
    ranks = []
    targets = soup.find_all('em')
    for each in targets:
        ranks.append(int(each.text))

    # Profiles: line 1 is director/cast, line 2 is year/country/genre
    profiles = []
    targets = soup.find_all('div', class_='bd')
    for each in targets:
        try:
            lines = each.p.text.split('\n')
            profiles.append(lines[1].strip() + lines[2].strip())
        except (AttributeError, IndexError):
            continue

    return list(zip(ranks, movies, profiles))


def find_depth(res):
    """Read the total page count: the page-number link just before the 'next' button."""
    soup = BeautifulSoup(res.text, 'html.parser')
    # Two previous_sibling hops: the first only skips the whitespace text node
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)


conn = sqlite3.connect('douban_movies.db')
c = conn.cursor()


def create_table():
    c.execute('CREATE TABLE IF NOT EXISTS douban_movies250'
              '(item_id INT, rank INT, movie TEXT, profile TEXT)')


def query_and_output():
    c.execute('SELECT * FROM douban_movies250')
    data = c.fetchall()
    c.close()
    conn.close()
    # Author's local path; adjust for your own machine
    with open('/Users/yao/www/python/douban_movies250.txt', 'w', encoding='utf-8') as file:
        for row in data:
            file.write(f'Top {row[1]} : 《{row[2]}》,\n Synopsis: {row[3]}\n')


def main():
    host = 'https://movie.douban.com/top250'
    res = open_url(host)
    depth = find_depth(res)

    result = []
    for i in range(depth):
        url = host + '/?start=' + str(25 * i)  # 25 films per page
        time.sleep(1)  # be polite: at most one request per second
        res = open_url(url)
        result.extend(find_movies(res))

    # Prepend a sequential item_id so each row matches the 4-column schema
    result = [(i,) + row for i, row in enumerate(result)]

    create_table()
    c.executemany('INSERT INTO douban_movies250 VALUES(?,?,?,?)', result)
    conn.commit()
    print('inserted into database successfully')

    query_and_output()
    print('wrote output file successfully')


if __name__ == '__main__':
    main()
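# --- Optional sanity check (a minimal sketch, not part of the scraper) ---
# Assuming main() has already run and produced douban_movies.db with the
# schema created above, this standalone snippet prints the first three rows
# to verify the scrape; the file and table names come from the script itself.
import sqlite3

check_conn = sqlite3.connect('douban_movies.db')
for item_id, rank, movie, profile in check_conn.execute(
        'SELECT item_id, rank, movie, profile FROM douban_movies250 '
        'ORDER BY rank LIMIT 3'):
    print(f'#{rank} {movie} -> {profile[:40]}')
check_conn.close()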