Python web scraping: grabbing the Douban Movie Top 250

Every day, Douban users rate the films they have watched on a scale from "very poor" to "highly recommended". Based on how many people have seen each film and the ratings it has received, Douban's algorithm produces the Douban Movie Top 250.

Source page:

https://movie.douban.com/top250

(screenshot of the Douban Top 250 page)
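
The Top 250 is split across 10 pages of 25 entries each, and each page is addressed through a start query parameter (start=0, 25, 50, … 225); this is what the 25 * i offset in main() below relies on. A minimal sketch of how those page URLs can be generated (the variable names here are only illustrative):

host = 'https://movie.douban.com/top250'
# pages are addressed by an offset of 25 entries: start=0, 25, ..., 225
page_urls = [host + '/?start=' + str(25 * i) for i in range(10)]
print(page_urls[0])    # https://movie.douban.com/top250/?start=0
print(page_urls[-1])   # https://movie.douban.com/top250/?start=225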

Python scraper code

import requests
from bs4 import BeautifulSoup
import sqlite3
import time


def open_url(url):
    # fetch a page with browser-like headers so Douban serves the full HTML
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        "Cookie": 'bid=U5nCZ8p0dQU; gr_user_id=cb5f0143-dc7b-4558-8ca8-645601af3a4c; _vwo_uuid_v2=D8FFB9019ECC277DA812225E3B2109D06|9ff6373ff5de826c9b373e0499c2fd95; __gads=ID=fca53c413c37e6b3:T=1583725810:S=ALNI_Ma0O2oj2sLthssNt4FqeaZYp7EL_Q; ll="108288"; douban-fav-remind=1; __yadk_uid=UtObTTkrwA7qukDAgdd2INMv9LH52PEA; viewed="26829016_26264642_26277694_4315606_1955110_1400498_4065258_1094802_26939853_12411215"; __utmc=30149280; __utmc=223695111; dbcl2="33304907:UiHXJr1eiGI"; ck=PYWO; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1591627160%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fmovie.douban.com%252Ftop250%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.542734861.1583725808.1591614666.1591627161.51; __utmb=30149280.0.10.1591627161; __utmz=30149280.1591627161.51.46.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utma=223695111.1397865339.1583730183.1591614666.1591627161.10; __utmb=223695111.0.10.1591627161; __utmz=223695111.1591627161.10.8.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=6441328ab4c5f495.1583730182.10.1591627294.1591614962.',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }
    res = requests.get(url, headers=headers)
    return res


def find_movies(res):
    soup = BeautifulSoup(res.text, 'html.parser')
    # movie titles
    movies = []
    targets = soup.find_all('div', class_='hd')
    for each in targets:
        movies.append(each.a.span.text)
    # rank on the list (the <em> tags hold the 1-250 position, not the star rating)
    ranks = []
    targets = soup.find_all('em')
    for each in targets:
        index = int(each.text)
        ranks.append(index)
    # brief info (director / cast / year / genre)
    profiles = []
    targets = soup.find_all('div', class_='bd')
    for each in targets:
        try:
            profiles.append(each.p.text.split('\n')[
                1].strip() + each.p.text.split('\n')[2].strip())
        except (AttributeError, IndexError):
            # skip 'bd' blocks that are not movie entries and lack the expected <p>
            continue
    data_matrix = [ranks, movies, profiles]
    result = list(zip(*data_matrix))
    return result


def find_depth(res):
    # the total page count sits just before the 'next' link in the paginator
    soup = BeautifulSoup(res.text, 'html.parser')
    depth = soup.find(
        'span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)


# open (or create) the SQLite database that will hold the results
conn = sqlite3.connect("douban_movies.db")
c = conn.cursor()


def create_table():
    c.execute(
        "CREATE TABLE IF NOT EXISTS douban_movies250(item_id INT,rank INT, movie TEXT, profile TEXT)")


def query_and_output():
    c.execute('SELECT * FROM douban_movies250')
    data = c.fetchall()
    c.close()
    conn.close()
    with open('/Users/yao/www/python/douban_movies250.txt', 'w', encoding='utf-8') as file:
        for row in data:
            string = f'Top {row[1]} : 《{row[2]}》,\n 简介: {row[3]}\n'
            file.write(string)


def main():
    host = 'https://movie.douban.com/top250'
    res = open_url(host)
    depth = find_depth(res)
    result = []
    for i in range(depth):
        url = host + '/?start=' + str(25 * i)
        time.sleep(1)
        res = open_url(url)
        result.extend(find_movies(res))
    # give each row a sequential item_id so it matches the four-column table
    result = [(i, *row) for i, row in enumerate(result)]
    create_table()
    c.executemany('INSERT INTO douban_movies250 VALUES(?,?,?,?)', result)
    conn.commit()
    print('inserted into database successfully')
    query_and_output()
    print('text file written successfully')


if __name__ == "__main__":
    main()
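
Besides the standard library (sqlite3 and time ship with Python), the script only needs requests and beautifulsoup4, e.g. pip install requests beautifulsoup4. Note that the Cookie in open_url() is a logged-in session taken from the author's own browser; if you reuse the script, replace it with a cookie from your own browser session (or remove the header and check whether Douban still serves the pages).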

The resulting text file:

(screenshot of the generated douban_movies250.txt)
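
If you want to check what actually landed in SQLite, independent of the text file, a quick query sketch (assuming the script above has already run and douban_movies.db is in the working directory):

import sqlite3

conn = sqlite3.connect('douban_movies.db')
c = conn.cursor()
c.execute('SELECT COUNT(*) FROM douban_movies250')
print(c.fetchone()[0])  # should print 250
c.execute('SELECT rank, movie FROM douban_movies250 ORDER BY rank LIMIT 3')
for rank, movie in c.fetchall():
    print(rank, movie)
conn.close()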


Unless otherwise stated, all articles on this blog are licensed under CC BY-SA 4.0; please credit the source when reposting.