1
+ from typing import List
2
+ import requests ,json ,csv ,os
3
+ from uuid import uuid4
4
+ from bs4 import BeautifulSoup
5
+ from urllib import parse
6
+
7
# Base domain for Douban Books.
DOMAIN_URL = 'https://book.douban.com'

# Request headers:
#   user-agent — required
#   Referer    — include if available
#   Cookie     — include if you are logged in with an account
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'Referer': 'https://book.douban.com/',
    'Cookie': '填写你的Cookie',
}

# Hashes of already-seen results, used for de-duplication.
RESULT_SET_DATA = set()
24
+
25
def get_book_tag_url(split_number: int = None) -> List[str]:
    """Collect the hot-tag listing URLs from the Douban Books front page.

    Args:
        split_number: how many tag URLs to keep; ``None`` (default) keeps
            them all.  (The old docstring called this ``parse_number`` —
            the name now matches the signature.)

    Returns:
        Absolute, percent-encoded tag listing URLs.
    """
    response = requests.get(url=DOMAIN_URL, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'lxml')

    tag_url_list_data = [
        DOMAIN_URL + parse.quote(anchor['href'])
        for anchor in soup.select('ul.hot-tags-col5.s ul a')
    ]

    # Explicit None check: the old truthiness test (`if split_number:`)
    # silently returned the FULL list when a caller passed 0; now 0 means
    # "take none", as slicing semantics suggest.
    if split_number is not None:
        tag_url_list_data = tag_url_list_data[:split_number]

    return tag_url_list_data
46
+
47
+
48
def parse_book_url_info(
    tag_url_list_data: List[str],
    parse_number: int = 3,
    write_json_type: bool = True,
    write_csv_type: bool = True,
    write_image_type: bool = True,
) -> List[dict]:
    """Page through each tag listing URL and scrape its book entries.

    Args:
        tag_url_list_data: tag listing URLs (from ``get_book_tag_url``).
        parse_number: pages to crawl per tag, 20 books each.  The original
            ``range(0, parse_number * 20 + 1, 20)`` crawled one page too
            many; this now crawls exactly ``parse_number`` pages, matching
            the documented default of 3.
        write_json_type: append each new book to ``book_info.json``.
        write_csv_type: append each new book to ``book_info.csv``.
        write_image_type: download each new book's cover image.

    Returns:
        The de-duplicated book records scraped in this call.
    """

    def _clean_text(node) -> str:
        # Some books lack a node (e.g. no rating yet): select_one returns
        # None, and the original code crashed with AttributeError. Return
        # '' in that case instead.
        if node is None:
            return ''
        return node.text.strip().replace(' ', '').replace('\n', '')

    book_info_list_data = []

    for tag_url in tag_url_list_data:

        # 20 books per page. The loop variable is `page_start`, not
        # `parse`, so it no longer shadows `urllib.parse` imported at
        # module level.
        for page_start in range(0, parse_number * 20, 20):

            page_url = f'{tag_url}?start={page_start}'

            html = requests.get(url=page_url, headers=HEADERS)
            soup = BeautifulSoup(html.text, 'lxml')

            for book in soup.select('li.subject-item'):

                # Detail-page URL and cover image come from attributes,
                # the rest from cleaned node text.
                book_url = book.select_one('.info h2 a')['href']
                title = _clean_text(book.select_one('.info h2 a'))
                info = _clean_text(book.select_one('.info div.pub'))
                star = _clean_text(book.select_one('.rating_nums'))
                pl = _clean_text(book.select_one('.pl'))
                introduce = _clean_text(book.select_one('.info p'))
                image_url = book.select_one('.nbg img')['src']

                book_info_result = dict(
                    书本链接=book_url,
                    书名=title,
                    作者=info,
                    评分=star,
                    评价=pl,
                    书本简介=introduce,
                    图片链接=image_url,
                )

                # Hash of the serialized record is the de-duplication key.
                result_hash_data = hash(
                    json.dumps(book_info_result, ensure_ascii=False)
                )
                if result_hash_data in RESULT_SET_DATA:
                    continue
                RESULT_SET_DATA.add(result_hash_data)

                if write_image_type:
                    write_image_book_info(
                        image_url=image_url,
                        image_name=title,
                        headers=HEADERS,
                    )

                if write_json_type:
                    write_json_book_info(book_info_result)

                if write_csv_type:
                    write_csv_book_info(
                        headers=list(book_info_result.keys()),
                        book_info=list(book_info_result.values()),
                    )

                print(book_info_result)
                book_info_list_data.append(book_info_result)

    return book_info_list_data
144
+
145
+
146
+
147
def write_image_book_info(image_url: str, image_name: str, headers: dict):
    """Download a cover image and append a title→uuid mapping record.

    Args:
        image_url: URL of the cover image to download.
        image_name: human-readable book title stored in the mapping file.
        headers: HTTP headers to send with the download request.
    """
    # uuid4 keeps saved filenames unique even for duplicate titles.
    uuid_id = uuid4()

    image_dir = './保存图片/图片'
    image_file_name = f'{image_dir}/{uuid_id}.jpg'
    image_map_file_name = './保存图片/image_map_data.json'

    # exist_ok=True removes the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` guard.
    os.makedirs(image_dir, exist_ok=True)

    response = requests.get(url=image_url, headers=headers)

    # Write the raw image bytes.
    with open(image_file_name, 'wb') as f:
        f.write(response.content)

    # Append one JSON line: title -> uuid -> source URL.
    with open(image_map_file_name, 'a+', encoding='utf-8') as f:
        record = dict(
            image_name=image_name,
            uuid=str(uuid_id),
            image_url=image_url,
        )
        f.write(json.dumps(record, ensure_ascii=False) + '\n')
180
+
181
+
182
+
183
def write_json_book_info(book_info: dict):
    """Append one scraped book record to ``book_info.json`` as a JSON line.

    ``ensure_ascii=False`` keeps the Chinese field names and values
    human-readable in the output file.
    """
    line = json.dumps(book_info, ensure_ascii=False)
    with open('book_info.json', 'a+', encoding='utf-8') as out_file:
        out_file.write(line + '\n')
197
+
198
+
199
+
200
def write_csv_book_info(headers: list, book_info: list):
    """Append one book row to ``book_info.csv``, emitting the header row
    first if the file does not exist yet.

    Args:
        headers: CSV column names.
        book_info: one row of values, in the same order as ``headers``.

    Note:
        ``newline=''`` is required on Windows, where the csv module would
        otherwise produce blank lines between rows.
    """
    # Decide BEFORE opening whether the header is still needed: append
    # mode creates the file, so the existence check must come first.
    need_header = not os.path.exists('book_info.csv')

    # Single open for both header and row — the original opened the same
    # file twice in a row.
    with open('book_info.csv', 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if need_header:
            writer.writerow(headers)
        writer.writerow(book_info)
236
+
237
if __name__ == '__main__':
    # Grab a single hot-tag listing URL, then scrape its books
    # (3 pages by default, writing JSON, CSV and cover images).
    tag_urls = get_book_tag_url(1)
    scraped_books = parse_book_url_info(tag_urls)
0 commit comments