
Commit 9df8bad

Author: mochazi
Commit message: 2021-12-22 豆瓣读书爬虫-分类实现🎉 (Douban Books scraper - category/tag crawling implemented)
1 parent eca861c commit 9df8bad

6 files changed: +801 -0 lines changed
File renamed without changes.
File renamed without changes.
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
beautifulsoup4==4.9.3
certifi==2021.10.8
chardet==4.0.0
charset-normalizer==2.0.7
idna==2.10
lxml==4.6.2
requests==2.25.1
soupsieve==2.2.1
urllib3==1.26.7
Lines changed: 241 additions & 0 deletions
@@ -0,0 +1,241 @@
from typing import List
import requests, json, csv, os
from uuid import uuid4
from bs4 import BeautifulSoup
from urllib import parse

'''Main domain'''
DOMAIN_URL = 'https://book.douban.com'

'''
Request headers
    user-agent (required)
    Referer (add it if you have one, otherwise leave it out)
    Cookie (add it if you are logged in to an account, otherwise leave it out)
'''
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'Referer': 'https://book.douban.com/',
    'Cookie': '填写你的Cookie'  # placeholder: paste your own Cookie value here
}

'''Set used to de-duplicate results'''
RESULT_SET_DATA = set()

'''
Collect the book tag links from the homepage
params:
    split_number: int --> how many tag links to keep, defaults to all of them

return: List[str] --> the tag links to crawl
'''
def get_book_tag_url(split_number: int = None) -> List[str]:

    html = requests.get(url=DOMAIN_URL, headers=HEADERS)
    soup = BeautifulSoup(html.text, 'lxml')

    tag_url_list_data = [
        DOMAIN_URL + parse.quote(tag_url['href'])
        for tag_url in soup.select('ul.hot-tags-col5.s ul a')
    ]

    if split_number:
        tag_url_list_data = tag_url_list_data[:split_number]

    return tag_url_list_data


'''
Parse each tag_url, page through it and collect the book details
params:
    tag_url_list_data: List[str] --> the book tag links
    parse_number: int --> paging parameter, 3 pages are crawled by default
    write_json_type: bool --> whether to append to the JSON file
    write_csv_type: bool --> whether to append to the CSV file
    write_image_type: bool --> whether to download the cover images
return: List[dict] --> the book details that were crawled successfully
'''
def parse_book_url_info(
        tag_url_list_data: List[str],
        parse_number: int = 3,
        write_json_type: bool = True,
        write_csv_type: bool = True,
        write_image_type: bool = True
) -> List[dict]:

    book_info_list_data = []

    for tag_url in tag_url_list_data:

        # Page through the tag: every 20 books count as one page
        # (the loop variable is named start so it does not shadow urllib.parse)
        for start in range(0, parse_number * 20, 20):

            # Paging URL
            parse_url = f'{tag_url}?start={start}'

            html = requests.get(url=parse_url, headers=HEADERS)
            soup = BeautifulSoup(html.text, 'lxml')

            # Select the books on the page
            books = soup.select('li.subject-item')

            for book in books:

                # Select the book link
                book_url = book.select_one('.info h2 a')['href']

                # Select the title
                title = book.select_one('.info h2 a').text.strip().replace(' ', '').replace('\n', '')

                # Select the author (the .pub line: author / publisher / date)
                info = book.select_one('.info div.pub').text.strip().replace(' ', '').replace('\n', '')

                # Select the rating (assumes the book has one; unrated entries would raise AttributeError)
                star = book.select_one('.rating_nums').text.strip().replace(' ', '').replace('\n', '')

                # Select the review count
                pl = book.select_one('.pl').text.strip().replace(' ', '').replace('\n', '')

                # Select the book introduction
                introduce = book.select_one('.info p').text.strip().replace(' ', '').replace('\n', '')

                # Get the cover image URL
                image_url = book.select_one('.nbg img')['src']

                # The Chinese keys double as the JSON keys and the CSV header row
                book_info_result = dict(
                    书本链接=book_url,
                    书名=title,
                    作者=info,
                    评分=star,
                    评价=pl,
                    书本简介=introduce,
                    图片链接=image_url
                )

                '''Build a hash of the result'''
                result_hash_data = hash(json.dumps(book_info_result, ensure_ascii=False))

                if result_hash_data not in RESULT_SET_DATA:

                    '''Add it to the de-duplication set'''
                    RESULT_SET_DATA.add(result_hash_data)

                    if write_image_type:
                        write_image_book_info(
                            image_url=image_url,
                            image_name=title,
                            headers=HEADERS
                        )

                    # Check whether to write the JSON file
                    if write_json_type:
                        write_json_book_info(book_info_result)

                    # Check whether to write the CSV file
                    if write_csv_type:
                        write_csv_book_info(
                            headers=[key for key, value in book_info_result.items()],
                            book_info=[value for key, value in book_info_result.items()]
                        )

                    print(book_info_result)

                    book_info_list_data.append(book_info_result)

    return book_info_list_data


'''
Save the cover image and append to the image-mapping JSON file
params:
    image_url: str --> image URL
    image_name: str --> image name
    headers: dict --> request headers
'''
def write_image_book_info(image_url: str, image_name: str, headers: dict):

    '''Make sure the image file name is unique'''
    uuid_id = uuid4()

    filename = './保存图片/图片'

    image_file_name = f'{filename}/{uuid_id}.jpg'

    image_map_file_name = './保存图片/image_map_data.json'

    '''Create the folder if it does not exist yet'''
    if not os.path.exists(filename):
        os.makedirs(filename)

    html = requests.get(url=image_url, headers=headers)

    '''Write the image'''
    with open(image_file_name, 'wb') as f:

        f.write(html.content)

    '''Append to the image-mapping JSON file'''
    with open(image_map_file_name, 'a+', encoding='utf-8') as f:

        f.write(json.dumps(dict(image_name=image_name, uuid=str(uuid_id), image_url=image_url), ensure_ascii=False) + '\n')


'''
Write the book details to a JSON file
params:
    book_info: dict --> the book details that were crawled successfully
'''
def write_json_book_info(book_info: dict):

    with open('book_info.json', 'a+', encoding='utf-8') as f:

        '''
        json.dumps() turns the dict into a str (the JSON line is just a str)
        ensure_ascii=False keeps the Chinese characters readable in the JSON
        '''
        f.write(json.dumps(book_info, ensure_ascii=False) + '\n')


'''
Write the book details to a CSV file (with a header row)
params:
    headers: list --> CSV header row
    book_info: list --> the book details that were crawled successfully
'''
def write_csv_book_info(headers: list, book_info: list):

    '''
    Cross-platform note:
        on Windows the csv writer has a quirk where
        writerows() produces blank lines between rows,
        so newline='' is passed to open();
        it is not needed if you never see that behaviour
    '''

    '''
    Check whether the CSV file has been created yet;
    if not, create it with the header row
    '''
    if not os.path.exists('book_info.csv'):

        with open('book_info.csv', 'a+', encoding='utf-8', newline='') as f:

            f_csv = csv.writer(f)
            f_csv.writerow(headers)

    '''
    Append the rows one at a time
    '''
    with open('book_info.csv', 'a+', encoding='utf-8', newline='') as f:

        f_csv = csv.writer(f)
        f_csv.writerow(book_info)  # append one row


if __name__ == '__main__':

    book_tag_url = get_book_tag_url(1)

    book_url_info = parse_book_url_info(book_tag_url)

0 commit comments
