精易论坛
Title:
Python web crawler: novel downloader
Author:
天生表演家
Time:
2020-4-28 19:27
Title:
Python web crawler: novel downloader
from bs4 import BeautifulSoup
import requests, os, re


# Fetch the novel's index page, which holds the chapter list and links
if __name__ == "__main__":
    server = "https://www.abcxs.com"
    url = "https://www.abcxs.com/book/13417/#main"
    r = requests.get(url)
    html = r.text

    # Get the book title from the og:title meta tag
    title_bf = BeautifulSoup(html, 'html.parser')
    title = title_bf.find_all(property='og:title')
    print(title)  # debug: show the matched meta tag
    searchObj = re.search('<meta content="(.*?)" property=', str(title), re.M | re.I)
    if searchObj:
        print("searchObj.group(1) : ", searchObj.group(1))
        ShuMing = searchObj.group(1)
    else:
        raise SystemExit("Nothing found!!")

    # Get the novel's table of contents: every <a> inside div.listmain
    div_bf = BeautifulSoup(html, 'html.parser')
    div = div_bf.find_all('div', class_="listmain")
    a_bf = BeautifulSoup(str(div[0]), 'html.parser')
    a = a_bf.find_all('a')

    # Create the output directory, named after the book
    path = "J:/python/Python/我的Python学习/爬虫及文件写入/" + ShuMing
    if not os.path.exists(path):
        os.makedirs(path)

    # Download the body text of every chapter
    for each in a:
        print(each.string, server + each.get('href'))
        r = requests.get(server + each.get('href'))
        html = r.text
        bf = BeautifulSoup(html, 'html.parser')
        text_content = bf.find_all('div', class_='showtxt')
        print(text_content[0].text.replace('\xa0' * 8, '\n'))

        # Write the chapter to its own file, replacing non-breaking spaces
        with open(path + "/" + each.string + '.txt', 'w', encoding='utf-8') as f:
            f.write(text_content[0].text.replace('\xa0', ' '))
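A side note on the title step: running a regex over str(title) works, but it depends on the printed form of a tag list, which is brittle. BeautifulSoup can read the meta tag's content attribute directly. A minimal sketch, assuming the same page structure as above (the helper name get_book_title and the timeout value are illustrative, not from the original post):

from bs4 import BeautifulSoup
import requests

def get_book_title(url):
    # Fetch and parse the book's index page
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    # Find the <meta property="og:title"> tag and read its content
    # attribute directly, instead of regexing over str(find_all(...))
    tag = soup.find('meta', property='og:title')
    return tag['content'] if tag else None

print(get_book_title("https://www.abcxs.com/book/13417/"))

The same attribute-access pattern would also replace the listmain/showtxt lookups if those ever need to get more selective.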
Author:
michell12
Time:
2020-5-27 18:17
You didn't even paste this properly.
Author:
天生表演家
Time:
2020-5-27 19:18
michell12 posted on 2020-5-27 18:17:
You didn't even paste this properly.
Where exactly did I paste carelessly?
Author:
chhzll
Time:
2020-11-28 00:27
Nice work...