精易论坛

标题: Python 网络爬虫,小说下载 [打印本页]

作者: 天生表演家    时间: 2020-4-28 19:16
标题: Python 网络爬虫,小说下载
第一次写爬虫,代码还是有点小问题,就是写入文件那块,写了一半的时候,出现编码无法转换的问题,有能力的可自行修改,修改好后记得通知我一下哦!

 # Novel downloader: scrape a book's chapter index, then save every chapter as a UTF-8 text file.
import os
import re

import requests
from bs4 import BeautifulSoup

# Characters that are not allowed in Windows file names (plus line breaks/tabs).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with the built-in HTML parser.

    Raises ``requests.HTTPError`` on a non-2xx response so a failed download
    is not silently parsed as if it were a chapter.
    """
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def safe_filename(name):
    """Return ``name`` with characters that are illegal in file names replaced by ``_``."""
    return _ILLEGAL_FILENAME_CHARS.sub("_", name).strip()


if __name__ == "__main__":
    server = "https://www.abcxs.com"
    url = "https://www.abcxs.com/book/13417/#main"

    index_page = fetch_soup(url)

    # Book title: taken from the content attribute of the og:title tag.
    title_tag = index_page.find(attrs={"property": "og:title"})
    book_title = title_tag.get("content") if title_tag is not None else None
    if not book_title:
        # Without a title there is no output directory name, so stop here
        # instead of failing later with a NameError.
        raise SystemExit("Nothing found!!")
    print("Book title:", book_title)

    # Output directory: created once, before the chapter loop.
    path = "J:/python/Python/我的Python学习/爬虫及文件写入/" + safe_filename(book_title)
    os.makedirs(path, exist_ok=True)

    # Chapter index: every <a> inside <div class="listmain">.
    catalogue = index_page.find("div", class_="listmain")
    if catalogue is None:
        raise SystemExit("Chapter list (div.listmain) not found")

    for link in catalogue.find_all("a"):
        href = link.get("href")
        chapter_name = link.get_text(strip=True)
        if not href or not chapter_name:
            # Anchor without a target or a title: nothing to download or name.
            continue

        chapter_url = server + href
        print(chapter_name, chapter_url)

        # Chapter body lives in <div class="showtxt">.
        chapter_page = fetch_soup(chapter_url)
        content = chapter_page.find("div", class_="showtxt")
        if content is None:
            print("No chapter text found, skipped:", chapter_url)
            continue

        chapter_text = content.text
        # The site separates paragraphs with runs of 8 non-breaking spaces.
        print(chapter_text.replace("\xa0" * 8, "\n"))

        # Write explicitly as UTF-8: the platform default (GBK on Chinese
        # Windows) cannot encode every character and raised
        # UnicodeEncodeError partway through the download.
        file_path = os.path.join(path, safe_filename(chapter_name) + ".txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(chapter_text.replace("\xa0", " "))



作者: 天生表演家    时间: 2020-4-28 19:17

 # Novel downloader: scrape a book's chapter index, then save every chapter as a UTF-8 text file.
import os
import re

import requests
from bs4 import BeautifulSoup

# Characters that are not allowed in Windows file names (plus line breaks/tabs).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with the built-in HTML parser.

    Raises ``requests.HTTPError`` on a non-2xx response so a failed download
    is not silently parsed as if it were a chapter.
    """
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def safe_filename(name):
    """Return ``name`` with characters that are illegal in file names replaced by ``_``."""
    return _ILLEGAL_FILENAME_CHARS.sub("_", name).strip()


if __name__ == "__main__":
    server = "https://www.abcxs.com"
    url = "https://www.abcxs.com/book/13417/#main"

    index_page = fetch_soup(url)

    # Book title: taken from the content attribute of the og:title tag.
    title_tag = index_page.find(attrs={"property": "og:title"})
    book_title = title_tag.get("content") if title_tag is not None else None
    if not book_title:
        # Without a title there is no output directory name, so stop here
        # instead of failing later with a NameError.
        raise SystemExit("Nothing found!!")
    print("Book title:", book_title)

    # Output directory: created once, before the chapter loop.
    path = "J:/python/Python/我的Python学习/爬虫及文件写入/" + safe_filename(book_title)
    os.makedirs(path, exist_ok=True)

    # Chapter index: every <a> inside <div class="listmain">.
    catalogue = index_page.find("div", class_="listmain")
    if catalogue is None:
        raise SystemExit("Chapter list (div.listmain) not found")

    for link in catalogue.find_all("a"):
        href = link.get("href")
        chapter_name = link.get_text(strip=True)
        if not href or not chapter_name:
            # Anchor without a target or a title: nothing to download or name.
            continue

        chapter_url = server + href
        print(chapter_name, chapter_url)

        # Chapter body lives in <div class="showtxt">.
        chapter_page = fetch_soup(chapter_url)
        content = chapter_page.find("div", class_="showtxt")
        if content is None:
            print("No chapter text found, skipped:", chapter_url)
            continue

        chapter_text = content.text
        # The site separates paragraphs with runs of 8 non-breaking spaces.
        print(chapter_text.replace("\xa0" * 8, "\n"))

        # Write explicitly as UTF-8: the platform default (GBK on Chinese
        # Windows) cannot encode every character and raised
        # UnicodeEncodeError partway through the download.
        file_path = os.path.join(path, safe_filename(chapter_name) + ".txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(chapter_text.replace("\xa0", " "))



作者: f15007937680    时间: 2021-8-19 22:38
什么鬼阿




欢迎光临 精易论坛 (https://125.confly.eu.org/) Powered by Discuz! X3.4