精易论坛
标题:
梨视频爬虫
[打印本页]
作者:
325332460
时间:
2021-12-10 15:27
标题:
梨视频爬虫
import requests
from lxml import etree
import re
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.dummy import Pool
import time
import random
# Scrape the pearvideo.com category listing and work out a direct MP4 URL for
# every entry. The results are collected in ``urls`` (a list of
# {'name': ..., 'url': ...} dicts) for the download step that follows.
url = 'https://www.pearvideo.com/category_5'
headtop = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
# The listing page does not carry the MP4 addresses; each video has to be
# resolved individually below.
adata = requests.get(url=url, headers=headtop, timeout=15).text
tree = etree.HTML(adata)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # Entry without a link or a title: nothing to download.
        continue
    spurl = 'https://www.pearvideo.com/' + hrefs[0]  # link to the video page
    spname = titles[0] + '.mp4'  # video title plus file extension
    # The video id is every digit in the page URL (print spurl to see it).
    # The video page itself is not fetched: its HTML is never used.
    mp4urlid = re.sub(r"\D", "", spurl)
    # The MP4 address is delivered in the JSON of an AJAX call. That call
    # only answers properly when a Referer header is sent, and the expected
    # Referer is the video page URL.
    headtops = {'Referer': spurl, **headtop}
    # ``mrd`` is a random float in the site's own request; per the original
    # author's note its value does not matter, so a fixed one is used.
    mp4url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + mp4urlid + '&mrd=0.6387279085804516'
    try:
        mp4urldata = requests.get(url=mp4url, headers=headtops, timeout=15).text
    except requests.RequestException as err:
        print('跳过 %s: %s' % (spurl, err))
        continue
    ex = 'srcUrl":"(.*?)"}}'
    src_match = re.search(ex, mp4urldata)
    if src_match is None:
        print('跳过 %s: 未找到 srcUrl' % spurl)
        continue
    mp4urldownload = src_match.group(1)
    # The returned srcUrl is not directly usable: the segment before the
    # first '-' in the file name must be replaced by 'cont-<video id>'.
    #   wrong:   https://video.pearvideo.com/mp4/third/20211209/1639066911889-11298265-140008-hd.mp4
    #   correct: https://video.pearvideo.com/mp4/third/20211209/cont-1747476-11298265-140008-hd.mp4
    tail_match = re.search(r'-.+mp4', mp4urldownload)
    head_match = re.search(r'https.+/20....../', mp4urldownload)
    if tail_match is None or head_match is None:
        print('跳过 %s: 无法解析 %s' % (spurl, mp4urldownload))
        continue
    mp4urldownloadw = tail_match.group(0)
    mp4urldownload1 = head_match.group(0) + 'cont-' + mp4urlid + mp4urldownloadw
    dic = {
        'name': spname,
        'url': mp4urldownload1,
    }
    urls.append(dic)
def get_pool(dic):
    """Download one video and write it to a file named after its title.

    Used as the worker function for ``Pool.map``.

    Args:
        dic: Mapping with the keys 'url' (direct MP4 address) and 'name'
            (file name to save under).

    Returns:
        None. A failed request is reported on stdout and skipped, so one
        bad video does not abort the whole batch.
    """
    url = dic['url']
    # Titles are taken from the web page and may contain characters that
    # are not allowed in file names; replace them so open() cannot fail.
    name = re.sub(r'[\\/:*?"<>|\r\n]', '_', dic['name'])
    # NOTE(review): ``headtops`` is the module-level dict left over from the
    # scraping loop, so every download sends the Referer of the last video
    # processed. Confirm the video host does not validate it.
    try:
        response = requests.get(url=url, headers=headtops, timeout=60)
        # Without this check an HTTP error page would be saved as the .mp4.
        response.raise_for_status()
    except requests.RequestException as err:
        print('下载失败 %s: %s' % (name, err))
        return
    mp4data = response.content
    mp4lenght = len(mp4data)
    with open(name, mode='wb') as f:
        f.write(mp4data)
    print('总长度%s已下载完成' % (mp4lenght))
# Download all collected videos with four worker threads
# (multiprocessing.dummy.Pool is thread-based).
pool = Pool(4)
try:
    pool.map(get_pool, urls)
finally:
    # Always release the worker threads, even if a download raised.
    pool.close()
    pool.join()
#
# https://video.pearvideo.com/mp4/third/20211209/1639066911889-11298265-140008-hd.mp4  错误
# https://video.pearvideo.com/mp4/third/20211209/cont-1747476-11298265-140008-hd.mp4  正确
import requests
from lxml import etree
import re
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.dummy import Pool
import time
import random
# Scrape the pearvideo.com category listing and work out a direct MP4 URL for
# every entry. The results are collected in ``urls`` (a list of
# {'name': ..., 'url': ...} dicts) for the download step that follows.
url = 'https://www.pearvideo.com/category_5'
headtop = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
# The listing page does not carry the MP4 addresses; each video has to be
# resolved individually below.
adata = requests.get(url=url, headers=headtop, timeout=15).text
tree = etree.HTML(adata)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # Entry without a link or a title: nothing to download.
        continue
    spurl = 'https://www.pearvideo.com/' + hrefs[0]  # link to the video page
    spname = titles[0] + '.mp4'  # video title plus file extension
    # The video id is every digit in the page URL (print spurl to see it).
    # The video page itself is not fetched: its HTML is never used.
    mp4urlid = re.sub(r"\D", "", spurl)
    # The MP4 address is delivered in the JSON of an AJAX call. That call
    # only answers properly when a Referer header is sent, and the expected
    # Referer is the video page URL.
    headtops = {'Referer': spurl, **headtop}
    # ``mrd`` is a random float in the site's own request; per the original
    # author's note its value does not matter, so a fixed one is used.
    mp4url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + mp4urlid + '&mrd=0.6387279085804516'
    try:
        mp4urldata = requests.get(url=mp4url, headers=headtops, timeout=15).text
    except requests.RequestException as err:
        print('跳过 %s: %s' % (spurl, err))
        continue
    ex = 'srcUrl":"(.*?)"}}'
    src_match = re.search(ex, mp4urldata)
    if src_match is None:
        print('跳过 %s: 未找到 srcUrl' % spurl)
        continue
    mp4urldownload = src_match.group(1)
    # The returned srcUrl is not directly usable: the segment before the
    # first '-' in the file name must be replaced by 'cont-<video id>'.
    #   wrong:   https://video.pearvideo.com/mp4/third/20211209/1639066911889-11298265-140008-hd.mp4
    #   correct: https://video.pearvideo.com/mp4/third/20211209/cont-1747476-11298265-140008-hd.mp4
    tail_match = re.search(r'-.+mp4', mp4urldownload)
    head_match = re.search(r'https.+/20....../', mp4urldownload)
    if tail_match is None or head_match is None:
        print('跳过 %s: 无法解析 %s' % (spurl, mp4urldownload))
        continue
    mp4urldownloadw = tail_match.group(0)
    mp4urldownload1 = head_match.group(0) + 'cont-' + mp4urlid + mp4urldownloadw
    dic = {
        'name': spname,
        'url': mp4urldownload1,
    }
    urls.append(dic)
def get_pool(dic):
    """Download one video and write it to a file named after its title.

    Used as the worker function for ``Pool.map``.

    Args:
        dic: Mapping with the keys 'url' (direct MP4 address) and 'name'
            (file name to save under).

    Returns:
        None. A failed request is reported on stdout and skipped, so one
        bad video does not abort the whole batch.
    """
    url = dic['url']
    # Titles are taken from the web page and may contain characters that
    # are not allowed in file names; replace them so open() cannot fail.
    name = re.sub(r'[\\/:*?"<>|\r\n]', '_', dic['name'])
    # NOTE(review): ``headtops`` is the module-level dict left over from the
    # scraping loop, so every download sends the Referer of the last video
    # processed. Confirm the video host does not validate it.
    try:
        response = requests.get(url=url, headers=headtops, timeout=60)
        # Without this check an HTTP error page would be saved as the .mp4.
        response.raise_for_status()
    except requests.RequestException as err:
        print('下载失败 %s: %s' % (name, err))
        return
    mp4data = response.content
    mp4lenght = len(mp4data)
    with open(name, mode='wb') as f:
        f.write(mp4data)
    print('总长度%s已下载完成' % (mp4lenght))
# Download all collected videos with four worker threads
# (multiprocessing.dummy.Pool is thread-based).
pool = Pool(4)
try:
    pool.map(get_pool, urls)
finally:
    # Always release the worker threads, even if a download raised.
    pool.close()
    pool.join()
#
# https://video.pearvideo.com/mp4/third/20211209/1639066911889-11298265-140008-hd.mp4  错误
# https://video.pearvideo.com/mp4/third/20211209/cont-1747476-11298265-140008-hd.mp4  正确
作者:
knowledge
时间:
2021-12-10 15:35
感谢分享,学习一下.
欢迎光临 精易论坛 (https://125.confly.eu.org/)
Powered by Discuz! X3.4