精易论坛

标题: l视频p虫 [打印本页]

作者: 325332460    时间: 2021-12-10 15:27
标题: l视频p虫
  
import requests
from lxml import etree
import re
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.dummy import Pool
import time
import random
# Scrape one pearvideo.com category listing and collect, for every video on
# it, a target file name and a direct MP4 download address in `urls`.
url = 'https://www.pearvideo.com/category_5'
headtop = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
}
# The listing only links to the individual video pages; the MP4 address has
# to be resolved per video inside the loop below.
adata = requests.get(url=url, headers=headtop, timeout=10).text
tree = etree.HTML(adata)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # List entry without a link or a title: nothing to download.
        continue
    spurl = 'https://www.pearvideo.com/' + hrefs[0]  # link of the video page
    # File name = video title + ".mp4"; characters that are not allowed in
    # file names are replaced so that open() does not fail later on.
    spname = re.sub(r'[\\/:*?"<>|]', '_', titles[0].strip()) + '.mp4'
    # Dropping every non-digit from the page link leaves the video id.
    mp4urlid = re.sub(r'\D', '', spurl)
    # Per the original author's notes, the MP4 address is delivered in the
    # JSON of an Ajax call, and that call only answers when a Referer header
    # pointing at the video page itself is sent.
    headtops = {
        'Referer': spurl,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    }
    # `mrd` is a random number in the browser's request; per the original
    # author's notes its value does not matter, so a fixed one is used.
    mp4url = ('https://www.pearvideo.com/videoStatus.jsp?contId=' + mp4urlid
              + '&mrd=0.6387279085804516')
    mp4urldata = requests.get(url=mp4url, headers=headtops, timeout=10).text
    srcurls = re.findall(r'srcUrl":"(.*?)"}}', mp4urldata)
    if not srcurls:
        print('skipped %s: no srcUrl in the videoStatus response' % spurl)
        continue
    mp4urldownload = srcurls[0]
    # The returned srcUrl is not directly usable: the first segment of its
    # file name has to be replaced by "cont-<video id>", e.g.
    #   .../20211209/1639066911889-11298265-140008-hd.mp4   (wrong)
    #   .../20211209/cont-1747476-11298265-140008-hd.mp4    (correct)
    tails = re.findall(r'-.+mp4', mp4urldownload)            # "-11298265-...-hd.mp4"
    heads = re.findall(r'https.+/20....../', mp4urldownload)  # up to the date folder
    if not tails or not heads:
        print('skipped %s: unexpected srcUrl format %s' % (spurl, mp4urldownload))
        continue
    mp4urldownloadw = tails[0]
    mp4urldownload1 = heads[0] + 'cont-' + mp4urlid + mp4urldownloadw
    # NOTE: the keys carry a leading space because get_pool looks the values
    # up under exactly these names.
    dic = {
        ' name': spname,
        ' url': mp4urldownload1,
    }
    urls.append(dic)
def get_pool(dic):
    """Download one video and save it to disk.

    Args:
        dic: mapping with the download address under ``' url'`` and the
            target file name under ``' name'`` (both keys start with a
            space, exactly as they are looked up below).

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is not stored as a video file.

    The request is sent with the module-level ``headtops`` headers. The body
    is streamed to the file chunk by chunk instead of being held in memory
    as a whole; the number of bytes written is printed at the end.
    """
    url = dic[' url']
    name = dic[' name']
    mp4lenght = 0
    with requests.get(url=url, headers=headtops, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(name, mode='wb') as f:
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
                mp4lenght += len(chunk)
    print(' 总长度%s已下载完成' % (mp4lenght))
# Hand every collected entry of `urls` to get_pool on a pool of four workers
# (Pool here is multiprocessing.dummy.Pool, i.e. thread based).
pool = Pool(processes=4)
pool.map(func=get_pool, iterable=urls)
# No more work will be submitted; wait until the workers have shut down.
pool.close()
pool.join()
#
# https://video.pearvideo.com/mp4/third/20211209/1639066911889-11298265-140008-hd.mp4  -> wrong
# https://video.pearvideo.com/mp4/third/20211209/cont-1747476-11298265-140008-hd.mp4  -> correct


作者: knowledge    时间: 2021-12-10 15:35
感谢分享,学习一下.




欢迎光临 精易论坛 (https://125.confly.eu.org/) Powered by Discuz! X3.4