标题: 爬今日头条高清小姐姐图集 [打印本页] 作者: ideologism 时间: 2019-1-1 12:57 标题: 爬今日头条高清小姐姐图集 from hashlib import md5 from bs4 import BeautifulSoup from urllib.parse import urlencode import requests from requests.exceptions import RequestException import json import re import os from multiprocessing import Pool
def parse_page_index(html):
    """Yield every article URL from a Toutiao search-API JSON response.

    Args:
        html: raw JSON text returned by the index (search_content) request.

    Yields:
        str: each item's 'article_url'; entries without one (ads, plain
        posts) are skipped.
    """
    data = json.loads(html)
    # The API wraps results in a top-level 'data' list; it is absent on
    # error responses, in which case nothing is yielded.
    if 'data' in data:
        for item in data.get('data'):
            url = item.get('article_url')
            if url is not None:  # was `== None` — identity check is correct for None
                yield url
def get_page_detail(url):
    """Fetch an article (detail) page and return its HTML text.

    Args:
        url: absolute URL of the article page.

    Returns:
        The response body as text on HTTP 200; otherwise None.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        # BUGFIX: the request itself must sit inside the try — requests.get
        # is what raises RequestException, but the original performed it
        # before entering the try, so the handler was unreachable.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except RequestException:
        print('详情页页不存在')
def download_images(url):
    """Fetch one image URL and hand its bytes to save_images.

    NOTE(review): the original post calls download_images but never defines
    it anywhere in the file (NameError as soon as a gallery is found); this
    is the conventional reconstruction — fetch the bytes, then persist them.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_images(response.content)
    except RequestException:
        print('图片下载失败')

def parse_page_detail(html):
    """Extract the gallery image URLs embedded in an article page and
    download each one.

    The page embeds its gallery as ``gallery: JSON.parse("...")``; the
    escaped JSON is captured, un-escaped, parsed, and every
    ``sub_images[*].url`` is downloaded.

    Args:
        html: full HTML text of an article (detail) page.
    """
    matches = re.findall(r'gallery:.*?JSON.parse\("(.*?)"\)', html, re.S)
    for raw in matches:
        # Strip the backslash escaping so the captured text is valid JSON.
        raw = re.sub(r'\\', '', raw)
        # Original checked `len(result) == 0` INSIDE the loop — dead code,
        # since the loop body never runs on an empty result. A falsy
        # capture is simply skipped.
        if not raw:
            continue
        data = json.loads(raw)
        # 'count' marks a real gallery payload (as opposed to other
        # JSON.parse blobs on the page).
        if 'count' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_images(image)
        # Dropped the unused soup/title computation: the original built a
        # BeautifulSoup tree per match and never used the result.
def save_images(content, save_dir='E:/爬取的图片/'):
    """Persist image bytes to disk, named by their MD5 digest.

    Args:
        content: raw image bytes.
        save_dir: destination directory (default kept from the original
            hard-coded path for backward compatibility).

    The MD5-based filename deduplicates: an already-present file is not
    rewritten.
    """
    # Create the directory on first use — the original crashed with
    # FileNotFoundError if it did not already exist.
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
        # Removed the explicit f.close(): the with-statement already
        # closes the file; calling close() inside the block was redundant.
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as JSON text.

    NOTE(review): main() calls get_page_index but the post never defines
    it (NameError on first call). Reconstructed from the standard
    search_content pattern — the otherwise-unused urlencode import at the
    top of the file is the evidence it was meant to exist.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    # Reuse the detail fetcher: same headers and error handling apply.
    return get_page_detail(url)

def main(offset, keyword):
    """Crawl one result page: list its article URLs, then parse each article.

    Args:
        offset: paging offset passed to the search API (multiples of 20).
        keyword: search term to crawl.
    """
    html = get_page_index(offset, keyword)
    # Guard against a failed index fetch — the original fed None straight
    # into json.loads via parse_page_index.
    if not html:
        return
    for url in parse_page_index(html):
        detail_html = get_page_detail(url)
        if detail_html:
            parse_page_detail(detail_html)
if __name__ == '__main__':
pool = Pool()
keyword = input('keyword:') for i in range(20):
offset=i*20
main(offset,keyword) 作者: ideologism 时间: 2019-1-1 12:58
加油加油,继续学习,望大佬勿喷 作者: wangxd 时间: 2019-1-1 13:03
加油加油,继续学习,望大佬勿喷 作者: 汉族 时间: 2019-1-1 14:29
易语言的话,可以用多线程 不要命的爬 作者: ideologism 时间: 2019-1-1 14:30