精易论坛

标题: 爬今日头条高清小姐姐图集 [打印本页]

作者: ideologism 时间: 2019-1-1 12:57
标题: 爬今日头条高清小姐姐图集
from hashlib import md5
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException
import json
import re
import os
from multiprocessing import Pool

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # The request has to be issued inside the try block; otherwise
        # connection errors bypass the RequestException handler entirely.
        reponse = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求页不存在')
        return None
    if reponse.status_code == 200:
        return reponse.text
    return None

def parse_page_index(html):
    """Yield the article URLs contained in a search-result JSON document.

    Args:
        html: JSON text of a search-result page. ``None`` or an empty
            string (a failed request) yields nothing.

    Yields:
        Every non-empty ``article_url`` value found in the entries listed
        under the top-level ``data`` key.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    if not html:
        return
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # ``data`` can be missing or explicitly null; treat both as "no entries".
    for item in data.get('data') or []:
        url = item.get('article_url')
        # Entries without a usable URL are skipped so callers never try to
        # request ``None`` or an empty string.
        if url:
            yield url

def get_page_detail(url, timeout=10):
    """Fetch the HTML of an article detail page.

    The request is sent with a desktop Chrome ``user-agent`` header.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        # Issue the request inside the try block so that connection errors
        # and malformed URLs are actually caught by the handler below.
        reponse = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('详情页页不存在')
        return None
    if reponse.status_code == 200:
        return reponse.text
    return None

def parse_page_detail(html):
    """Download every image of each gallery embedded in an article page.

    The page text is searched for ``gallery: ... JSON.parse("...")``
    snippets. Each captured payload is decoded as JSON and, when it has a
    ``count`` key, every ``url`` listed under ``sub_images`` is handed to
    ``download_images``.

    Args:
        html: HTML text of an article detail page.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a captured payload is not valid JSON once
            its backslashes have been removed.
    """
    # Raw string: ``\(`` is a regex escape, not a Python string escape.
    matches = re.findall(r'gallery:.*?JSON.parse\("(.*?)"\)', html, re.S)
    for raw in matches:
        # The payload is JSON embedded in a JavaScript string literal, so
        # quotes and slashes arrive backslash-escaped; dropping every
        # backslash restores plain JSON text.
        payload = raw.replace('\\', '')
        if not payload:
            continue
        data = json.loads(payload)
        if 'count' not in data:
            continue
        # ``sub_images`` may be absent or null; treat both as an empty gallery.
        for item in data.get('sub_images') or []:
            url = item.get('url')
            if url:
                download_images(url)

def download_images(url, timeout=10):
    """Download one image and store it through ``save_images``.

    The request is sent with a desktop Chrome ``user-agent`` header. The
    image is saved only when the server answers with HTTP 200.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        None.
    """
    print('正在下载....'+url)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        # Issue the request inside the try block so that a failed download
        # is reported instead of aborting the whole crawl.
        reponse = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('详情页页不存在')
        return
    if reponse.status_code == 200:
        save_images(reponse.content)

def save_images(content, directory='E:/爬取的图片/'):
    """Write image bytes to ``<directory>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
        directory: Folder that receives the file. It is created when it
            does not exist yet.

    Returns:
        None.
    """
    if directory:
        # Without this, the very first save fails with FileNotFoundError
        # when the target folder has not been created by hand.
        os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offset, keyword):
    """Crawl one search-result page and download the galleries it links to.

    Args:
        offset: Search-result offset passed to ``get_page_index``.
        keyword: Search keyword passed to ``get_page_index``.

    Returns:
        None.
    """
    index_html = get_page_index(offset, keyword)
    if not index_html:
        # The index request failed or returned a non-200 status; there is
        # nothing to parse for this offset.
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if detail_html:
            parse_page_detail(detail_html)

if __name__ == '__main__':
    # Pages are crawled one after another. No worker pool is created here
    # because nothing was ever submitted to it; an unused Pool only spawns
    # idle processes that are never closed or joined.
    keyword = input('keyword:')
    for i in range(20):
        # Each search page holds 20 results, so page i starts at i * 20.
        main(i * 20, keyword)

作者: ideologism 时间: 2019-1-1 12:58
加油加油，继续学习，望大佬勿喷

作者: wangxd 时间: 2019-1-1 13:03
加油加油，继续学习，望大佬勿喷

作者: 汉族 时间: 2019-1-1 14:29
易语言的话，可以用多线程不要命的爬

作者: ideologism 时间: 2019-1-1 14:30

汉族发表于 2019-1-1 14:29
易语言的话，可以用多线程不要命的爬

这个可以开多进程，只是我没开

作者: cpa软件 时间: 2019-2-11 12:48
pythonpythonpythonpythonpythonpythonpython

欢迎光临精易论坛 (https://125.confly.eu.org/)