An nhentai crawler I wrote myself.
At the moment it only crawls all of the images from a URL in a single thread (a sketch of a multi-threaded variant is appended after the script).

# -*- coding: utf-8 -*-
import requests
import os
from bs4 import BeautifulSoup

def mkdir(path):
    # Strip leading and trailing whitespace
    path = path.strip()
    # Strip a trailing \ character
    path = path.rstrip("\\")

    # Check whether the path already exists:
    # exists        True
    # doesn't exist False
    isExists = os.path.exists(path)
    # Act on the result
    if not isExists:
        os.makedirs(path)
        print(path + ' created successfully')
        return True
    else:
        print(path + ' directory already exists')
        return False
 
def trans(p):
    # replace() swaps the old substring for the new one; here we strip
    # characters that are not allowed in Windows file names
    p = p.replace('|', '')
    p = p.replace('?', '')
    p = p.replace('*', '')
    p = p.replace('<', '')
    p = p.replace('>', '')
    p = p.replace('/', '')
    p = p.replace('\\', '')
    p = p.replace('"', '')
    p = p.replace(':', '')
    return p

headers = {'user-agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

url="https://nhentai.net/search/?q=doujinshi+full+color"

resp = requests.get(url=url, headers=headers)
resp.encoding = 'UTF-8'
soup = BeautifulSoup(resp.text, 'html.parser')
# Grab the link to each gallery's cover page from the search results
for news in soup.select('.gallery'):
    a = news.select('a')[0]['href']     # href of the <a> tag inside the .gallery element
    url1 = "https://nhentai.net" + a
    
    resp1 = requests.get(url=url1, headers=headers)
    resp1.encoding = 'UTF-8'
    soup1 = BeautifulSoup(resp1.text, 'html.parser')
    # Handle the title
    try:
        dirname = soup1.select('#info h2')[0].text
    except IndexError:
        dirname = soup1.select('#info h1')[0].text
#        print("Chinese title not found, using the English title instead")

#    print(dirname)
    dirname=trans(dirname)
    mkpath="E:\\nhentai\\" + dirname + "\\"
    mkdir(mkpath)
    for news1 in soup1.select('.thumb-container'):
        b = news1.select('a')[0]['href']   # link to the per-image page
        img = "https://nhentai.net" + b    # build the absolute URL

        resp2 = requests.get(url=img, headers=headers)
        resp2.encoding = 'UTF-8'
        soup2 = BeautifulSoup(resp2.text, 'html.parser')
        c = soup2.select('.fit-horizontal')[0]['src']   # full-size image URL
        name = c.split("/")[-1]                         # file name taken from the URL
        r = requests.get(url=c, headers=headers)
        
        with open(mkpath + name,'wb') as f:
            f.write(r.content)
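
The script above fetches and saves every image one after another, which is the single-thread limitation mentioned at the top. Below is a minimal sketch of how the per-image downloads could be pushed into a thread pool instead, assuming the same headers dict defined earlier; download_page and MAX_WORKERS are hypothetical names introduced only for this example, and the commented-out usage shows how it could slot into the per-gallery loop.

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

MAX_WORKERS = 4  # number of concurrent downloads (hypothetical value, tune as needed)

def download_page(page_url, save_dir):
    # Fetch one per-image page, pull the full-size image URL, and save the file
    resp = requests.get(url=page_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'html.parser')
    src = soup.select('.fit-horizontal')[0]['src']   # full-size image URL
    name = src.split("/")[-1]                        # file name taken from the URL
    r = requests.get(url=src, headers=headers)
    with open(save_dir + name, 'wb') as f:
        f.write(r.content)

# Usage sketch: collect the per-image page links first, then hand them to the pool.
# page_urls = ["https://nhentai.net" + n.select('a')[0]['href']
#              for n in soup1.select('.thumb-container')]
# with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
#     for page_url in page_urls:
#         pool.submit(download_page, page_url, mkpath)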