1. Create the Scrapy project
scrapy startproject qidian
2. Enter the project directory and create a Spider with the genspider command
scrapy genspider wanben qidian.com
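genspider writes a skeleton spider into the spiders folder; depending on the Scrapy version it looks roughly like the sketch below (the parse method is filled in in step 4):
# -*- coding: utf-8 -*-
import scrapy

class WanbenSpider(scrapy.Spider):
    name = 'wanben'
    allowed_domains = ['qidian.com']
    start_urls = ['http://qidian.com/']

    def parse(self, response):
        pass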
3. Define the data to scrape (items.py)
import scrapy

class QidianItem(scrapy.Item):
    # define the fields for your item here like:
    # book title
    book_name = scrapy.Field()
    # page number the book appears on
    num = scrapy.Field()
    # book id
    book_id = scrapy.Field()
    # author
    author = scrapy.Field()
    # introduction / blurb
    intro = scrapy.Field()
    # book URL
    book_url = scrapy.Field()
    # cover image URL
    cover_img_url = scrapy.Field()
4. Write the Spider that extracts the item data (spiders/wanben.py)
# -*- coding: utf-8 -*-
# Scrape info about finished books on Qidian
import scrapy
from ..items import QidianItem

class WanbenSpider(scrapy.Spider):
    name = 'wanben'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/finish']

    def parse(self, response):
        # Collect the pagination links and crawl every result page
        next_page = response.xpath('//*[@id="page-container"]/div/ul/li[*]/a[contains(@class,"lbf-pagination-page")]/@href').extract()
        for page in next_page:
            print("Processing page %s" % (page[-1]))
            yield scrapy.Request(url="https:" + page, callback=self.parse_book)

    def parse_book(self, response):
        # Which result page the books are on (taken from the "page=" query parameter)
        num = response.url.split("=")[-1]
        book_name = response.xpath('//div[@class="book-mid-info"]/h4/a/text()').extract()
        book_id = response.xpath('//div[@class="book-mid-info"]/h4/a/@data-bid').extract()
        author = response.xpath('//div[@class="book-mid-info"]/p[@class="author"]/a[1]/text()').extract()
        intro = response.xpath('//div[@class="book-mid-info"]/p[@class="intro"]/text()').extract()
        # The book URL needs the "https:" prefix added
        book_url = response.xpath('//div[@class="book-mid-info"]/h4/a/@href').extract()
        # The cover URL also needs "https:", and dropping the trailing "150" yields the full-size image
        cover_img_url = response.xpath('//div[@class="book-img-box"]/a/img/@src').extract()
        length = len(book_name)
        for i in range(length):
            item = QidianItem()
            item['num'] = int(num)
            item['book_id'] = book_id[i]
            item['book_name'] = book_name[i]
            item['author'] = author[i]
            item['intro'] = intro[i].strip()
            item['book_url'] = "https:" + book_url[i]
            item['cover_img_url'] = "https:" + cover_img_url[i][:-3]
            yield item
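Before wiring up the pipelines, the XPath expressions above can be checked interactively with scrapy shell (built into Scrapy); a quick sketch, assuming the page loads without extra headers:
scrapy shell "https://www.qidian.com/finish"
>>> response.xpath('//div[@class="book-mid-info"]/h4/a/text()').extract()[:3]
>>> response.xpath('//div[@class="book-img-box"]/a/img/@src').extract()[:3]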
5. Write the pipelines that store the data: download the cover images and save the results to a spreadsheet and to MySQL (pipelines.py)
import os
import time
import scrapy
import pymysql
from openpyxl import Workbook
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings
# Save cover images to the local images folder
class ImagePipeline(ImagesPipeline):
    # Read the image save path configured in settings (IMAGES_STORE: ./images)
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        yield scrapy.Request(item['cover_img_url'])

    def item_completed(self, results, item, info):
        print("Processing cover image for [%s]" % item['book_name'])
        # Make sure the image directory exists
        if not os.path.exists(self.IMAGES_STORE):
            os.makedirs(self.IMAGES_STORE)
        # e.g. ['full/7237c3717f9d3eef185e2d6bad9903e2c6eef810.jpg']
        image_path = [x['path'] for ok, x in results if ok]
        # Rename the image after the book and move it out of the default full/ subfolder into images/
        os.rename(self.IMAGES_STORE + '/' + image_path[0], self.IMAGES_STORE + '/' + item['book_name'] + '.jpg')
        return item

    def close_spider(self, spider):
        print("Image download finished")
        # Remove the full/ folder if it is left empty at the end
        path = self.IMAGES_STORE + '/full'
        if os.path.exists(path) and not os.listdir(path):
            os.rmdir(path)
class MySqlPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Read the MySQL settings from settings.py
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def __init__(self):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                  passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # Create the table on first use
            sql = 'CREATE TABLE IF NOT EXISTS qidian(book_id BIGINT PRIMARY KEY NOT NULL,book_name VARCHAR(100) NOT NULL,' \
                  'num INT(5) COMMENT "page number",author VARCHAR(100),intro TEXT COMMENT "introduction",' \
                  'book_url VARCHAR(200),cover_img_url VARCHAR(200)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;'
            self.cursor.execute(sql)
        except:
            pass
        try:
            self.cursor.execute('SELECT book_id FROM qidian WHERE book_id=%s;', (item['book_id'],))
            switch = self.cursor.fetchone()
            keys, values = zip(*item.items())
            if switch:
                # The book already exists: insert or update via ON DUPLICATE KEY UPDATE
                sql = """INSERT INTO qidian({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)),
                    ','.join(['{}=%s'.format(k) for k in keys])
                )
                self.cursor.execute(sql, values * 2)
            else:
                sql = 'INSERT INTO qidian({}) VALUES ({});'.format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values))
                )
                self.cursor.execute(sql, values)
            self.db.commit()
            return item
        except Exception as e:
            print("Error:", e)
            self.db.rollback()

    def close_spider(self, spider):
        print("Database processing finished!")
        self.cursor.close()
        self.db.close()
class XlsxPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = "qidian finished books"
        # Write the header row
        self.ws.append(['book_id', 'page', 'book_name', 'author', 'intro', 'book_url', 'cover_img_url'])

    def process_item(self, item, spider):
        text = [item['book_id'], item['num'], item['book_name'], item['author'],
                item['intro'], item['book_url'], item['cover_img_url']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        file_date = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save(spider.name + file_date + '.xlsx')
        print("Spreadsheet saved")
6. Configure the settings file (settings.py)
# Save the log to a local file (optional)
LOG_FILE = "qidian.log"
LOG_LEVEL = "DEBUG"
# Redirect print output into the log file as well
LOG_STDOUT = True
# Directory for the downloaded cover images
IMAGES_STORE = './images'
# MySQL configuration
MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = "123456"
MYSQL_DBNAME = "python5"
MYSQL_CHARSET = "utf8mb4"
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
}
ITEM_PIPELINES = {
'qidian.pipelines.XlsxPipeline': 300,
'qidian.pipelines.ImagePipeline': 3,
'qidian.pipelines.MySqlPipeline': 301,
}
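The MySqlPipeline assumes the python5 database already exists. A minimal one-off sketch to create it, assuming the credentials configured above (adjust to your own MySQL setup):
import pymysql

# One-off helper: create the target database before the first crawl
conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="123456", charset="utf8mb4")
conn.cursor().execute("CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARACTER SET utf8mb4;")
conn.close()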
7. With everything configured, start the Spider with the crawl command:
scrapy crawl wanben
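When the crawl finishes, the cover images are under ./images, a dated spreadsheet (wanben&lt;YYYY-MM-DD&gt;.xlsx) is written to the project directory, the records are in the qidian table in MySQL, and the run log is in qidian.log.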

