A multi-process + multi-thread crawler: scraping landscape images from 站长之家 (sc.chinaz.com)
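
The crawler is split into two cooperating processes connected by a pair of `multiprocessing.Queue`s: a download process that fetches listing pages, and a parse process that extracts an image record from each of them. Each process hands its per-page work to short-lived worker threads, the parser feeds newly discovered listing URLs back into the download queue, and results are appended to an openpyxl workbook that is saved as an `.xlsx` file once the queues have been idle for 30 seconds. First, the shared setup: HTTP headers, the item counter, and the workbook.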
```python
#!/usr/bin/env python
# coding=utf-8
"""
A multi-process + multi-thread crawler that scrapes landscape images
from 站长之家 (sc.chinaz.com).
"""
import time
import uuid
from multiprocessing import Queue, Process
from queue import Empty
from threading import Thread, Lock

import requests
from lxml import etree
from openpyxl import Workbook

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
}
# Item counter, shared by the parse threads and guarded by ws_lock below
num = 0
# Lock serializing access to the counter and the worksheet
ws_lock = Lock()
# Workbook that collects the results
wb = Workbook()
ws = wb.active
ws.title = 'images'
# Header row
ws.append(['id', 'name', 'image_url'])
```
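`DownloadThread` wraps a single HTTP GET; both processes reuse it, the downloader for listing pages and the parser for image detail pages.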
```python
class DownloadThread(Thread):
    """
    Download worker thread: fetches one URL and keeps the response body.
    """
    def __init__(self, url, name):
        super().__init__()
        self.url = url
        self.show_name = name
        self.content = None

    def run(self):
        print("Downloading %s: %s" % (self.show_name, self.url))
        try:
            # A timeout keeps a stalled connection from hanging the worker
            resp = requests.get(self.url, headers=headers, timeout=15)
        except requests.RequestException as e:
            print("Request failed:", e)
            return
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            self.content = resp.text
            print("%s downloaded" % self.show_name)
        else:
            print("Page error, status code:", resp.status_code)

    def get_content(self):
        # None if the download failed; callers must handle that case
        return self.content
```
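`DownloadProcess` consumes URLs from the download queue and forwards each page's HTML, together with its URL, to the parse queue. It exits once no URL has arrived for 30 seconds.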
```python
class DownloadProcess(Process):
    """
    Download process: pulls URLs off the download queue, fetches each page
    in a worker thread, and pushes the HTML onto the parse queue.
    """
    def __init__(self, url_q, html_q):
        super().__init__()
        self.url_q: Queue = url_q    # download queue
        self.html_q: Queue = html_q  # parse queue

    def run(self):
        while True:
            try:
                url = self.url_q.get(timeout=30)
            except Empty:
                # No new URL for 30 seconds: assume the crawl is done
                break
            # Fetch the listing page in a worker thread
            t = DownloadThread(url, name="listing page")
            t.start()
            t.join()
            html = t.get_content()
            if html:  # skip pages that failed to download
                self.html_q.put((url, html))
        print("-- DownloadProcess finished --")
```
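`ParseThread` does the actual scraping. For every item on a listing page it follows the link to the detail page to get the full-size image URL, falls back to the lazy-loaded thumbnail (`src2`, then `src`) when the detail page cannot be fetched, and finally pushes the next listing page back into the download queue.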
```python
class ParseThread(Thread):
    """
    Parse worker thread: extracts image items from one listing page and
    queues the next listing page for download.
    """
    def __init__(self, url_q, html, base_url):
        super().__init__()
        self.url_q: Queue = url_q
        self.html = html  # page source as a string, not a queue
        self.base_url = base_url

    def run(self):
        global num
        html = etree.HTML(self.html)
        page = html.xpath('//div[@class="fenye"]/a[@class="active"]/b/text()')[0]
        print("Parse thread started")

        # Strategy 1: follow each item's detail page to get the full-size
        # image; fall back to the thumbnail if the detail page is missing.
        images = html.xpath('//div[@id="container"]/div[contains(@class,"picblock")]/div/a')
        for img in images:
            item = {}
            item['id'] = uuid.uuid4().hex
            item['name'] = img.xpath('./@alt')[0]
            # The link target is the detail page of the full-size image
            img_url = img.xpath('./@href')[0]
            # Fetch the detail page in a worker thread
            t = DownloadThread(img_url, name="image")
            t.start()
            t.join()
            # May be None if the download failed; fall back to the thumbnail
            img_html = t.get_content()
            if img_html:
                img_html = etree.HTML(img_html)
                item['cover'] = img_html.xpath('//div[@class="imga"]/a/@href')[0]
            else:
                try:
                    # src2 holds the lazy-loaded thumbnail URL
                    item['cover'] = img.xpath('./img/@src2')[0]
                except IndexError:
                    item['cover'] = img.xpath('./img/@src')[0]
            # The counter and worksheet are shared across parse threads,
            # so serialize access with the lock
            with ws_lock:
                num += 1
                print("Items collected: %s, current page: %s" % (num, page))
                print(item)
                ws.append([item['id'], item['name'], item['cover']])

        # Strategy 2 (alternative): take the thumbnails straight from the
        # listing page; much faster, but only yields small images.
        # images = html.xpath('//div[@id="container"]/div[contains(@class,"picblock")]//img')
        # for img in images:
        #     item = {}
        #     item['id'] = uuid.uuid4().hex
        #     item['name'] = img.xpath('./@alt')[0]
        #     try:
        #         item['cover'] = img.xpath('./@src2')[0]
        #     except IndexError:
        #         item['cover'] = img.xpath('./@src')[0]
        #     print(item)

        try:
            # Queue the next listing page; raises IndexError on the last page
            get_next_url = html.xpath('//div[@class="fenye"]/a[@class="nextpage"]/@href')[0]
            if get_next_url.startswith("http"):
                next_url = get_next_url
            else:
                next_url = self.base_url + get_next_url
            self.url_q.put(next_url)
        except IndexError:
            pass
```
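Since the XPath expressions are the most fragile part of the scraper, it helps to sanity-check them against a handcrafted snippet before running the whole pipeline. The markup below is a minimal sketch inferred from the selectors used in `ParseThread`, not a copy of the live page, so treat its structure as an assumption:

```python
from lxml import etree

# Hypothetical snippet mimicking the structure the XPaths above expect
snippet = """
<div id="container">
  <div class="picblock left">
    <div><a alt="Mountain Lake" href="/tupian/123.htm">
      <img alt="Mountain Lake" src2="http://pic.example/thumb/123_s.jpg"/>
    </a></div>
  </div>
</div>
<div class="fenye">
  <a class="active"><b>1</b></a>
  <a class="nextpage" href="fengjingtupian_2.html">next</a>
</div>
"""

html = etree.HTML(snippet)
links = html.xpath('//div[@id="container"]/div[contains(@class,"picblock")]/div/a')
assert links[0].xpath('./@alt')[0] == "Mountain Lake"
assert links[0].xpath('./img/@src2')[0].endswith("_s.jpg")
assert html.xpath('//div[@class="fenye"]/a[@class="nextpage"]/@href')[0] == "fengjingtupian_2.html"
print("selectors OK")
```

`ParseProcess` drives these workers: it pops downloaded pages off the parse queue, spawns one `ParseThread` per page, and, once the queue has been idle for 30 seconds, joins the workers and saves the workbook. The rows are appended and the file is saved inside this same child process, so the workbook never has to cross a process boundary.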
```python
class ParseProcess(Process):
    """
    Parse process: pulls downloaded pages off the parse queue, hands each
    one to a parse worker thread, then saves the workbook.
    """
    def __init__(self, url_q, html_q):
        super().__init__()
        self.url_q: Queue = url_q
        self.html_q: Queue = html_q

    def run(self):
        threads = []
        while True:
            try:
                # Wait for the next parse task
                url, html = self.html_q.get(timeout=30)
            except Empty:
                # No new page for 30 seconds: assume the crawl is done
                break
            print("Spawning parse thread for", url)
            # Derive the base URL, e.g. 'http://sc.chinaz.com/tupian/'
            base_url = url[:url.rindex('/') + 1]
            t = ParseThread(self.url_q, html, base_url)
            t.start()
            threads.append(t)
        # Let any in-flight parse threads finish before saving
        for t in threads:
            t.join()
        print("-*- ParseProcess finished -*-")
        file_end_name = time.strftime("%Y-%m-%d", time.localtime())
        wb.save("images_" + file_end_name + ".xlsx")
        print("Workbook saved")
```
```python
if __name__ == '__main__':
    task1 = Queue()  # download queue
    task2 = Queue()  # parse queue

    # Seed the crawl with the first listing page
    task1.put('http://sc.chinaz.com/tupian/fengjingtupian.html')
    p1 = DownloadProcess(task1, task2)  # downloader
    p2 = ParseProcess(task1, task2)     # parser

    p1.start()
    p2.start()

    p1.join()
    p2.join()
    print("-- All done --")
```