Fetching weather alerts with Scrapy
qxyj.py:
# -*- coding: utf-8 -*-
import datetime
import json
import time

import scrapy

from QXYJ.items import QxyjItem


class QxyjSpider(scrapy.Spider):
    name = 'qxyj'
    allowed_domains = ['weather.com.cn']
    # start_urls = ['http://www.weather.com.cn/alarm/newalarmlist.shtml?level=3']
    start_urls = ['http://product.weather.com.cn/alarm/grepalarm_cn.php']

    def parse(self, response):
        items = []
        # The endpoint returns a JSONP-style assignment such as
        # "var alarminfo={...};". Take everything after the first "=",
        # strip the trailing ";", and parse the rest as JSON. json.loads
        # handles null natively, so the eval()/null-variable trick is
        # unnecessary (and eval on remote content is unsafe anyway).
        result = json.loads(response.text.strip().split("=", 1)[1].rstrip(";"))
        data = result['data']
        # Today's date, in the two formats used below
        today = datetime.datetime.now().strftime("%Y%m%d")
        today2 = datetime.datetime.now().strftime("%Y-%m-%d")
        # Prefix of the detail-data URL
        start_url = 'http://product.weather.com.cn/alarm/webdata/'
        # Process at most the first 30 entries
        for row in data[:30]:
            # File names look like 101100503-20210507103742-6801.html;
            # the middle segment starts with the publication date
            end_url = row[1]
            pub_time = end_url.split("-")[1][:8]
            if pub_time == today:
                # The public page
                # http://www.weather.com.cn/alarm/newalarmcontent.shtml?file=1011201-20210507091026-0501.html
                # is rendered by JavaScript, so fetch the raw data file instead
                item = QxyjItem()
                item['news_url'] = start_url + end_url
                item['pub_time'] = today2
                items.append(item)

        for item in items:
            # Crude throttle; time.sleep blocks the reactor, so Scrapy's
            # DOWNLOAD_DELAY setting is the idiomatic alternative
            time.sleep(1)
            yield scrapy.Request(url=item['news_url'], meta={'meta_1': item},
                                 callback=self.parse_news)

    def parse_news(self, response):
        item = QxyjItem()
        meta_1 = response.meta['meta_1']
        # The detail file is also a "var ...={...}" payload
        result = json.loads(response.text.strip().split("=", 1)[1].rstrip(";"))
        # Wrap the body in an inline-styled span; single quotes inside the
        # style attribute keep the HTML well-formed
        start_content = ('<span style="color:#222222;'
                         "font-family:'Microsoft YaHei';"
                         'white-space:pre-wrap;font-size:16px;">')
        end_content = '</span>'
        news_title = result['head']
        content = result['ISSUECONTENT']
        key_word = result['SIGNALTYPE']
        item['pub_time'] = meta_1['pub_time']
        item['news_title'] = news_title
        item['source'] = '中国天气网'  # China Weather Network
        item['key_word'] = key_word
        item['is_pub'] = '否'  # "no": not uploaded yet
        item['content'] = start_content + content + end_content
        # Use a truncated lead for long articles
        if len(content) > 150:
            item['news_guide'] = content[:100] + "......"
        else:
            item['news_guide'] = content
        yield item

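Both the list and the detail endpoints return a JavaScript assignment rather than plain JSON, which is why the spider splits on "=" before parsing. A minimal standalone sketch of that step (the sample payload below is illustrative, not a real response):

import json

# Hypothetical payload in the shape the alarm endpoints return:
# a JS variable assignment whose right-hand side is valid JSON
raw = 'var alarminfo={"data":[["北京市","101010100-20210507103742-0501.html"]],"count":1};'

# Drop everything up to the first "=", strip the trailing ";",
# then parse the remainder as ordinary JSON
payload = json.loads(raw.split("=", 1)[1].rstrip(";"))
print(payload["data"][0][1])  # -> 101010100-20210507103742-0501.html
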
items.py:
import scrapy


class QxyjItem(scrapy.Item):
    # define the fields for your item here like:
    # Title
    news_title = scrapy.Field()
    # Source
    source = scrapy.Field()
    # Keyword (alert signal type)
    key_word = scrapy.Field()
    # Article lead
    news_guide = scrapy.Field()
    # Body text
    content = scrapy.Field()
    # Article URL
    news_url = scrapy.Field()
    # Publication time
    pub_time = scrapy.Field()
    # Upload status
    is_pub = scrapy.Field()
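scrapy.Item instances behave like restricted dicts: only declared fields may be assigned, which catches typos early. A quick illustration (the field value is made up):

from QXYJ.items import QxyjItem

item = QxyjItem()
item['news_title'] = '暴雨蓝色预警'  # declared field: OK
# item['title'] = '...'            # undeclared field would raise KeyError
print(dict(item))                  # items convert cleanly to plain dicts
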
pipelines.py:
import json
import time

from openpyxl import Workbook


# JSON encoder subclass that serializes bytes values as UTF-8 strings
# (not used by the Excel pipeline below, but handy for a JSON export)
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


class QxyjPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row: title, source, keyword, lead, body, publish date, uploaded?
        self.ws.append(['标题', '来源', '关键字', '文章导读', '正文', '发布时间', '是否上传'])

    def process_item(self, item, spider):
        text = [item['news_title'], item['source'], item['key_word'], item['news_guide'],
                item['content'], item['pub_time'], item['is_pub']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        # Save the workbook with today's date in the file name
        time_file = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save('qxyj' + time_file + '.xlsx')
        print("数据处理完毕,谢谢使用!")  # "Done processing, thanks for using!"
settings.py:
ITEM_PIPELINES = {
    'QXYJ.pipelines.QxyjPipeline': 300,
}

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Optionally write the log to a local file
LOG_FILE = "qxyj.log"
LOG_LEVEL = "DEBUG"
# Redirect print() output into the log as well
LOG_STDOUT = True

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
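
With these settings in place the spider runs from the project root with: scrapy crawl qxyj (the name defined on QxyjSpider). Note that Scrapy's DOWNLOAD_DELAY setting is the usual replacement for the time.sleep(1) call in the spider. Alternatively, a small runner script works too (a sketch; assumes it sits next to scrapy.cfg):

# run.py -- launch the spider without the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('qxyj')  # spider name from QxyjSpider.name
process.start()        # blocks until the crawl finishes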