
Fetching weather alerts with Scrapy

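The spider below pulls the national alert index from product.weather.com.cn, keeps only the alerts published on the current day, follows each alert's detail URL, and hands the parsed records to an item pipeline that writes them to an Excel file. The project consists of four files: the spider (qxyj.py), the item definition (items.py), the Excel pipeline (pipelines.py), and the Scrapy settings (settings.py).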

qxyj.py:

# -*- coding: utf-8 -*-
import datetime
import time

import scrapy

from QXYJ.items import QxyjItem


class QxyjSpider(scrapy.Spider):
    name = 'qxyj'
    allowed_domains = ['weather.com.cn']
    # start_urls = ['http://www.weather.com.cn/alarm/newalarmlist.shtml?level=3']
    start_urls = ['http://product.weather.com.cn/alarm/grepalarm_cn.php']

    def parse(self, response):
        items = []
        # The alert index is a JS assignment, not plain JSON: drop the
        # trailing ";" and keep everything after "=" to get the object literal.
        result_ = response.text[:-1]
        result = result_.split("=")[-1]
        # eval() raises NameError on the bare JS null, so bind null first.
        null = ''
        result = eval(result)
        data = result['data']
        length = len(data)
        # Today's date, used to keep only alerts published today.
        today = datetime.datetime.now().strftime("%Y%m%d")
        today2 = datetime.datetime.now().strftime("%Y-%m-%d")
        # Prefix of the detail data URL.
        start_url = 'http://product.weather.com.cn/alarm/webdata/'
        # Look at no more than the first 30 entries.
        for i in range(min(length, 30)):
            item = QxyjItem()
            # e.g. 101100503-20210507103742-6801.html
            end_url = data[i][1]
            # The middle segment of the file name carries the publication date.
            pub_time = end_url.split("-")[1][:8]
            if pub_time == today:
                # The display page
                # http://www.weather.com.cn/alarm/newalarmcontent.shtml?file=...
                # is rendered by JS and cannot be scraped directly, so fetch
                # the underlying webdata file instead.
                item['news_url'] = start_url + end_url
                item['pub_time'] = today2
                items.append(item)
        for item in items:
            time.sleep(1)
            yield scrapy.Request(url=item['news_url'], meta={'meta_1': item},
                                 callback=self.parse_news)

    def parse_news(self, response):
        item = QxyjItem()
        meta_1 = response.meta['meta_1']
        result = response.text
        start_content = ('<span style="color:#222222;font-family:&quot;microsoftyahei&quot;;'
                         'white-space:pre-wrap;font-size:16px;">'
                         '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;')
        end_content = '</span>'
        # The detail file is also a JS assignment; same trick as in parse().
        null = ''
        result = result.split("=")[-1]
        result = eval(result)
        news_title = result['head']
        content = result['ISSUECONTENT']
        key_word = result['SIGNALTYPE']
        item['pub_time'] = meta_1['pub_time']
        item['news_title'] = news_title
        item['source'] = '中国天气网'
        item['key_word'] = key_word
        item['is_pub'] = '否'  # '否' = not yet uploaded
        item['content'] = start_content + content + end_content
        # Use the first 100 characters as the article summary.
        if len(content) > 150:
            item['news_guide'] = content[:100] + "......"
        else:
            item['news_guide'] = content
        yield item
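Both endpoints return a JavaScript assignment rather than plain JSON, which is why the spider splits on "=" and binds null before calling eval(). Below is a minimal standalone sketch of that parsing step, run against a hypothetical payload shaped like the grepalarm_cn.php response (the real feed may carry different fields):

# Minimal sketch of the parsing trick used in parse(), applied to a
# hypothetical payload shaped like the grepalarm_cn.php response.
sample = 'var alarminfo={"data":[["北京市","101100503-20210507103742-6801.html"]],"count":null};'

body = sample[:-1]             # drop the trailing ";"
literal = body.split("=")[-1]  # keep the object literal after "="

# eval() would raise NameError on the bare JS null, so bind it first.
null = ''
result = eval(literal)

end_url = result['data'][0][1]
pub_date = end_url.split("-")[1][:8]
print(pub_date)  # -> 20210507

If the literal happens to be valid JSON (double-quoted keys and strings), json.loads(literal) is a safer drop-in for eval(), since JSON's null maps to Python's None natively.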

items.py:

import scrapy


class QxyjItem(scrapy.Item):
    # define the fields for your item here like:
    # Article title
    news_title = scrapy.Field()
    # Source site
    source = scrapy.Field()
    # Keyword (alert signal type)
    key_word = scrapy.Field()
    # Article summary
    news_guide = scrapy.Field()
    # Body text
    content = scrapy.Field()
    # Article URL
    news_url = scrapy.Field()
    # Publication time
    pub_time = scrapy.Field()
    # Upload status
    is_pub = scrapy.Field()

pipelines.py:

import json
import time

from openpyxl import Workbook


# Bytes-to-str transcoding helper, a subclass of json.JSONEncoder.
# (Not used by the Excel pipeline below; handy if you also dump items as JSON.)
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


class QxyjPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row: title, source, keyword, summary, body, publish time, uploaded
        self.ws.append(['标题', '来源', '关键字', '文章导读', '正文', '发布时间', '是否上传'])

    def process_item(self, item, spider):
        text = [item['news_title'], item['source'], item['key_word'], item['news_guide'],
                item['content'], item['pub_time'], item['is_pub']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        # Save the workbook under a dated file name when the spider closes.
        time_file = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save('qxyj' + time_file + '.xlsx')
        print("Data processing finished.")
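Because the workbook is only saved in close_spider(), a quick way to check the result after a crawl is to read the dated file back with openpyxl. A small sketch, assuming the spider ran today in the current directory:

import time

from openpyxl import load_workbook

# Open today's output file and print each row, header included.
filename = 'qxyj' + time.strftime("%Y-%m-%d", time.localtime()) + '.xlsx'
wb = load_workbook(filename)
for row in wb.active.iter_rows(values_only=True):
    print(row)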

settings.py:

ITEM_PIPELINES = {
    'QXYJ.pipelines.QxyjPipeline': 300,
}

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Optionally write the log to a local file
LOG_FILE = "qxyj.log"
LOG_LEVEL = "DEBUG"
# Redirect print() output into the log as well
LOG_STDOUT = True

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
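With these settings in place, the crawl is started from the project root with scrapy crawl qxyj: the log (including print() output, since LOG_STDOUT is enabled) goes to qxyj.log, and the dated Excel file is written to the working directory when the spider closes.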