Commit 160a785

committed
Commit code
1 parent 3267a89 commit 160a785

File tree

3 files changed: +174 −0 lines changed

xianhuan/README.md

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,8 @@ Python技术 official-account article code repository
## Example code

+[Amazing tool! Finish a large crawler project in five minutes!](https://github.com/JustDoPython/python-examples/tree/master/xianhuan/airspider): Amazing tool! Finish a large crawler project in five minutes!
+
[Whoa! A few lines of code take down a whole website!](https://github.com/JustDoPython/python-examples/tree/master/xianhuan/gengif): Whoa! A few lines of code take down a whole website!

[Awesome! Build her a personal signature generator with Python](https://github.com/JustDoPython/python-examples/tree/master/xianhuan/artname): Awesome! Build her a personal signature generator with Python
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
"""
Created on 2021-09-14 15:07:49
---------
@summary: crawl East Money research-report data and save it to MySQL
---------
@author: 闲欢
"""

import json

import feapder
from feapder.db.mysqldb import MysqlDB


class ReportSpider(feapder.AirSpider):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db = MysqlDB()

    def start_requests(self):
        yield feapder.Request("http://reportapi.eastmoney.com/report/list?cb=datatable1351846&industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime=2021-09-13&endTime=2021-09-14&pageNo=1&fields=&qType=0&orgCode=&code=*&rcode=&p=2&pageNum=2&_=1603724062679",
                              callback=self.parse_report_info, pageNo=1)

    def parse_report_info(self, request, response):
        print(request.pageNo)
        html = response.content.decode("utf-8")
        if len(html):
            # the endpoint returns JSONP; strip the "datatable1351846(...)" wrapper before parsing
            content = html.replace('datatable1351846(', '')[:-1]
            content_json = json.loads(content)
            print(content_json)
            self.save_data(content_json)

    def save_data(self, items):
        result_list = []
        for obj in items['data']:
            result = {}
            result['title'] = obj['title']  # report title
            result['stockName'] = obj['stockName']  # stock name
            result['stockCode'] = obj['stockCode']  # stock code
            result['orgCode'] = obj['orgCode']  # institution code
            result['orgName'] = obj['orgName']  # institution name
            result['orgSName'] = obj['orgSName']  # institution short name
            result['publishDate'] = obj['publishDate']  # publish date
            result['predictNextTwoYearEps'] = obj['predictNextTwoYearEps']  # EPS forecast, year after next
            result['predictNextTwoYearPe'] = obj['predictNextTwoYearPe']  # P/E forecast, year after next
            result['predictNextYearEps'] = obj['predictNextYearEps']  # EPS forecast, next year
            result['predictNextYearPe'] = obj['predictNextYearPe']  # P/E forecast, next year
            result['predictThisYearEps'] = obj['predictThisYearEps']  # EPS forecast, this year
            result['predictThisYearPe'] = obj['predictThisYearPe']  # P/E forecast, this year
            result['indvInduCode'] = obj['indvInduCode']  # industry code
            result['indvInduName'] = obj['indvInduName']  # industry name
            result['lastEmRatingName'] = obj['lastEmRatingName']  # previous rating name
            result['lastEmRatingValue'] = obj['lastEmRatingValue']  # previous rating code
            result['emRatingValue'] = obj['emRatingValue']  # rating code
            result['emRatingName'] = obj['emRatingName']  # rating name
            result['ratingChange'] = obj['ratingChange']  # rating change
            result['researcher'] = obj['researcher']  # researcher
            result['encodeUrl'] = obj['encodeUrl']  # report link
            result['count'] = int(obj['count'])  # number of reports on this stock in the past month

            result_list.append(result)

        self.insertdb(result_list)

        return result_list

    def download_midware(self, request):
        request.headers = {
            "Connection": "keep-alive",
            "Cookie": "qgqp_b_id=0f1ac887e1e3e484715bf0e3f148dbd8; intellpositionL=1182.07px; st_si=32385320684787; st_asi=delete; cowCookie=true; intellpositionT=741px; st_pvi=73966577539485; st_sp=2021-03-22%2009%3A25%3A40; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=4; st_psi=20210914160650551-113300303753-3491653988",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36",
            "Host": "reportapi.eastmoney.com"
        }
        return request

    def validate(self, request, response):
        if response.status_code != 200:
            raise Exception("response code not 200")  # raising here triggers a retry

    def insertdb(self, data_list):
        attrs = ['title', 'stockName', 'stockCode', 'orgCode', 'orgName', 'orgSName', 'publishDate', 'predictNextTwoYearEps',
                 'predictNextTwoYearPe', 'predictNextYearEps', 'predictNextYearPe', 'predictThisYearEps', 'predictThisYearPe',
                 'indvInduCode', 'indvInduName', 'lastEmRatingName', 'lastEmRatingValue', 'emRatingValue',
                 'emRatingName', 'ratingChange', 'researcher', 'encodeUrl', 'count']
        insert_tuple = []
        for obj in data_list:
            # build each row in the same column order as attrs
            insert_tuple.append(tuple(obj[attr] for attr in attrs))
        values_sql = ['%s' for v in attrs]
        attrs_sql = '(' + ','.join(attrs) + ')'
        values_sql = ' values(' + ','.join(values_sql) + ')'
        sql = 'insert into %s' % 'report'
        sql = sql + attrs_sql + values_sql

        self.db.add_batch(sql, insert_tuple)


if __name__ == "__main__":
    ReportSpider().start()
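
Note: insertdb assumes a `report` table already exists in the MySQL database configured in setting.py. The commit does not include the schema, so below is a minimal sketch of a compatible table, created here with pymysql; only the column names come from the spider, while the types, lengths, and connection values are assumptions.

# create_table.py -- a hedged sketch of the `report` table insertdb expects.
# Column names match the attrs list in the spider; types are guesses.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS report (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    stockName VARCHAR(64),
    stockCode VARCHAR(16),
    orgCode VARCHAR(32),
    orgName VARCHAR(128),
    orgSName VARCHAR(64),
    publishDate VARCHAR(32),
    predictNextTwoYearEps VARCHAR(16),
    predictNextTwoYearPe VARCHAR(16),
    predictNextYearEps VARCHAR(16),
    predictNextYearPe VARCHAR(16),
    predictThisYearEps VARCHAR(16),
    predictThisYearPe VARCHAR(16),
    indvInduCode VARCHAR(16),
    indvInduName VARCHAR(64),
    lastEmRatingName VARCHAR(32),
    lastEmRatingValue VARCHAR(8),
    emRatingValue VARCHAR(8),
    emRatingName VARCHAR(32),
    ratingChange VARCHAR(8),
    researcher VARCHAR(128),
    encodeUrl VARCHAR(255),
    count INT
) DEFAULT CHARSET=utf8mb4
"""

# connection values mirror the placeholders in setting.py ("xxx" is not real)
conn = pymysql.connect(host="localhost", port=3306, user="root",
                       password="xxx", database="xxx", charset="utf8mb4")
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()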

xianhuan/airspider/setting.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: 闲欢
"""
import os


# MYSQL
MYSQL_IP = "localhost"
MYSQL_PORT = 3306
MYSQL_DB = "xxx"
MYSQL_USER_NAME = "root"
MYSQL_USER_PASS = "xxx"

# REDIS
# IP:PORT
REDISDB_IP_PORTS = "localhost:6379"
REDISDB_USER_PASS = ""
# 16 databases by default, numbered 0 to 15
REDISDB_DB = 0


# Spider settings
# COLLECTOR
COLLECTOR_SLEEP_TIME = 1  # interval for moving tasks from the task queue into the in-memory queue
COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch

# SPIDER
SPIDER_THREAD_COUNT = 10  # spider concurrency
SPIDER_SLEEP_TIME = [1, 5]  # download interval in seconds; a list such as [2, 5] means a random value between 2 and 5, inclusive
SPIDER_MAX_RETRY_TIMES = 50  # maximum retries per request

# retry failed requests; a request counts as failed once it exceeds the maximum retry count
RETRY_FAILED_REQUESTS = False
# request timeout: a request taking longer than this is redone (this is not the network timeout), in seconds
REQUEST_LOST_TIMEOUT = 600  # 10 minutes
# save failed requests
SAVE_FAILED_REQUEST = True

# Download cache backed by Redis; memory is limited, so use for testing only
RESPONSE_CACHED_ENABLE = False  # enable the download cache; True is recommended for expensive data or data with changing requirements
RESPONSE_CACHED_EXPIRE_TIME = 3600  # cache TTL in seconds
RESPONSE_CACHED_USED = False  # read from the cache; can be set to True when re-collecting data

WARNING_FAILED_COUNT = 1000  # alert once the number of failed tasks exceeds this value

# whether the spider keeps running as a resident process
KEEP_ALIVE = False

# randomize headers
RANDOM_HEADERS = True
# use a requests session
USE_SESSION = False

# Deduplication
ITEM_FILTER_ENABLE = False  # item dedup
REQUEST_FILTER_ENABLE = False  # request dedup

LOG_NAME = os.path.basename(os.getcwd())
LOG_PATH = "log/%s.log" % LOG_NAME  # log file path
LOG_LEVEL = "DEBUG"
LOG_COLOR = True  # colored output
LOG_IS_WRITE_TO_CONSOLE = True  # print to console
LOG_IS_WRITE_TO_FILE = False  # write to file
LOG_MODE = "w"  # file write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # maximum size of each log file in bytes
LOG_BACKUP_COUNT = 20  # number of log files to keep
LOG_ENCODING = "utf8"  # log file encoding
OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries
