crawler_yahoo_finance.py — executable file, 90 lines (75 loc), 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/python
import sys
import urllib2
import re
import os
import time
import urllib
import urllib2
import random
import json
from math import log
# output file name: input/stockPrices_raw.json
# json structure: crawl daily price data from yahoo finance
# ticker
# / | \
# open close adjust ...
# / | \
# dates dates dates ...
def calc_finished_ticker():
    """Extract the unique ticker symbols (first CSV column) of
    ./input/news_reuters.csv into ./input/finished.reuters, one per line."""
    cmd = "awk -F',' '{print $1}' ./input/news_reuters.csv | sort | uniq > ./input/finished.reuters"
    os.system(cmd)
def get_stock_Prices():
    """Download daily price history for every ticker listed in
    ./input/finished.reuters, plus the S&P 500 index ('^GSPC'), and
    dump everything to ./input/stockPrices_raw.json.

    Exits early (via sys.exit) if the output file already exists, so a
    completed crawl is never clobbered.
    """
    output = './input/stockPrices_raw.json'
    # Check for existing output BEFORE opening the input file: no point
    # opening (or crashing on a missing) finished.reuters if we bail anyway.
    if os.path.isfile(output):
        sys.exit("Prices data already existed!")

    priceSet = {}
    priceSet['^GSPC'] = repeatDownload('^GSPC')  # download S&P 500 benchmark
    # Fix: context manager closes the input file deterministically
    # (original left the handle open for the process lifetime).
    with open('./input/finished.reuters') as fin:
        for num, line in enumerate(fin):
            ticker = line.strip()
            if not ticker:
                continue  # fix: skip blank lines instead of crawling ''
            print(num, ticker)
            priceSet[ticker] = repeatDownload(ticker)
            # if num > 10: break # for testing purpose
    with open(output, 'w') as outfile:
        json.dump(priceSet, outfile, indent=4)
def repeatDownload(ticker):
    """Fetch the price history for *ticker* via PRICE(), retrying up to
    3 times on error or empty result.

    Returns the dict produced by PRICE(), or an empty dict when every
    attempt fails.  (The original referenced `priceStr` before
    assignment in the all-attempts-failed path — UnboundLocalError.)
    """
    repeat_times = 3  # repeat download for N times
    result = {}  # fix: initialized so total failure returns cleanly
    for attempt in range(repeat_times):
        try:
            # Polite random delay between requests so we don't hammer
            # the server.
            time.sleep(random.uniform(2, 3))
            result = PRICE(ticker)
            if len(result) > 0:  # stop retrying once we have data
                break
        except Exception:  # fix: narrowed from bare except (no longer
            # traps SystemExit / KeyboardInterrupt)
            if attempt == 0:
                print(ticker + " Http error!")
    return result
def PRICE(ticker):
    """Download daily OHLCV history for *ticker* from Yahoo Finance's
    (legacy) chart CSV endpoint.

    Returns a dict keyed by price type ('open', 'high', 'low', 'close',
    'volume', 'adjClose'), each mapping 'YYYY-MM-DD' date strings to
    floats rounded to 2 decimals.
    """
    start_y, start_m, start_d = '2004', '01', '01'  # starting date
    end_y, end_m, end_d = '2999', '12', '01'  # effectively "until now"
    # Construct url.
    # NOTE(review): Yahoo's old chart API treated the month params
    # (a/d) as 0-indexed — confirm '01' really means January here.
    url1 = "http://chart.finance.yahoo.com/table.csv?s=" + ticker
    url2 = "&a=" + start_m + "&b=" + start_d + "&c=" + start_y
    url3 = "&d=" + end_m + "&e=" + end_d + "&f=" + end_y + "&g=d&ignore=.csv"
    # Fetch and split the CSV payload into rows.
    response = urllib2.urlopen(url1 + url2 + url3)
    csv = response.read().split('\n')

    ticker_price = {}
    index = ['open', 'high', 'low', 'close', 'volume', 'adjClose']
    for num, line in enumerate(csv):
        fields = line.strip().split(',')
        # Skip the header row and malformed/short rows.
        if len(fields) < 7 or num == 0:
            continue
        date = fields[0]
        # Only accept rows whose first column looks like YYYY-MM-DD.
        if not re.search(r'^[12]\d{3}-[01]\d-[0123]\d$', date):
            continue
        # Columns 1..6 map to open, high, low, close, volume, adjClose.
        # (Renamed the loop variable: the original shadowed outer `num`.)
        for col, typeName in enumerate(index):
            try:
                value = round(float(fields[col + 1]), 2)
            except ValueError:
                continue  # non-numeric cell: skip just this value
            # Fix: the original's KeyError handler created the sub-dict
            # but DISCARDED the value, silently dropping the first row
            # of every price type; setdefault stores it too.
            ticker_price.setdefault(typeName, {})[date] = value
    return ticker_price
if __name__ == "__main__":
    # Step 1: derive the unique ticker list from the news CSV.
    calc_finished_ticker()
    # Step 2: crawl daily prices for each ticker and write the JSON dump.
    get_stock_Prices()