-
Notifications
You must be signed in to change notification settings - Fork 1
/
analyze_data.py
executable file
·95 lines (83 loc) · 3.37 KB
/
analyze_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import pandas as pd
def get_date_etf_list_from_data(directory='data'):
    """
    Collect the dates and ETF tickers encoded in a directory's CSV file names.

    Only files named exactly ``YYYY-MM-DD_TICKER.csv`` (ticker made of 2 to 6
    capital letters) are considered. Any other entry in the directory, such
    as ``*.csv.zip`` archives or names with extra ``_`` parts, is ignored.

    Parameters
    ----------
    directory : str, optional
        Directory where CSV files are stored. The default is 'data'.

    Returns
    -------
    dates : set of str
        Strings of dates found in file names. Format YYYY-MM-DD.
    etfs : set of str
        Strings of ETF ticker symbols found in file names.
    """
    suffix = '.csv'
    # Raw string so that the backslash reaches the regex engine untouched
    # instead of being an invalid string escape.
    ba_data_name = re.compile(r'[0-9]{4}\-[0-9]{2}\-[0-9]{2}_[A-Z]{2,6}')
    dates = set()
    etfs = set()
    for file_name in os.listdir(directory):
        # Require a real ".csv" extension; a substring test would also accept
        # names such as "2021-01-04_SPY.csv.zip".
        if not file_name.endswith(suffix):
            continue
        stem = file_name[:-len(suffix)]
        # fullmatch guarantees the stem is exactly "<date>_<ticker>", so the
        # split below always yields two parts.
        if ba_data_name.fullmatch(stem):
            date, etf = stem.split('_')
            dates.add(date)
            etfs.add(etf)
    return dates, etfs
def create_and_save_quoated_spread_data(directory='data', sample_frequency=60, ignore_errors=1):
    """
    Combine the quoted spreads of all per-day, per-ETF CSV files into one data frame.

    Every ``<date>_<etf>.csv`` file in `directory` contributes its
    'relative spread' column as the ``(date, etf)`` column of the result.
    The index values are interpreted as seconds and added to
    2021-01-01 09:30. The frame is optionally resampled to
    `sample_frequency`-second means (labelled at the bin midpoint), then
    written to ``quoted_spread.pkl`` and ``quoted_spread.csv.zip`` inside
    `directory`.

    Parameters
    ----------
    directory : str, optional
        Folder containing data to be read. The default is 'data'.
    sample_frequency : int, optional
        Number of seconds of each data lump. ``None`` disables resampling.
        The default is 60.
    ignore_errors : int, optional
        Level of ignoring errors in file reading:
            0 = raise exceptions
            1 = catch and print unavailable files
            2 = catch and pass
        The default is 1.

    Returns
    -------
    quoted_spread : pd.DataFrame or None
        If an exception occurs while re-indexing, resampling or saving,
        returns the data frame of quoted spread data built so far. If no
        exception occurs, returns None.

    Raises
    ------
    FileNotFoundError
        If a data file is missing and `ignore_errors` is 0.
    AttributeError
        If a data file is missing and `ignore_errors` is not 0, 1 or 2.
    """
    dates, etfs = get_date_etf_list_from_data(directory)
    # Sort so that column order and progress output are reproducible
    # regardless of the iteration order of the returned collections.
    dates = sorted(dates)
    etfs = sorted(etfs)
    mi = pd.MultiIndex.from_product([dates, etfs], names=['dates', 'etf'])
    quoted_spread = pd.DataFrame(columns=mi)
    for index, date in enumerate(dates):
        for etf in etfs:
            path = os.path.join(directory, '{}_{}.csv'.format(date, etf))
            try:
                df = pd.read_csv(path, index_col=0)
            except FileNotFoundError:
                if ignore_errors == 0:
                    raise
                if ignore_errors == 1:
                    print("Failed to find file for {} on {}".format(etf, date))
                elif ignore_errors != 2:
                    raise AttributeError("ignore_errors must be 0, 1, 2. Given {}".format(ignore_errors))
                # Nothing was read for this (date, etf): skip the assignment
                # so a frame left over from a previous file is never reused.
                continue
            quoted_spread[(date, etf)] = df['relative spread']
        if index % 10 == 0:
            print('finished {}/{} dates'.format(index, len(dates)))
    try:
        basetime = pd.to_datetime('2021-01-01') + pd.Timedelta(hours=9, minutes=30)
        timedeltas = pd.TimedeltaIndex([pd.Timedelta(seconds=x) for x in quoted_spread.index])
        quoted_spread.index = basetime + timedeltas
        if sample_frequency is not None:
            resample_str = '{}s'.format(sample_frequency)
            quoted_spread = quoted_spread.resample(resample_str).mean()
            # Label each bin by its midpoint instead of its left edge.
            quoted_spread.index = quoted_spread.index + pd.Timedelta(seconds=sample_frequency / 2)
        quoted_spread.to_pickle(os.path.join(directory, 'quoted_spread.pkl'), protocol=4)
        quoted_spread.to_csv(os.path.join(directory, 'quoted_spread.csv.zip'))
    except Exception as error:
        # Best effort: hand the in-memory data back to the caller instead of
        # losing it, but report why post-processing or saving failed.
        print("Failed to post-process or save quoted spread data: {!r}".format(error))
        return quoted_spread
    return None
if __name__ == '__main__':
    # Script entry point: run the aggregation with its default arguments.
    create_and_save_quoated_spread_data()