-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbenchmark.py
194 lines (158 loc) · 7.92 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Run this in Terminal (MacOS):
# python3 benchmark.py --feature_folder /Documents/features --target pgm --year 2018 2019 2020 2021 --benchmark_name boot --month_lag 3 --save_folder_path /Documents/features
import argparse
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from pathlib import Path
import pyarrow.compute as pac
import yaml
import datetime
from dateutil.relativedelta import relativedelta
import os
def date_to_month_id(date: datetime.date):
"""
Convert a date to a corresponding month ID starting from January 1980.
Args:
date (datetime.date): The date to convert.
Returns:
int: The month ID.
"""
views_start_date = datetime.date(1980, 1, 1)
r = relativedelta(date, views_start_date)
return (r.years * 12) + r.months + 1
def main():
parser = argparse.ArgumentParser(description="Global Bootstrap Benchmark Script")
parser.add_argument("--feature_folder", type=str, help="Path to the feature folder")
parser.add_argument("--target", type=str, choices=["pgm", "cm"], help="Target type (pgm or cm)")
parser.add_argument("--years", type=int, nargs='+', help="List of years")
parser.add_argument("--benchmark_name", type=str, help="boot - Bootstrap, last - last historical")
parser.add_argument("--month_lag", type=int, help="Specify the month lag for prediction")
parser.add_argument("--save_folder_path", type=str, help="Specify a folder path name to save parquet files")
args = parser.parse_args()
feature_folder = Path(args.feature_folder)
target = args.target
years = args.years
benchmark_name = args.benchmark_name
month_lag = args.month_lag
save_folder_path = args.save_folder_path
for year in years:
start_date = datetime.date(year = year, month = 1, day = 1)
if benchmark_name == 'boot':
result = global_bootstrap(feature_folder, target, start_date, month_lag, num_samples=1000)
elif benchmark_name == 'last':
result = last_observed_poisson(feature_folder, target, start_date, month_lag, num_samples=1000)
save_results(result, benchmark_name, target, start_date, save_folder_path)
# Do something with the result, e.g., print or save it.
print(result)
def save_results(result, benchmark_name, target, start_date, save_folder_path):
"""
Save benchmark results to Parquet and create a YAML submission details file.
Args:
result (pandas.DataFrame): The benchmark result to be saved.
benchmark_name (str): The name of the benchmark.
target (str): The target type (e.g., 'cm' or 'pgm').
start_date (datetime.date): The start date for the benchmark.
save_folder_path (str): The path to the folder where results will be saved.
"""
# Create a folder with the benchmark name
benchmark_folder = os.path.join(save_folder_path, benchmark_name)
os.makedirs(benchmark_folder, exist_ok=True)
year = start_date.year
# Create a folder for the year within the unit folder (cm or pgm)
unit_folder = os.path.join(benchmark_folder, target)
os.makedirs(unit_folder, exist_ok=True)
# Create a folder for the year within the unit folder
year_folder = os.path.join(unit_folder, f'window=Y{year}')
os.makedirs(year_folder, exist_ok=True)
# Define the file name
file_name = f'bm_{benchmark_name}_{target}_{year}.parquet'
# Save the result to a Parquet file
result.to_parquet(os.path.join(year_folder, file_name))
# Create the YAML file with submission details within the benchmark folder
submission_details = {
"team": "benchmark",
"short_title": f"{benchmark_name} {target} {year}",
"even_shorter_identifier": f"{benchmark_name}_{target}_{year}",
"authors": [
{
"name": "Your Name",
"affil": "Your Affiliation",
"contact": "Your Contact Info"
}
]
}
yaml_file_path = os.path.join(benchmark_folder, "submission_details.yml")
if not os.path.exists(yaml_file_path):
with open(yaml_file_path, 'w') as yaml_file:
yaml.dump(submission_details, yaml_file,default_flow_style=False,sort_keys=False)
def global_bootstrap(feature_folder, target, start_date, month_lag=3, num_samples=1000):
"""
Perform global bootstrap for benchmarking.
Args:
feature_folder (str): The path to the feature folder containing the data.
target (str): The target type, either 'cm' or 'pgm'.
start_date (datetime.date): The start date for the benchmark.
month_lag (int, optional): The month lag for prediction. Default is 3.
num_samples (int, optional): The number of bootstrap samples. Default is 1000.
Returns:
pandas.DataFrame: The benchmark results after global bootstrap.
"""
if target == "pgm":
unit = "priogrid_gid"
elif target == "cm":
unit = "country_id"
else:
raise ValueError('Target must be "pgm" or "cm.')
window_start_month = date_to_month_id(start_date)
last_observed_month_id = pac.field("month_id") == window_start_month - month_lag
last_observed_data = pq.ParquetDataset(feature_folder / target, filters = last_observed_month_id).read(columns = [unit, "ged_sb"]).to_pandas()
filter_last_year = (pac.field("month_id") <= window_start_month - month_lag) & (pac.field("month_id") > window_start_month - (12 + month_lag))
pool = pq.ParquetDataset(feature_folder / target, filters=filter_last_year).read(columns=["ged_sb"]).to_pandas()
dfs = []
for month_id in range(window_start_month, window_start_month+12):
df = last_observed_data.copy()
df["month_id"] = month_id
df['outcome'] = np.random.choice(pool["ged_sb"], size=(df.shape[0], num_samples), replace=True).tolist()
df = df.drop(columns = "ged_sb")
df = df.explode('outcome').astype('int32')
df['draw'] = df.groupby(['month_id', unit]).cumcount()
df.set_index(['month_id', unit, 'draw'], inplace=True)
dfs.append(df)
return pd.concat(dfs)
def last_observed_poisson(feature_folder, target, start_date, month_lag=3, num_samples=1000):
"""
Perform last observed Poisson benchmarking.
Args:
feature_folder (str): The path to the feature folder containing the data.
target (str): The target type, either 'cm' or 'pgm'.
start_date (datetime.date): The start date for the benchmark.
month_lag (int, optional): The month lag for prediction. Default is 3.
num_samples (int, optional): The number of Poisson samples to generate. Default is 1000.
Returns:
pandas.DataFrame: The benchmark results after last observed Poisson benchmarking.
"""
if target == "pgm":
unit = "priogrid_gid"
elif target == "cm":
unit = "country_id"
else:
raise ValueError('Target must be "pgm" or "cm.')
window_start_month = date_to_month_id(start_date)
last_observed_month_id = (pac.field("month_id") == window_start_month - month_lag)
last_observed_data = pq.ParquetDataset(feature_folder / target, filters=last_observed_month_id).read(columns=[unit, "ged_sb"]).to_pandas()
dfs = []
for month_id in range(window_start_month, window_start_month+12):
df = last_observed_data.copy()
df["month_id"] = month_id
df["outcome"] = df["ged_sb"].apply(lambda x: np.random.poisson(x, num_samples))
df = df.drop(columns = "ged_sb")
df = df.explode("outcome").astype("int32")
df['draw'] = df.groupby(['month_id', unit]).cumcount()
df.set_index(['month_id', unit, 'draw'], inplace=True)
dfs.append(df)
# Concatenate the copies row-wise
return pd.concat(dfs)
if __name__ == "__main__":
main()
#python3 benchmark.py --feature_folder /Documents/features --target pgm --year 2018 2019 2020 2021 --benchmark_name boot --month_lag 3 --save_folder_path /Documents/features