# modules.py
import os
import random
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from yaml import safe_load


def get_mapping_dicts(path_to_yaml, url):
    """
    Loads dictionaries for the gap filling and mapping of column names
    from locally stored .yaml files. Returns only the dictionary for
    the specified dataset, as the url is the unique identifier for the
    dataset and is used as a key on the returned dictionary.
    If the unique identifier (key) does not exist, `None` is returned.
    Parameters:
        path_to_yaml (string): path to the yaml file storing the values
            to override values with
        url (string): the source url for the dataset, which is a unique
            identifier
    Returns:
        dict: dict_from_yaml
    """
    with open(path_to_yaml) as file:
        dict_from_yaml = safe_load(file)
    if url not in dict_from_yaml:
        print(f"{url} not in dictionary")
    dict_from_yaml = dict_from_yaml.get(url)
    return dict_from_yaml
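
# Example usage (a sketch; the yaml path and dataset url are hypothetical):
#     overrides = get_mapping_dicts("overrides_dict.yaml",
#                                   "https://example.com/indicator.csv")
#     # overrides is None if the url has no entry in the yaml file
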
def find_csv_urls(url):
    """
    Provided with a data folder URL, this function finds the URLs
    of the CSV files within the folder, yielding the link of each
    matching file in turn.
    Parameters:
        url (string): the URL of the repo/folder which contains
            the CSV files to be captured
    Yields:
        string: the next URL for a CSV file in the remote data folder
    """
    page = requests.get(url)
    soup = bs(page.text, 'html.parser')
    csv_link_pattern = r"\/ONSdigital\/sdg-data\/blob\/develop\/data\/indicator_\d-\d{1,2}-\d{1,2}\.csv"
    to_repl_pattern = r"\/sdg-data\/blob\/develop"
    replacement_pattern = "/sdg-data/develop"
    attrs = {'href': re.compile(csv_link_pattern)}
    # Rewrite each matching "blob" link to its raw-content equivalent
    for link in soup.find_all('a', attrs=attrs):
        link = link.get('href')
        link = re.sub(to_repl_pattern, replacement_pattern, link)
        yield "https://raw.githubusercontent.com" + link
def csvs_to_pandas(url_to_csv):
    """Provided with the URL of a file, the function will check if the CSV
    is populated and, if it is not empty, return a Pandas dataframe of the
    CSV.
    Parameters:
        url_to_csv (string): the URL of a CSV file to be captured
    Returns:
        pd.DataFrame: a Pandas dataframe of the CSV, or None if the
            indicator has no data yet
    """
    if "no data for this indicator yet" in str(bs(requests.get(url_to_csv).text, 'html.parser')):
        return None
    else:
        return pd.read_csv(url_to_csv)
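
# Example usage (a sketch; the raw-content url is hypothetical):
#     df = csvs_to_pandas(
#         "https://raw.githubusercontent.com/ONSdigital/sdg-data/develop/data/indicator_1-1-1.csv")
#     if df is not None:
#         pass  # the indicator has data, so process it
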
def prevent_bad_replacement(overrides_dict, df):
    """Checks that there are no values in the dataframe equal to the
    placeholder values which are used as keys in the overrides dict.
    If any did exist, this would cause a bad replacement. It is very
    unlikely that this is needed, but it is kept as an extra safety step."""
    place_holder_values = ['FILL_NA', 'OldValue1', 'OldValue2', 'OldValue3', 'to']
    for search_value in place_holder_values:
        if search_value in df.values:
            raise Exception(f"Value {search_value} has been found in the dataframe")
    return None
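
# Example usage (a sketch): raises if any placeholder key appears as a real
# value in the data, otherwise returns None.
#     prevent_bad_replacement(overrides, df)
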
def csvsample_to_pandas(path_to_file, pct=1.0):
    """A function to create a sample extract of a csv as a dataframe.
    Parameters:
        path_to_file (string): full path to the csv file
        pct (float): percentage of lines to extract, e.g. 1.0 keeps ~1%
    Returns:
        pd.DataFrame
    """
    p = pct / 100
    # Keep the header, then keep each data row with probability p:
    # if a random draw from the [0,1] interval is greater than p,
    # the row will be skipped
    df = pd.read_csv(
        path_to_file,
        header=0,
        skiprows=lambda i: i > 0 and random.random() > p)
    return df
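
# Example usage (a sketch; the csv path is hypothetical):
#     sample_df = csvsample_to_pandas("data/indicator_1-1-1.csv", pct=5.0)
#     # keeps roughly 5% of the data rows, plus the header
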
def fill_gaps(df, gap_filler_dict):
    """
    Given a Pandas dataframe and a dictionary containing the column names
    and the correct 'fillers' for each column, this function will fill
    each column with the correct values when empty cells are found.
    Parameters:
        df (pd.DataFrame): the dataframe containing the csv data
        gap_filler_dict (dict): a dictionary with column name and value
            to fill gaps as key-value pairs, e.g.
            {"Age": "All", "Sex": "T"}
    Returns:
        pd.DataFrame: a dataframe with all cells full
    """
    # fillna with a dict fills column by column; passing axis=1 here
    # raises NotImplementedError in pandas
    df = df.fillna(value=gap_filler_dict)
    return df
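
# Example usage (a sketch, reusing the dict from the docstring):
#     df = fill_gaps(df, {"Age": "All", "Sex": "T"})
#     # empty cells in "Age" become "All"; empty cells in "Sex" become "T"
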
def standardise_cell_values(pd_df, dict_of_nonstandard_standard):
    """
    Maps non-standard values, e.g. "Males", to standard values like "M".
    Mapping is carried out on a column-specific basis, so the dict should
    be nested in the form {column: {non_standard: standard}}.
    """
    df = pd_df.replace(to_replace=dict_of_nonstandard_standard, value=None)
    return df
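
# Example usage (a sketch; the column and values are hypothetical):
#     df = standardise_cell_values(df, {"Sex": {"Males": "M", "Females": "F"}})
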
def write_csv(df, out_path, filename):
    """
    Converts a Pandas dataframe to CSV and writes it out to a local folder.
    Parameters:
        df (pd.DataFrame): the pandas dataframe of the data
        out_path (string): the path of the local "out" folder
        filename (string): the name of the CSV file to be written
    Returns:
        Boolean: True if written, False if not written
    """
    # If the out_path isn't there, make it
    if not os.path.exists(out_path):
        os.makedirs(out_path, exist_ok=True)
    full_path = os.path.join(out_path, filename)
    try:
        df.to_csv(full_path, index=False)
    except Exception as e:
        print(f"Error encountered when attempting csv write to {full_path}: {e}")
        return False
    return True
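
# Example usage (a sketch; the folder and filename are hypothetical):
#     write_csv(df, "out", "indicator_1-1-1.csv")  # returns True on success
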
def delete_random_values(df, holes=20):
    """
    Smashes holes (creates gaps) in your dataframe, to approximately the
    number that you request (randint might choose the same cell twice).
    This function is just for testing. Note that it mutates the dataframe
    in place.
    Parameters:
        df (pd.DataFrame): the dataframe in which you want gaps to be made
        holes (int): the approximate number of gaps to create
    Returns:
        pd.DataFrame
    """
    for _ in range(holes):
        row = random.randint(0, df.shape[0] - 1)
        col = random.randint(0, df.shape[1] - 1)
        df.iloc[row, col] = float('nan')
    return df
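
# Example usage (a sketch): pass a copy if you need the original intact,
# since the holes are punched in place.
#     df_with_gaps = delete_random_values(df.copy(), holes=10)
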
def override_writer(df, overrides_dict):
    """Takes the dataframe and makes column-specific replacements or overrides.
    If fix_headers is True in the overrides dict (False by default), it will
    change the headers to the names in the overrides dict.
    If standardise_cells is True (default), it will search for each value to
    be replaced and, if found, replace it. If fill_gaps is True (default),
    it will fill any gaps with the replacement value.
    Parameters:
        df (pd.DataFrame): dataframe to be processed
        overrides_dict (dict): the overrides dictionary specific to the
            dataset being processed
    Returns:
        pd.DataFrame: complete with the requested value overrides
    """
    fix_headers = overrides_dict['fix_headers']
    standardise_cells = overrides_dict['standardise_cells']
    fill_gaps = overrides_dict['fill_gaps']
    if fix_headers:
        # not used at the moment
        pass
    if standardise_cells:
        for column in df.columns:
            if column in ['value', 'Value']:
                continue  # skipping because Value is never a key in the dict
            # cast to string for the replacement, then restore the dtype
            orig_dtype = str(df[column].dtype)
            df[column] = df[column].astype(str)
            df[column] = df[column].replace(to_replace=overrides_dict[column])
            df[column] = df[column].astype(orig_dtype)
    if fill_gaps:
        for column in df.columns:
            if column in ['value', 'Value']:
                continue  # skipping because Value is never a key in the dict
            # replace the string 'nan' (produced by the astype(str) cast
            # above) with numpy.nan so that fillna can find the gaps
            df[column] = df[column].replace('nan', np.nan)
            df[column] = df[column].fillna(overrides_dict[column]['FILL_NA'])
    return df
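

# A minimal end-to-end sketch of how these functions might be composed.
# The folder URL and yaml path below are assumptions for illustration,
# not values taken from the project configuration.
if __name__ == "__main__":
    data_folder_url = "https://github.com/ONSdigital/sdg-data/tree/develop/data"  # assumed
    path_to_yaml = "overrides_dict.yaml"  # hypothetical local config

    for csv_url in find_csv_urls(data_folder_url):
        df = csvs_to_pandas(csv_url)
        if df is None:
            continue  # this indicator has no data yet
        overrides = get_mapping_dicts(path_to_yaml, csv_url)
        if overrides is None:
            continue  # no overrides recorded for this dataset
        prevent_bad_replacement(overrides, df)
        df = override_writer(df, overrides)
        write_csv(df, "out", os.path.basename(csv_url))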