-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
70 lines (49 loc) · 2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
#
# imports
import pandas as pd
import requests
import re
import os
from yaml import safe_load, dump
from bs4 import BeautifulSoup as bs
import random
from modules import prevent_bad_replacement, delete_random_values, write_csv, standardise_cell_values, fill_gaps, csvs_to_pandas, find_csv_urls, get_mapping_dicts, override_writer
"""
sdg-csv-data-filler is the first module in a data pipeline to take
data from the SDG data repo and make it exportable as CSVW.
"""
# setting paths to directories and files
remote_data_url = "https://github.com/ONSdigital/sdg-data/tree/develop/data"
cwd = os.getcwd()
data_path = os.path.join(cwd, 'data')
out_path = os.path.join(cwd, 'out')
overrides_yam = (os.path.join(cwd,"overrides_dict.yaml"))
def entry_point(data_url):
    """Run the data-filler pipeline over every CSV found under ``data_url``.

    For each CSV url discovered by ``find_csv_urls``: load it into a
    DataFrame, apply the overrides for that dataset, and write the result
    to ``out_path``.

    Parameters
    ----------
    data_url : str
        URL of the data directory to scan for CSV files.

    Returns
    -------
    dict
        Maps each matched CSV file name to ``True`` if it was written to
        disk, ``False`` if the source yielded no usable DataFrame.
    """
    urls_gen = find_csv_urls(data_url)
    # Raw string avoids the invalid-escape-sequence warning for \d and \.;
    # compiled once here because it is applied to every url in the loop.
    pattern = re.compile(r"(indicator_\d{1,2}-\d{1,2}-\d+\.csv)$")
    results = {}
    for _url in urls_gen:
        # get the overrides dict for this dataset
        overrides_dict = get_mapping_dicts(overrides_yam, _url)
        # Create df
        df = csvs_to_pandas(_url)
        # get dataset name; skip urls that do not look like an indicator CSV
        # instead of crashing with AttributeError on a failed match.
        match = pattern.search(_url)
        if match is None:
            continue
        file_name = match.group(0)
        if df is None or df.empty:  # sometimes no df will be returned so it needs to be skipped
            results[file_name] = False
            continue
        # Apply transformations to the df
        df = override_writer(df, overrides_dict)
        # Writing the df to csv locally.
        results[file_name] = write_csv(df, out_path, file_name)
    return results
if __name__ == "__main__":
    # Guard so importing this module does not trigger a full pipeline run.
    results = entry_point(data_url=remote_data_url)
    # Compare the number of CSVs discovered against the number actually
    # processed; the difference is how many were skipped/missing.
    total_found = len(list(find_csv_urls(remote_data_url)))
    print(f"number of CSVs missing from output = {total_found - len(results)}")