-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_livingspace_commute_to_HMAgrid.py
141 lines (103 loc) · 4.57 KB
/
add_livingspace_commute_to_HMAgrid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 15 13:22:32 2022
@author: TuoVaisanen-e01
"""
import pandas as pd
import geopandas as gpd
import argparse
# set up argument parser
# NOTE: every path argument is used as a plain string prefix (file names are
# appended with '+'), so folder paths must end with a path separator.
ap = argparse.ArgumentParser()

# Get path to grid folder
ap.add_argument("-g", "--grid", required=True,
                help="Path to folder containing the 250 m grid geopackage covering the HMA ('250m_HMA_accurate.gpkg'). For example: /path/to/folder/")

# Get path to mother tongue data
ap.add_argument("-lg", "--langgrid", required=True,
                help="Path to folder containing CSVs about mother tongues. Files should be named '[YEAR]_mothertongues.csv'")

# Get path to home location data
ap.add_argument("-hg", "--homegrid", required=True,
                help="Path to folder containing CSVs about home locations. Files should be named 'henkilo_paikkatiedot_[YEAR].csv'")

# Get path to commute data
ap.add_argument("-c", "--commute", required=True,
                help="Path to folder containing CSVs about commutes. Files should be named 'FOLK_tkt_data_[YEAR].csv'")

# Get path to living space data
ap.add_argument("-l", "--livspace", required=True,
                help="Path to folder containing CSVs about living space data. Files should be named 'FOLK_askun_data_[YEAR].csv'")

# Get path to output folder
ap.add_argument("-o", "--output", required=True,
                help="Path to output folder. For example: /path/to/folder/. This script assumes you have access to FOLK data within Fiona")

# parse arguments into a plain dict keyed by the long option names
args = vars(ap.parse_args())
def format_asva(df, dftype, year):
    """Summarise the 'asva' column of *df* as counts and percentage shares.

    Args:
        df: DataFrame with an 'asva' column (one row per individual).
        dftype: Label stored in the 'type' column of the result
            (the caller passes 'est', 'som' or 'hma').
        year: Year stored in the 'year' column of the result.

    Returns:
        DataFrame with one row per distinct 'asva' value and the columns
        'asva', 'count', 'prop' (share of all counted rows, in percent),
        'type' and 'year'.
    """
    # calculate types of living arrangements
    # rename_axis + reset_index(name=...) gives the same 'asva'/'count'
    # columns on pandas 1.x and 2.x; the old rename of 'index'/'asva'
    # produced two 'count' columns on pandas >= 2.0
    counts = df['asva'].value_counts().rename_axis('asva').reset_index(name='count')

    # calculate proportions
    counts['prop'] = (counts['count'] / counts['count'].sum()) * 100

    # set type and year
    counts['type'] = dftype
    counts['year'] = year

    return counts
# get hma grid
grid = gpd.read_file(args['grid'] + '250m_HMA_accurate.gpkg')

# set grid id to integer
grid['NRO'] = grid['NRO'].astype(int)

# get grid ID's as list
gridlist = list(grid['NRO'].values)

# dataframe list for concatenation
concatlist = []

# the commute (tkt) file is not read for this year, so commute output is skipped
no_commute_year = 2019

# loop over years
for i in range(1987, 2020):
    print('[INFO] - Processing year ' + str(i))

    # get year specific information
    lpath = args['langgrid'] + str(i) + '_mothertongues.csv'
    tpath = args['commute'] + 'FOLK_tkt_data_' + str(i) + '.csv'
    apath = args['livspace'] + 'FOLK_askun_data_' + str(i) + '.csv'
    hpath = args['homegrid'] + 'henkilo_paikkatiedot_' + str(i) + '.csv'

    # read data in
    langs = pd.read_csv(lpath, encoding='utf-8', sep=',')
    askun = pd.read_csv(apath, encoding='utf-8', sep=',')
    homes = pd.read_csv(hpath, encoding='utf-8', sep=',')

    # drop nan values
    homes = homes.dropna(subset=['euref_250'])

    # convert to integer for joining
    homes['euref_250'] = homes['euref_250'].astype(int)

    # check whether commute data is used for this year
    has_commute = i != no_commute_year

    if has_commute:
        # read worklife stats in
        tkt = pd.read_csv(tpath, encoding='utf-8', sep=',')

        # merge tkt and askun data to one
        combined = pd.merge(tkt, askun[['shnro', 'asva']], on='shnro')

        # merge language information
        combined = pd.merge(combined, langs[['shnro', 'kieli']], on='shnro')
    else:
        # merge language information
        combined = pd.merge(askun, langs[['shnro', 'kieli']], on='shnro')

    # merge combined tkt and askun data to homelocation data
    combined = pd.merge(combined, homes[['shnro', 'euref_250']], on='shnro')

    # drop folks who do not live inside the HMA
    HMA = combined[combined['euref_250'].isin(gridlist)]

    # get estonians and somalis
    est = HMA[HMA['kieli'] == 'et']
    som = HMA[HMA['kieli'] == 'so']

    # get living space per year
    lsest = format_asva(est, 'est', i)
    lssom = format_asva(som, 'som', i)
    lshma = format_asva(HMA, 'hma', i)

    # append dataframes
    concatlist.append(lssom)
    concatlist.append(lsest)
    concatlist.append(lshma)

    if has_commute:
        # get only workforce
        wf = HMA[HMA['ptoim2'] == 11]

        # group by grid ids
        print('[INFO] - Grouping by grids for year ' + str(i) + '...')
        grouped = wf.groupby('euref_250')['tyomatka'].mean().reset_index()

        # save to pickled dataframe
        grouped.to_pickle(args['output'] + 'commute_euclidean_HMA_' + str(i) + '.pkl')

# create one df
results = pd.concat(concatlist)

# the key must be the string 'output'; the bare name raised NameError here,
# after all years had been processed, so the results were never saved
results.to_pickle(args['output'] + 'living_space_indv_87-19.pkl')