forked from stat697-s19/team-3_project_repo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
STAT697-01_s19-team-3_data_preparation.sas
339 lines (274 loc) · 8.75 KB
/
STAT697-01_s19-team-3_data_preparation.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
*******************************************************************************;
**************** 80-character banner for column width reference ***************;
* (set window width to banner width to calibrate line length to 80 characters *;
*******************************************************************************;
*
[Dataset 1 Name] 2018 Statistical Annex Table 6
[Dataset Description] Multidimensional Poverty Index: developing countries
[Experimental Unit Description] Countries from around the world
[Number of Observations] 1,365
[Number of Features] 13
[Data Source] http://hdr.undp.org/sites/default/files/composite_tables/2018_Statistical_Annex_Table_6.xlsx
[Data Dictionary] http://hdr.undp.org/en//2018-MPI
[Unique ID Schema] The column "Country" is the unique primary key.
;
%let inputDataset3DSN = Statistical_2018_Annex_Table_6;
%let inputDataset3URL =
https://github.com/stat697/team-3_project_repo/blob/master/data/2018_Statistical_Annex_Table_6.xlsx?raw=true
;
%let inputDataset3Type = XLSX;
*
[Dataset 2 Name] 2018 Statistical Annex Table 4
[Dataset Description] Gender Development Index
[Experimental Unit Description] Countries from around the world
[Number of Observations] 2,268
[Number of Features] 12
[Data Source] http://hdr.undp.org/sites/default/files/composite_tables/2018_Statistical_Annex_Table_4.xlsx
[Data Dictionary] http://hdr.undp.org/en/content/gender-development-index-gdi
[Unique ID Schema] The columns "HDI rank" and "Country" form a composite key,
which together are equivalent to the composite key in dataset 2018 Statistical
Annex Table 3.
;
%let inputDataset2DSN = Statistical_2018_Annex_Table_4;
%let inputDataset2URL =
https://github.com/stat697/team-3_project_repo/blob/master/data/2018_Statistical_Annex_Table_4.xlsx?raw=true
;
%let inputDataset2Type = XLSX;
*
[Dataset 3 Name] 2018 Statistical Annex Table 3
[Dataset Description] Inequality-adjusted Human Development Index
[Experimental Unit Description] Countries from around the world
[Number of Observations] 2,646
[Number of Features] 14
[Data Source] http://hdr.undp.org/sites/default/files/composite_tables/2018_Statistical_Annex_Table_3.xlsx
[Data Dictionary] http://hdr.undp.org/content/inequality-adjusted-human-development-index-ihdi
[Unique ID Schema] The columns "HDI rank" and "Country" form a composite key,
which together are equivalent to the composite key in dataset 2018 Statistical
Annex Table 4.
;
%let inputDataset1DSN = Statistical_2018_Annex_Table_3;
%let inputDataset1URL =
https://github.com/stat697/team-3_project_repo/blob/master/data/2018_Statistical_Annex_Table_3.xlsx?raw=true
;
%let inputDataset1Type = XLSX;
* load raw datasets over the wire, if they doesn't already exist;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
%put &=dsn;
%put &=url;
%put &=filetype;
%if
%sysfunc(exist(&dsn.)) = 0
%then
%do;
%put Loading dataset &dsn. over the wire now...;
filename
tempfile
"%sysfunc(getoption(work))/tempfile.&filetype."
;
proc http
method="get"
url="&url."
out=tempfile
;
run;
proc import
file=tempfile
out=&dsn.
dbms=&filetype.;
run;
filename tempfile clear;
%end;
%else
%do;
%put Dataset &dsn. already exists. Please delete and try again.;
%end;
%mend;
%macro loadDatasets;
%do i = 1 %to 3;
%loadDataIfNotAlreadyAvailable(
&&inputDataset&i.DSN.,
&&inputDataset&i.URL.,
&&inputDataset&i.Type.
)
%end;
%mend;
%loadDatasets
*There is no duplicate/missing primary ID value in the table Statistical_2018_Annex_Table_3. Therefore,
there is no need to implement a mitigation strategy for this dataset;
proc sql;
create table Annex_Table_3_dups as
select
Country
,count(*) as row_count_for_unique_id_value
from
Statistical_2018_Annex_Table_3
group by
Country
having
row_count_for_unique_id_value > 1
;
quit;
*There is no duplicate/missing primary ID value in the table Statistical_2018_Annex_Table_4. Therefore,
there is no need to implement a mitigation strategy for this dataset;
proc sql;
create table Annex_Table_4_dups as
select
Country
,count(*) as row_count_for_unique_id_value
from
Statistical_2018_Annex_Table_4
group by
Country
having
row_count_for_unique_id_value > 1
;
quit;
* There are 15 blank rows on the bottom of the Statistical_2018_Annex_Table_6 dataset,
mitigation strategy is developed and implemented per the code below;
proc sql;
create table Annex_Table_6_dups as
select
Country
,count(*) as row_count_for_unique_id_value
from
Statistical_2018_Annex_Table_6
group by
Country
having
row_count_for_unique_id_value > 1
;
create table Annex_Table_6_dups_fix as
select
*
from
Statistical_2018_Annex_Table_6
where
/* remove rows with missing unique id value components */
not(missing(Country))
;
quit;
*Ming's Research Question Column
*inspect columns of interest in cleaned versions of datasets;
/*
title "Inspect the adjusted life expentacy index from Statistical_2018_Annex_Table_3 dataset";
proc sql;
select
min(adjusted_life_index) as min
,max(adjusted_life_index) as max
,nmiss(adjusted_life_index) as missing
from
Statistical_2018_Annex_Table_3
;
quit;
title;
title "year of school for female from Statistical_2018_Annex_Table_4 dataset";
proc sql;
select
min(Year_School_Female) as min
,max(Year_School_Female) as max
,nmiss(Year_School_Female) as missing
from
Statistical_2018_Annex_Table_4
;
quit;
title;
title "Inspect edtimated gross national income on Statistical_2018_national_income";
proc sql;
select
min(Estimated_gross_national_income_) as min
,max(Estimated_gross_national_income_) as max
,nmiss(Estimated_gross_national_income_) as missing
from
Statistical_2018_Annex_Table_4
;
quit;
title;
title "Population_in_severe_multidimens from Annex_Table_6_dups_fix dataset";
proc sql;
select
min(Population_in_severe_multidimens) as min
,max(Population_in_severe_multidimens) as max
,mean(Population_in_severe_multidimens) as max
,median(Population_in_severe_multidimens) as max
,nmiss(Population_in_severe_multidimens) as missing
from
Annex_Table_6_dups_fix
;
quit;
title;
*/
*Mariano's Research Question Column
*inspect columns of interest in cleaned versions of datasets;
title "Inspect Multidimensional Poverty Index in Statistical_2018_Annex_Table_6";
proc sql;
select
min(Multidimensional_Poverty_Index) as min
,max(Multidimensional_Poverty_Index) as max
,mean(Multidimensional_Poverty_Index) as max
,median(Multidimensional_Poverty_Index) as max
,nmiss(Multidimensional_Poverty_Index) as missing
from
Annex_Table_6_dups_fix
;
quit;
title;
title "Inspect Population_living_below_national_poverty_line in Annex_Table_6_
dups_fix";
proc sql;
select
min(Population_living_below_national) as min
,max(Population_living_below_national) as max
,mean(Population_living_below_national) as max
,median(Population_living_below_national) as max
,nmiss(Population_living_below_national) as missing
from
Annex_Table_6_dups_fix
;
quit;
title;
title "Inspect Inequality_in_education in Statistical_2018_annex_table_3";
proc sql;
select
min(Inequality_in_education) as min
,max(Inequality_in_education) as max
,nmiss(Inequality_in_education) as missing
from
Statistical_2018_annex_table_3
;
quit;
title;
title "Inspect Mean_years_of_schooling_female in Statistical_2018_annex_table_4";
proc sql;
select
min(Mean_years_of_schooling_female) as min
,max(Mean_years_of_schooling_female) as max
,nmiss(Mean_years_of_schooling_female) as missing
from
Statistical_2018_annex_table_4
;
quit;
title;
title "Inspect Mean_years_of_schooling_male in Statistical_2018_Annex_Table_4";
proc sql;
select
min(Mean_years_of_schooling_male) as min
,max(Mean_years_of_schooling_male) as max
,nmiss(Mean_years_of_schooling_male) as missing
from
Statistical_2018_Annex_Table_4
;
quit;
title;
title "Inspect HDI in Statistical_2018_Annex_Table_3";
proc sql;
select
min(HDI) as min
,max(HDI) as max
,mean(HDI) as max
,median(HDI) as max
,nmiss(HDI) as missing
from
Statistical_2018_Annex_Table_3
;
quit;
title;