forked from stat660-f18/team-1_project2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
STAT660-01_f18-team-1_project2_data_preparation.sas
executable file
·226 lines (184 loc) · 5.55 KB
/
STAT660-01_f18-team-1_project2_data_preparation.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
*******************************************************************************;
**************** 80-character banner for column width reference ***************;
* (set window width to banner width to calibrate line length to 80 characters *;
*******************************************************************************;
*
[Dataset 1 Name] LoanStat_part1
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 109
[Number of Features] 12
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
--
[Dataset 2 Name] LoanStat_part2
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 109
[Number of Features] 14
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
--
[Dataset 3 Name] LoanStat_part3
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 126
[Number of Features] 12
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
;
* environmental setup;
* setup environmental parameters;
%let inputDataset1URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true
;
%let inputDataset1Type = XLSX;
%let inputDataset1DSN = loanstat1_raw;
%let inputDataset2URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true
;
%let inputDataset2Type = XLSX;
%let inputDataset2DSN = loanstat2_raw;
%let inputDataset3URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true
;
%let inputDataset3Type = XLSX;
%let inputDataset3DSN = loanstat3_raw;
* load raw datasets over the wire, if they doesn't already exist;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
%put &=dsn;
%put &=url;
%put &=filetype;
%if
%sysfunc(exist(&dsn.)) = 0
%then
%do;
%put Loading dataset &dsn. over the wire now...;
filename tempfile "%sysfunc(getoption(work))/tempfile.xlsx";
proc http
method="get"
url="&url."
out=tempfile
;
run;
proc import
file=tempfile
out=&dsn.
dbms=&filetype.;
run;
filename tempfile clear;
%end;
%else
%do;
%put Dataset &dsn. already exists. Please delete and try again.;
%end;
%mend;
%loadDataIfNotAlreadyAvailable(
&inputDataset1DSN.,
&inputDataset1URL.,
&inputDataset1Type.
)
%loadDataIfNotAlreadyAvailable(
&inputDataset2DSN.,
&inputDataset2URL.,
&inputDataset2Type.
)
%loadDataIfNotAlreadyAvailable(
&inputDataset3DSN.,
&inputDataset3URL.,
&inputDataset3Type.
)
* sort and check raw datasets for duplicates with respect to their unique ids,
removing blank rows, if needed;
proc sort
nodupkey
data=Loanstat1_raw
dupout=Loanstat1_raw_dups
out=Loanstat1_raw_sorted(where=(not(missing(member_id))))
;
by
member_id
;
run;
proc sort
nodupkey
data=Loanstat2_raw
dupout=Loanstat2_raw_dups
out=Loanstat2_raw_sorted(where=(not(missing(member_id))))
;
by
member_id
;
run;
proc sort
nodupkey
data=Loanstat3_raw
dupout=Loanstat3_raw_dups
out=Loanstat3_raw_sorted(where=(not(missing(member_id))))
;
by
member_id
;
run;
* combine Loanstat1 and Loanstat3 datasets vertically, indicator variables
Loanstat1_data_ro and Loanstat3_data_row are created using the in= dataset
option, and keep colunms that are used in the research questions;
data Loanstat_analytic_file_v1;
retain
member_id
annual_inc
grade
int_rate
loan_amnt
home_ownership
;
keep
member_id
annual_inc
grade
int_rate
loan_amnt
home_ownership
;
set
Loanstat1_raw(in=Loanstat1_data_row)
Loanstat3_raw(in=Loanstat3_data_row)
;
run;
* build new analytic dataset by horizontally combining datasets
Loanstat1_raw_sorted and Loanstat2_raw_sorted, with the least number of columns
and minimal cleaning/transformation needed to address research questions in
corresponding data-analysis files;
data Loanstat_analytic_file_h1;
retain
member_id
annual_inc
grade
int_rate
purpose
loan_amnt
addr_state
;
keep
member_id
annual_inc
grade
int_rate
purpose
loan_amnt
addr_state
;
merge
Loanstat1_raw_sorted
Loanstat2_raw_sorted
;
by
member_id
;
run;