-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update STAT660-01_f18-team-1_project2_data_preparation.sas
- Loading branch information
1 parent
eb43a03
commit ec3f14d
Showing
1 changed file
with
201 additions
and
133 deletions.
There are no files selected for viewing
334 changes: 201 additions & 133 deletions
334
STAT660-01_f18-team-1_project2_data_preparation.sas
100644 → 100755
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,133 +1,201 @@ | ||
|
||
*******************************************************************************; | ||
**************** 80-character banner for column width reference ***************; | ||
* (set window width to banner width to calibrate line length to 80 characters *; | ||
*******************************************************************************; | ||
|
||
* | ||
[Dataset 1 Name] LoanStat_part1 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 109 | ||
[Number of Features] 12 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
-- | ||
[Dataset 2 Name] LoanStat_part2 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 109 | ||
[Number of Features] 14 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
-- | ||
[Dataset 3 Name] LoanStat_part3 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 126 | ||
[Number of Features] 12 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
; | ||
|
||
* environmental setup; | ||
|
||
* set up environmental parameters, giving for each of the three input files
  the name of the raw dataset to create, the import file type, and the URL
  the file is downloaded from;
%let inputDataset1DSN = loanstat1_raw;
%let inputDataset1Type = XLSX;
%let inputDataset1URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true
;

%let inputDataset2DSN = loanstat2_raw;
%let inputDataset2Type = XLSX;
%let inputDataset2URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true
;

%let inputDataset3DSN = loanstat3_raw;
%let inputDataset3Type = XLSX;
%let inputDataset3URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true
;
|
||
* load raw datasets over the wire, if they do not already exist

  macro parameters
    dsn      - name of the SAS dataset to create
    url      - location of the file, fetched with an HTTP GET request
    filetype - value for the DBMS= option of PROC IMPORT, which is also
               lowercased and used as the extension of the temporary file

  if the dataset named by dsn already exists, nothing is downloaded and a
  message is written to the log instead;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
    %local tempFileExt;
    %put &=dsn;
    %put &=url;
    %put &=filetype;
    %if
        %sysfunc(exist(&dsn.)) = 0
    %then
        %do;
            %put Loading dataset &dsn. over the wire now...;
            /* derive the temporary file extension from the requested file
               type instead of always using xlsx, so the downloaded file
               matches the DBMS value handed to PROC IMPORT */
            %let tempFileExt = %sysfunc(lowcase(&filetype.));
            filename tempfile
                "%sysfunc(getoption(work))/tempfile.&tempFileExt."
            ;
            proc http
                method="get"
                url="&url."
                out=tempfile
                ;
            run;
            proc import
                file=tempfile
                out=&dsn.
                dbms=&filetype.;
            run;
            /* release the fileref so the next call can reassign it */
            filename tempfile clear;
        %end;
    %else
        %do;
            %put Dataset &dsn. already exists. Please delete and try again.;
        %end;
%mend;
* load each of the three raw input datasets, passing the dataset name, URL,
  and file type configured above as positional arguments;
%loadDataIfNotAlreadyAvailable(
    &inputDataset1DSN., &inputDataset1URL., &inputDataset1Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset2DSN., &inputDataset2URL., &inputDataset2Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset3DSN., &inputDataset3URL., &inputDataset3Type.
)
|
||
* sort and check raw datasets for duplicates with respect to their unique ids, | ||
removing blank rows, if needed; | ||
|
||
|
||
|
||
*******************************************************************************; | ||
**************** 80-character banner for column width reference ***************; | ||
* (set window width to banner width to calibrate line length to 80 characters *; | ||
*******************************************************************************; | ||
|
||
* | ||
[Dataset 1 Name] LoanStat_part1 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 109 | ||
[Number of Features] 12 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
-- | ||
[Dataset 2 Name] LoanStat_part2 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 109 | ||
[Number of Features] 14 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
-- | ||
[Dataset 3 Name] LoanStat_part3 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 126 | ||
[Number of Features] 12 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
; | ||
|
||
* environmental setup; | ||
|
||
* set up environmental parameters, giving for each of the three input files
  the name of the raw dataset to create, the import file type, and the URL
  the file is downloaded from;
%let inputDataset1DSN = loanstat1_raw;
%let inputDataset1Type = XLSX;
%let inputDataset1URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true
;

%let inputDataset2DSN = loanstat2_raw;
%let inputDataset2Type = XLSX;
%let inputDataset2URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true
;

%let inputDataset3DSN = loanstat3_raw;
%let inputDataset3Type = XLSX;
%let inputDataset3URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true
;
|
||
* load raw datasets over the wire, if they do not already exist

  macro parameters
    dsn      - name of the SAS dataset to create
    url      - location of the file, fetched with an HTTP GET request
    filetype - value for the DBMS= option of PROC IMPORT, which is also
               lowercased and used as the extension of the temporary file

  if the dataset named by dsn already exists, nothing is downloaded and a
  message is written to the log instead;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
    %local tempFileExt;
    %put &=dsn;
    %put &=url;
    %put &=filetype;
    %if
        %sysfunc(exist(&dsn.)) = 0
    %then
        %do;
            %put Loading dataset &dsn. over the wire now...;
            /* derive the temporary file extension from the requested file
               type instead of always using xlsx, so the downloaded file
               matches the DBMS value handed to PROC IMPORT */
            %let tempFileExt = %sysfunc(lowcase(&filetype.));
            filename tempfile
                "%sysfunc(getoption(work))/tempfile.&tempFileExt."
            ;
            proc http
                method="get"
                url="&url."
                out=tempfile
                ;
            run;
            proc import
                file=tempfile
                out=&dsn.
                dbms=&filetype.;
            run;
            /* release the fileref so the next call can reassign it */
            filename tempfile clear;
        %end;
    %else
        %do;
            %put Dataset &dsn. already exists. Please delete and try again.;
        %end;
%mend;
* load each of the three raw input datasets, passing the dataset name, URL,
  and file type configured above as positional arguments;
%loadDataIfNotAlreadyAvailable(
    &inputDataset1DSN., &inputDataset1URL., &inputDataset1Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset2DSN., &inputDataset2URL., &inputDataset2Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset3DSN., &inputDataset3URL., &inputDataset3Type.
)
|
||
* sort and check raw datasets for duplicates with respect to their unique ids, | ||
removing blank rows, if needed; | ||
|
||
* sort Loanstat1_raw by member_id, keeping one row per id, writing the
  discarded duplicate rows to Loanstat1_raw_dups, and keeping only rows with
  a nonmissing member_id in Loanstat1_raw_sorted;
proc sort
        data=Loanstat1_raw
        out=Loanstat1_raw_sorted(where=(not missing(member_id)))
        dupout=Loanstat1_raw_dups
        nodupkey
    ;
    by member_id;
run;
|
||
|
||
* sort Loanstat2_raw by member_id, keeping one row per id, writing the
  discarded duplicate rows to Loanstat2_raw_dups, and keeping only rows with
  a nonmissing member_id in Loanstat2_raw_sorted;
proc sort
        data=Loanstat2_raw
        out=Loanstat2_raw_sorted(where=(not missing(member_id)))
        dupout=Loanstat2_raw_dups
        nodupkey
    ;
    by member_id;
run;
|
||
|
||
* sort Loanstat3_raw by member_id, keeping one row per id, writing the
  discarded duplicate rows to Loanstat3_raw_dups, and keeping only rows with
  a nonmissing member_id in Loanstat3_raw_sorted;
proc sort
        data=Loanstat3_raw
        out=Loanstat3_raw_sorted(where=(not missing(member_id)))
        dupout=Loanstat3_raw_dups
        nodupkey
    ;
    by member_id;
run;
|
||
* combine the Loanstat1 and Loanstat3 datasets vertically, reading the sorted
  and de-duplicated versions so that blank and duplicate rows removed above do
  not reappear in the analytic file, where the indicator variables
  Loanstat1_data_row and Loanstat3_data_row are created with the in= dataset
  option, and the data_source column records which dataset each row came from;

data Loanstat_analytic_file_v1;
    set
        Loanstat1_raw_sorted(in=Loanstat1_data_row)
        Loanstat3_raw_sorted(in=Loanstat3_data_row)
    ;
    if
        Loanstat1_data_row=1
    then
        do;
            data_source="stat1";
        end;
    else
        do;
            data_source="stat3";
        end;
run;
|
||
* build a new analytic dataset by horizontally combining Loanstat1_raw_sorted
  and Loanstat2_raw_sorted, match-merging their rows on member_id;

data Loanstat_analytic_file_h1;
    merge Loanstat1_raw_sorted Loanstat2_raw_sorted;
    by member_id;
run;