Skip to content

Commit

Permalink
Update STAT660-01_f18-team-1_project2_data_preparation.sas
Browse files Browse the repository at this point in the history
  • Loading branch information
jduan10-stat697 committed Oct 28, 2018
1 parent eb43a03 commit ec3f14d
Showing 1 changed file with 201 additions and 133 deletions.
334 changes: 201 additions & 133 deletions STAT660-01_f18-team-1_project2_data_preparation.sas
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,133 +1,201 @@

*******************************************************************************;
**************** 80-character banner for column width reference ***************;
* (set window width to banner width to calibrate line length to 80 characters *;
*******************************************************************************;

*
[Dataset 1 Name] LoanStat_part1
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 109
[Number of Features] 12
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
--
[Dataset 2 Name] LoanStat_part2
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 109
[Number of Features] 14
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
--
[Dataset 3 Name] LoanStat_part3
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 126
[Number of Features] 12
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
;

* environmental setup;

* setup environmental parameters;
* dataset 1 settings: target dataset name, file type, and download location;
%let inputDataset1DSN = loanstat1_raw;
%let inputDataset1Type = XLSX;
%let inputDataset1URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true
;

* dataset 2 settings: target dataset name, file type, and download location;
%let inputDataset2DSN = loanstat2_raw;
%let inputDataset2Type = XLSX;
%let inputDataset2URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true
;

* dataset 3 settings: target dataset name, file type, and download location;
%let inputDataset3DSN = loanstat3_raw;
%let inputDataset3Type = XLSX;
%let inputDataset3URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true
;

* load raw datasets over the wire, if they do not already exist;
* macro loadDataIfNotAlreadyAvailable downloads a file and imports it as a SAS
dataset, unless a dataset with the requested name already exists
    dsn      - name of the dataset to create
    url      - location the file is fetched from with an HTTP GET request
    filetype - value passed to the dbms= option of proc import, also used as
               the extension of the temporary download file
;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
    /* echo the arguments to the log to make troubleshooting easier */
    %put &=dsn;
    %put &=url;
    %put &=filetype;
    %if
        %sysfunc(exist(&dsn.)) = 0
    %then
        %do;
            %put Loading dataset &dsn. over the wire now...;
            /* the temporary file lives in the WORK directory and is named
               after the requested file type instead of a hard-coded xlsx
               extension, so the macro is usable with other file types */
            filename
                tempfile
                "%sysfunc(getoption(work))/tempfile.&filetype."
            ;
            proc http
                method="get"
                url="&url."
                out=tempfile
                ;
            run;
            proc import
                file=tempfile
                out=&dsn.
                dbms=&filetype.;
            run;
            /* release the fileref once the import step has been submitted */
            filename tempfile clear;
        %end;
    %else
        %do;
            %put Dataset &dsn. already exists. Please delete and try again.;
        %end;
%mend;
* fetch and import each of the three raw datasets, passing the dataset name,
download location, and file type held in the matching macro variables;
%loadDataIfNotAlreadyAvailable(
    &inputDataset1DSN., &inputDataset1URL., &inputDataset1Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset2DSN., &inputDataset2URL., &inputDataset2Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset3DSN., &inputDataset3URL., &inputDataset3Type.
)

* sort and check raw datasets for duplicates with respect to their unique ids,
removing blank rows, if needed;



*******************************************************************************;
**************** 80-character banner for column width reference ***************;
* (set window width to banner width to calibrate line length to 80 characters *;
*******************************************************************************;

*
[Dataset 1 Name] LoanStat_part1
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 109
[Number of Features] 12
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
--
[Dataset 2 Name] LoanStat_part2
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 109
[Number of Features] 14
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
--
[Dataset 3 Name] LoanStat_part3
[Dataset Description]
Complete LendingClub loan data for all loans issued in 2018 quarter 2
[Experimental Unit Description] LendingClub loans issued in 2018 Q2
[Number of Observations] 126
[Number of Features] 12
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip
[Data Dictionary] https://www.lendingclub.com/info/download-data.action
[Unique ID Schema] The column member_id is a unique id.
;

* environmental setup;

* setup environmental parameters;
* dataset 1 settings: target dataset name, file type, and download location;
%let inputDataset1DSN = loanstat1_raw;
%let inputDataset1Type = XLSX;
%let inputDataset1URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true
;

* dataset 2 settings: target dataset name, file type, and download location;
%let inputDataset2DSN = loanstat2_raw;
%let inputDataset2Type = XLSX;
%let inputDataset2URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true
;

* dataset 3 settings: target dataset name, file type, and download location;
%let inputDataset3DSN = loanstat3_raw;
%let inputDataset3Type = XLSX;
%let inputDataset3URL =
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true
;

* load raw datasets over the wire, if they do not already exist;
* macro loadDataIfNotAlreadyAvailable downloads a file and imports it as a SAS
dataset, unless a dataset with the requested name already exists
    dsn      - name of the dataset to create
    url      - location the file is fetched from with an HTTP GET request
    filetype - value passed to the dbms= option of proc import, also used as
               the extension of the temporary download file
;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
    /* echo the arguments to the log to make troubleshooting easier */
    %put &=dsn;
    %put &=url;
    %put &=filetype;
    %if
        %sysfunc(exist(&dsn.)) = 0
    %then
        %do;
            %put Loading dataset &dsn. over the wire now...;
            /* the temporary file lives in the WORK directory and is named
               after the requested file type instead of a hard-coded xlsx
               extension, so the macro is usable with other file types */
            filename
                tempfile
                "%sysfunc(getoption(work))/tempfile.&filetype."
            ;
            proc http
                method="get"
                url="&url."
                out=tempfile
                ;
            run;
            proc import
                file=tempfile
                out=&dsn.
                dbms=&filetype.;
            run;
            /* release the fileref once the import step has been submitted */
            filename tempfile clear;
        %end;
    %else
        %do;
            %put Dataset &dsn. already exists. Please delete and try again.;
        %end;
%mend;
* fetch and import each of the three raw datasets, passing the dataset name,
download location, and file type held in the matching macro variables;
%loadDataIfNotAlreadyAvailable(
    &inputDataset1DSN., &inputDataset1URL., &inputDataset1Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset2DSN., &inputDataset2URL., &inputDataset2Type.
)
%loadDataIfNotAlreadyAvailable(
    &inputDataset3DSN., &inputDataset3URL., &inputDataset3Type.
)

* sort and check raw datasets for duplicates with respect to their unique ids,
removing blank rows, if needed;

* order Loanstat1_raw by member_id, writing one row per id to
Loanstat1_raw_sorted (rows with a missing member_id are filtered out of that
output) and routing the discarded duplicate rows to Loanstat1_raw_dups;
proc sort
        data=Loanstat1_raw
        out=Loanstat1_raw_sorted(where=(not(missing(member_id))))
        dupout=Loanstat1_raw_dups
        nodupkey
    ;
    by member_id;
run;


* order Loanstat2_raw by member_id, writing one row per id to
Loanstat2_raw_sorted (rows with a missing member_id are filtered out of that
output) and routing the discarded duplicate rows to Loanstat2_raw_dups;
proc sort
        data=Loanstat2_raw
        out=Loanstat2_raw_sorted(where=(not(missing(member_id))))
        dupout=Loanstat2_raw_dups
        nodupkey
    ;
    by member_id;
run;


* order Loanstat3_raw by member_id, writing one row per id to
Loanstat3_raw_sorted (rows with a missing member_id are filtered out of that
output) and routing the discarded duplicate rows to Loanstat3_raw_dups;
proc sort
        data=Loanstat3_raw
        out=Loanstat3_raw_sorted(where=(not(missing(member_id))))
        dupout=Loanstat3_raw_dups
        nodupkey
    ;
    by member_id;
run;

* combine the Loanstat1 and Loanstat3 datasets vertically, using the in=
dataset option to create the indicator variables Loanstat1_data_row and
Loanstat3_data_row, and add a data_source column showing which dataset each
row comes from;

* stack the Loanstat1 and Loanstat3 rows into Loanstat_analytic_file_v1 and
label each row with its origin in data_source (stat1 or stat3). The inputs are
the _sorted datasets produced by the proc sort steps, not the _raw ones, so
that duplicate member_id values and blank rows removed there do not reappear
in the analytic file. This matches the inputs used for the horizontal merge;
data Loanstat_analytic_file_v1;
    set
        Loanstat1_raw_sorted(in=Loanstat1_data_row)
        Loanstat3_raw_sorted(in=Loanstat3_data_row)
    ;
    /* the in= flag is 1 only for rows read from the first dataset, every
       other row necessarily comes from the second one */
    if
        Loanstat1_data_row=1
    then
        do;
            data_source="stat1";
        end;
    else
        do;
            data_source="stat3";
        end;
run;

* build new analytic dataset by horizontally combining datasets
Loanstat1_raw_sorted and Loanstat2_raw_sorted;

* match-merge Loanstat1_raw_sorted with Loanstat2_raw_sorted on member_id to
place the columns of both datasets side by side in Loanstat_analytic_file_h1;
data Loanstat_analytic_file_h1;
    merge Loanstat1_raw_sorted Loanstat2_raw_sorted;
    by member_id;
run;

0 comments on commit ec3f14d

Please sign in to comment.