From ec3f14df833aff9288c9101d708c5e42695103c2 Mon Sep 17 00:00:00 2001 From: jduan10-stat660 <42588871+jduan10-stat660@users.noreply.github.com> Date: Sat, 27 Oct 2018 20:44:43 -0700 Subject: [PATCH] Update STAT660-01_f18-team-1_project2_data_preparation.sas --- ...1_f18-team-1_project2_data_preparation.sas | 334 +++++++++++------- 1 file changed, 201 insertions(+), 133 deletions(-) mode change 100644 => 100755 STAT660-01_f18-team-1_project2_data_preparation.sas diff --git a/STAT660-01_f18-team-1_project2_data_preparation.sas b/STAT660-01_f18-team-1_project2_data_preparation.sas old mode 100644 new mode 100755 index 58837b9..3623bd5 --- a/STAT660-01_f18-team-1_project2_data_preparation.sas +++ b/STAT660-01_f18-team-1_project2_data_preparation.sas @@ -1,133 +1,201 @@ - -*******************************************************************************; -**************** 80-character banner for column width reference ***************; -* (set window width to banner width to calibrate line length to 80 characters *; -*******************************************************************************; - -* -[Dataset 1 Name] LoanStat_part1 - -[Dataset Description] -Complete LendingClub loan data for all loans issued in 2018 quarter 2 - -[Experimental Unit Description] LendingClub loans issued in 2018 Q2 - -[Number of Observations] 109 - -[Number of Features] 12 - -[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip - -[Data Dictionary] https://www.lendingclub.com/info/download-data.action - -[Unique ID Schema] The column member_id is a unique id. 
- --- - -[Dataset 2 Name] LoanStat_part2 - -[Dataset Description] -Complete LendingClub loan data for all loans issued in 2018 quarter 2 - -[Experimental Unit Description] LendingClub loans issued in 2018 Q2 - -[Number of Observations] 109 - -[Number of Features] 14 - -[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip - -[Data Dictionary] https://www.lendingclub.com/info/download-data.action - -[Unique ID Schema] The column member_id is a unique id. - --- - -[Dataset 3 Name] LoanStat_part3 - -[Dataset Description] -Complete LendingClub loan data for all loans issued in 2018 quarter 2 - -[Experimental Unit Description] LendingClub loans issued in 2018 Q2 - -[Number of Observations] 126 - -[Number of Features] 12 - -[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip - -[Data Dictionary] https://www.lendingclub.com/info/download-data.action - -[Unique ID Schema] The column member_id is a unique id. -; - -* environmental setup; - -* setup environmental parameters; -%let inputDataset1URL = -https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true -; -%let inputDataset1Type = XLSX; -%let inputDataset1DSN = loanstat1_raw; - -%let inputDataset2URL = -https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true -; -%let inputDataset2Type = XLSX; -%let inputDataset2DSN = loanstat2_raw; - -%let inputDataset3URL = -https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true -; -%let inputDataset3Type = XLSX; -%let inputDataset3DSN = loanstat3_raw; - -* load raw datasets over the wire, if they doesn't already exist; -%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype); - %put &=dsn; - %put &=url; - %put &=filetype; - %if - %sysfunc(exist(&dsn.)) = 0 - %then - %do; - %put Loading dataset &dsn. over the wire now...; - filename tempfile "%sysfunc(getoption(work))/tempfile.xlsx"; - proc http - method="get" - url="&url." 
- out=tempfile - ; - run; - proc import - file=tempfile - out=&dsn. - dbms=&filetype.; - run; - filename tempfile clear; - %end; - %else - %do; - %put Dataset &dsn. already exists. Please delete and try again.; - %end; -%mend; -%loadDataIfNotAlreadyAvailable( - &inputDataset1DSN., - &inputDataset1URL., - &inputDataset1Type. -) -%loadDataIfNotAlreadyAvailable( - &inputDataset2DSN., - &inputDataset2URL., - &inputDataset2Type. -) -%loadDataIfNotAlreadyAvailable( - &inputDataset3DSN., - &inputDataset3URL., - &inputDataset3Type. -) - -* sort and check raw datasets for duplicates with respect to their unique ids, - removing blank rows, if needed; - - + +*******************************************************************************; +**************** 80-character banner for column width reference ***************; +* (set window width to banner width to calibrate line length to 80 characters *; +*******************************************************************************; + +* +[Dataset 1 Name] LoanStat_part1 + +[Dataset Description] +Complete LendingClub loan data for all loans issued in 2018 quarter 2 + +[Experimental Unit Description] LendingClub loans issued in 2018 Q2 + +[Number of Observations] 109 + +[Number of Features] 12 + +[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip + +[Data Dictionary] https://www.lendingclub.com/info/download-data.action + +[Unique ID Schema] The column member_id is a unique id. + +-- + +[Dataset 2 Name] LoanStat_part2 + +[Dataset Description] +Complete LendingClub loan data for all loans issued in 2018 quarter 2 + +[Experimental Unit Description] LendingClub loans issued in 2018 Q2 + +[Number of Observations] 109 + +[Number of Features] 14 + +[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip + +[Data Dictionary] https://www.lendingclub.com/info/download-data.action + +[Unique ID Schema] The column member_id is a unique id. 
+ +-- + +[Dataset 3 Name] LoanStat_part3 + +[Dataset Description] +Complete LendingClub loan data for all loans issued in 2018 quarter 2 + +[Experimental Unit Description] LendingClub loans issued in 2018 Q2 + +[Number of Observations] 126 + +[Number of Features] 12 + +[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip + +[Data Dictionary] https://www.lendingclub.com/info/download-data.action + +[Unique ID Schema] The column member_id is a unique id. +; + +* environmental setup; + +* set up environmental parameters; +%let inputDataset1URL = +https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true +; +%let inputDataset1Type = XLSX; +%let inputDataset1DSN = loanstat1_raw; + +%let inputDataset2URL = +https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true +; +%let inputDataset2Type = XLSX; +%let inputDataset2DSN = loanstat2_raw; + +%let inputDataset3URL = +https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true +; +%let inputDataset3Type = XLSX; +%let inputDataset3DSN = loanstat3_raw; + +* load raw datasets over the wire, if they don't already exist; +%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype); + %put &=dsn; + %put &=url; + %put &=filetype; + %if + %sysfunc(exist(&dsn.)) = 0 + %then + %do; + %put Loading dataset &dsn. over the wire now...; + filename tempfile "%sysfunc(getoption(work))/tempfile.xlsx"; + proc http + method="get" + url="&url." + out=tempfile + ; + run; + proc import + file=tempfile + out=&dsn. + dbms=&filetype.; + run; + filename tempfile clear; + %end; + %else + %do; + %put Dataset &dsn. already exists. Please delete and try again.; + %end; +%mend; +%loadDataIfNotAlreadyAvailable( + &inputDataset1DSN., + &inputDataset1URL., + &inputDataset1Type. +) +%loadDataIfNotAlreadyAvailable( + &inputDataset2DSN., + &inputDataset2URL., + &inputDataset2Type. 
+) +%loadDataIfNotAlreadyAvailable( + &inputDataset3DSN., + &inputDataset3URL., + &inputDataset3Type. +) + +* sort and check raw datasets for duplicates with respect to their unique ids, + removing blank rows, if needed; + +proc sort + nodupkey + data=Loanstat1_raw + dupout=Loanstat1_raw_dups + out=Loanstat1_raw_sorted(where=(not(missing(member_id)))) + ; + by + member_id + ; +run; + + +proc sort + nodupkey + data=Loanstat2_raw + dupout=Loanstat2_raw_dups + out=Loanstat2_raw_sorted(where=(not(missing(member_id)))) + ; + by + member_id + ; +run; + + +proc sort + nodupkey + data=Loanstat3_raw + dupout=Loanstat3_raw_dups + out=Loanstat3_raw_sorted(where=(not(missing(member_id)))) + ; + by + member_id + ; +run; + +* combine Loanstat1 and Loanstat3 datasets vertically, indicator variables +Loanstat1_data_row and Loanstat3_data_row are created using the in= dataset +option, and a data_source column is created to show which dataset each row +comes from; + +data Loanstat_analytic_file_v1; + set + Loanstat1_raw(in=Loanstat1_data_row) + Loanstat3_raw(in=Loanstat3_data_row) + ; + if + Loanstat1_data_row=1 + then + do; + data_source="stat1"; + end; + else + do; + data_source="stat3"; + end; +run; + +* build new analytic dataset by horizontally combining datasets +Loanstat1_raw_sorted and Loanstat2_raw_sorted; + +data Loanstat_analytic_file_h1; + merge + Loanstat1_raw_sorted + Loanstat2_raw_sorted + ; + by + member_id + ; +run;