-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from llopez37-stat660/master
LL half of data prep & Data analysis
- Loading branch information
Showing
2 changed files
with
241 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,110 @@ | ||
*******************************************************************************; | ||
**************** 80-character banner for column width reference ***************; | ||
* (set window width to banner width to calibrate line length to 80 characters *; | ||
*******************************************************************************; | ||
|
||
* | ||
This file uses the following analytic dataset to address several research | ||
questions regarding loan amounts and statistics on loans | ||
Dataset Name: STAT660-01_f18-team-1_project2_data_preparation.sas | ||
See included file for dataset properties | ||
; | ||
|
||
* environmental setup; | ||
|
||
* set relative file import path to current directory (using standard SAS trick) | ||
by taking the full path of this program from SAS_EXECFILEPATH and dropping the | ||
trailing program file name, whose length is taken from SAS_EXECFILENAME, and | ||
then issuing an operating-system cd command to the remaining directory | ||
NOTE(review): these two environment variables are presumably only populated | ||
by the interactive SAS editor on Windows - confirm before running in batch; | ||
X "cd ""%substr(%sysget(SAS_EXECFILEPATH),1,%eval(%length(%sysget(SAS_EXECFILEPATH))-%length(%sysget(SAS_EXECFILENAME))))"""; | ||
|
||
|
||
* load external file that prepares the analytic dataset(s) used by the research | ||
question steps below, using a path relative to the directory set above | ||
NOTE(review): the analysis steps in this file read a dataset named loanstats, | ||
so the included program is presumed to create it - confirm; | ||
%include '.\STAT660-01_f18-team-1_project2_data_preparation.sas'; | ||
|
||
|
||
*******************************************************************************; | ||
* Research Question Analysis Starting Point; | ||
*******************************************************************************; | ||
|
||
title1 | ||
'Research Question:What is the average annual income for those who rent and those who own?' | ||
; | ||
|
||
title2 | ||
'Rationale: This shows the buying power and cost of living for someone to own a home' | ||
; | ||
|
||
* | ||
Methodology: Use proc means to compute the mean and median annual income for | ||
each home-ownership category (for example owners and renters) in the data set. | ||
Limitations: It may be hard to differentiate outright owners from those with a | ||
mortgage, and some owners might be landlords. | ||
Follow Up: Seek more clarification on the homeowner categories, and separate | ||
the results by state/region. | ||
; | ||
|
||
* the statistic keywords are listed explicitly because the median is not among | ||
the default proc means statistics, and the defaults are repeated to keep them; | ||
proc means | ||
data = loanstats | ||
n mean median std min max | ||
; | ||
class | ||
home_ownership | ||
; | ||
var | ||
annual_inc | ||
; | ||
run | ||
; | ||
|
||
title1 | ||
'Research Question: Is there a correlation between interest rate and the annual income with loan amount and grade?' | ||
; | ||
|
||
title2 | ||
'Rationale: This would help show what is considered in terms of interest rates' | ||
; | ||
|
||
* | ||
Methodology: Use proc corr to compute pairwise correlations among interest rate, | ||
annual income, and loan amount, as a first look at a possible model of the form | ||
interest rate = annual income + loan amount + grade. Grade is a character | ||
variable, so it cannot be listed in proc corr and is left for a later | ||
regression step that can treat it as a classification variable. | ||
Limitations: More variables or interaction terms may be needed in the model, | ||
which could get messy. | ||
Follow Up: If no correlation is found, the question might have to be changed | ||
by adding other variables. | ||
; | ||
|
||
* proc corr takes a var list rather than a model statement | ||
NOTE(review): assumes int_rate, annual_inc and loan_amnt are stored as numeric | ||
variables in loanstats - confirm against the data preparation file; | ||
proc corr | ||
data = loanstats | ||
; | ||
var | ||
int_rate annual_inc loan_amnt | ||
; | ||
run; | ||
|
||
title1 | ||
'Research Question: What is the distribution of the loan amounts based on the purpose of the loan?' | ||
; | ||
|
||
title2 | ||
'Rationale: Would help get a sense of what the majority of people require loans for and roughly the amounts requested' | ||
; | ||
|
||
* | ||
Methodology: Draw a box-and-whisker plot with loan amount on the y axis and | ||
one box per loan purpose on the x axis. | ||
Limitations: Hopefully the data is clean enough that the loan purposes do not | ||
vary in spelling, and similarly that sub types of loans need not be | ||
distinguished. | ||
Follow Up: Same as the first question, might want to filter by state in order | ||
to determine factors of cost of living and debt owed since different states | ||
have different home costs. | ||
; | ||
|
||
* the category option of vbox needs an equals sign before the grouping variable; | ||
proc sgplot | ||
data = loanstats | ||
; | ||
vbox loan_amnt / category = purpose | ||
; | ||
run; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,133 @@ | ||
|
||
*******************************************************************************; | ||
**************** 80-character banner for column width reference ***************; | ||
* (set window width to banner width to calibrate line length to 80 characters *; | ||
*******************************************************************************; | ||
|
||
* | ||
[Dataset 1 Name] LoanStat_part1 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 109 | ||
[Number of Features] 12 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
-- | ||
[Dataset 2 Name] LoanStat_part2 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 109 | ||
[Number of Features] 14 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
-- | ||
[Dataset 3 Name] LoanStat_part3 | ||
[Dataset Description] | ||
Complete LendingClub loan data for all loans issued in 2018 quarter 2 | ||
[Experimental Unit Description] LendingClub loans issued in 2018 Q2 | ||
[Number of Observations] 126 | ||
[Number of Features] 12 | ||
[Data Source] https://resources.lendingclub.com/LoanStats_2018Q2.csv.zip | ||
[Data Dictionary] https://www.lendingclub.com/info/download-data.action | ||
[Unique ID Schema] The column member_id is a unique id. | ||
; | ||
|
||
* environmental setup; | ||
|
||
* define, for each of the three raw inputs, the target dataset name, the import | ||
file type, and the URL from which the spreadsheet is downloaded; | ||
%let inputDataset1DSN = loanstat1_raw; | ||
%let inputDataset1Type = XLSX; | ||
%let inputDataset1URL = | ||
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part1.xlsx?raw=true | ||
; | ||
|
||
%let inputDataset2DSN = loanstat2_raw; | ||
%let inputDataset2Type = XLSX; | ||
%let inputDataset2URL = | ||
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part2.xlsx?raw=true | ||
; | ||
|
||
%let inputDataset3DSN = loanstat3_raw; | ||
%let inputDataset3Type = XLSX; | ||
%let inputDataset3URL = | ||
https://github.com/stat660/team-1_project2/blob/master/data/LoanStats_part3.xlsx?raw=true | ||
; | ||
|
||
* download and import a raw dataset over the wire, unless a dataset with the | ||
requested name already exists, in which case only a log message is written | ||
dsn is the output dataset name, url is the download location, and filetype is | ||
the dbms value handed to proc import; | ||
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype); | ||
    %* echo the arguments to the log; | ||
    %put &=dsn; | ||
    %put &=url; | ||
    %put &=filetype; | ||
    %if | ||
        %sysfunc(exist(&dsn.)) | ||
    %then | ||
        %do; | ||
            %put Dataset &dsn. already exists. Please delete and try again.; | ||
        %end; | ||
    %else | ||
        %do; | ||
            %put Loading dataset &dsn. over the wire now...; | ||
            %* stage the download in the work library directory; | ||
            filename tempfile "%sysfunc(getoption(work))/tempfile.xlsx"; | ||
            proc http method="get" url="&url." out=tempfile; | ||
            run; | ||
            proc import file=tempfile out=&dsn. dbms=&filetype.; | ||
            run; | ||
            %* release the fileref once the import step has been submitted; | ||
            filename tempfile clear; | ||
        %end; | ||
%mend; | ||
* invoke the loader once per raw input, passing the dataset name, URL and file | ||
type positionally in the order the macro declares them; | ||
%loadDataIfNotAlreadyAvailable( | ||
&inputDataset1DSN., | ||
&inputDataset1URL., | ||
&inputDataset1Type. | ||
) | ||
%loadDataIfNotAlreadyAvailable( | ||
&inputDataset2DSN., | ||
&inputDataset2URL., | ||
&inputDataset2Type. | ||
) | ||
%loadDataIfNotAlreadyAvailable( | ||
&inputDataset3DSN., | ||
&inputDataset3URL., | ||
&inputDataset3Type. | ||
) | ||
|
||
* sort and check raw datasets for duplicates with respect to their unique ids, | ||
removing blank rows, if needed; | ||
|
||
|