Merge pull request #5 from stat660/v0.3

V0.3 merged to master
stat660-f18 · Nov 11, 2018 · faf1e1e · faf1e1e
2 parents 95a7d07 + c640eed
commit faf1e1e
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 138 deletions.
diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas
@@ -5,8 +5,8 @@
 
 *
 This file uses the following analytic dataset to address several research
-questions regarding happiness, peace index and biocapacity / 
-ecological-footprint
+questions regarding happiness, peace index and biocapacity/ecological-footprint
+
 Dataset Name: cotw_2016_analytic_file created in external file
 STAT660-01 f18-team-3 project2 data preparation.sas, which is assumed to be
 in the same directory as this file
@@ -50,12 +50,12 @@ footnote3
 ;
 
 *
-Note:This compares the column “Happiness Rank” from happy_2015 to the column 
+Note:This compares the column “Happiness Rank” from happy_2015 to the column 
 of the same name from happy_2016.
 
 Methodology: When combining happy_2015 with happy_2016 during data preparation,
 take the difference of values of "happiness_rank" for each country and create 
-a new variable called happiness_rank_yoy . Then, use proc sort to create a 
+a new variable called happiness_rank_yoy. Then, use proc sort to create a 
 temporary sorted table in descending order by happiness_rank_yoy. Finally,
 use proc print here to display the first five rows of the sorted dataset.
 
@@ -88,11 +88,11 @@ footnote;
 *******************************************************************************;
 
 title1
-"Research Question: Is "GPI" in a relationship with "Happiness Score" ?  "
+'Research Question: Is "GPI" in a relationship with "Happiness Score" ?'
 ;
 
 title2
-"Rationale: This will help determine if peace status of a country makes people happier"
+'Rationale: This will help determine if peace status of a country makes people happier'
 ;
 
 footnote1
@@ -108,7 +108,7 @@ footnote3
 ; 
 
 *
-Note:This compares the column “gpi” from gpi_2008-2016 to the column 
+Note:This compares the column “gpi” from gpi_2008-2016 to the column 
 happiness_score from happy_2016.
 
 Methodology:Use proc means to compute 5-number summaries of gpi 
@@ -130,7 +130,7 @@ proc freq
     table
              gpi
             *happiness_score
-			/ missing norow nocol nopercent
+	     / missing norow nocol nopercent
     ;
         where
             not(missing(happiness_score))
@@ -150,23 +150,19 @@ footnote;
 *******************************************************************************;
 
 title1
-'Is there a correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”? '
+'Is there a correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”? '
 ;
 
 title2
-"Rationale: Rationale: Biocapacity Deficit or Reserve reveals the sustainability status of a country, which can lead to a happier fulfilled life"
+'Rationale:Biocapacity Deficit or Reserve reveals the sustainability status of a country, which can lead to a happier fulfilled life'
 ;
 
 footnote1
 "The result shows a negative relation between the 2 variables "
 ;
 
 footnote2
-"However, Pearson Chi-Sq Test shows p-value > 0.05, therefore there is not enough evidence to show the correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”"
-;
-
-footnote3
-""
+"However, Pearson Chi-Sq Test shows p-value > 0.05, therefore there is not enough evidence to show the correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”"
 ;
 
 *
@@ -193,9 +189,9 @@ proc corr
     ;
     var 
         biocapacity_deficit_or_reserve
-		happiness_score
+	 happiness_score
     ;
 run;
+
 title;
 footnote;
-
diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas
@@ -3,28 +3,29 @@
 **************** 80-character banner for column width reference ***************;
 * (set window width to banner width to calibrate line length to 80 characters *;
 *******************************************************************************;
-*
+/*
 This file uses the following analytic dataset to address several research
 questions regarding countries' happiness.
 
-Dataset Name: cotw_2016_analytic_file created in external file
+Dataset Names: cotw_2016_analytic_file, cotw_2016_analytic_file_sort_hs, and
+cotw_2016_analytic_file_sort_hr, created in external file
 STAT660-01_f18-team-3_project2_data_preparation.sas, which is assumed to be
-in the same directory as this file
+in the same directory as this file.
 
 See included file for dataset properties
-;
+*/
 
 * environmental setup;
 
 * set relative file import path to current directory (using standard SAS trick);
 X "cd ""%substr(%sysget(SAS_EXECFILEPATH),1,%eval(%length(%sysget(SAS_EXECFILEPATH))-%length(%sysget(SAS_EXECFILENAME))))""";
 
-* load external file that generates analytic datasets cde_2014_analytic_file,
-  cde_2014_analytic_file_sort_frpm, and cde_2014_analytic_file_sort_sat;
+* load external file that generates analytic datasets cotw_2016_analytic_file,
+  cotw_2016_analytic_file_sort_hs, and cotw_2016_analytic_file_sort_hr;
 %include '.\STAT660-01_f18-team-3_project2_data_preparation.sas';
 
 *******************************************************************************;
-* Research Question Analysis Starting Point;
+* Research Question Analysis Starting Point                                    ;
 *******************************************************************************;
 title1
 "Research Question:  For the 20 largest countries, what are the top five countries that experienced the biggest decrease in Happiness Score between 2015 and 2016?"
@@ -46,7 +47,7 @@ footnote3
 "However, assuming there are no data issues underlying this analysis, possible explanations for such volatilities is due to the small population size. Note China has 1.4B people while U.S. has 320MM people."
 ;
 *******************************************************************************;
-*
+/*
 Note: This compares the column Happiness Score from happy_2015 to the 
 column of the same name from happy_2016.
 
@@ -61,10 +62,11 @@ Limitations: This methodology does not account for country's with missing data,
 nor does it attempt to validate data in any way, like filtering for percentages
 between 0 and 1.
 
-Follow-up Steps: More carefully clean values in order to filter out any possible
-illegal values, and better handle missing data, e.g., by using a previous 
-year's data or a rolling average of previous years' data as a proxy.
-; 
+Follow-up Steps: More carefully clean values in order to filter out any 
+possible illegal values, and better handle missing data, e.g., by using a 
+previous year's data or a rolling average of previous years' data as a proxy.
+*/
+*******************************************************************************;
 proc print 
     noobs label
     data = cotw_2016_analytic_file_sort_hs  (obs=5) 
@@ -109,7 +111,7 @@ proc sgplot
 run;
 
 *******************************************************************************;
-* Research Question Analysis Starting Point;
+* Research Question Analysis Starting Point                                    ;
 *******************************************************************************;
 title1
 'Research Question: Can "GPI" predict the "Happiness Score" in 2016?'
@@ -132,7 +134,7 @@ footnote3
 ;
 
 *******************************************************************************;
-*
+/*
 Note: This compares the change in GPI between 2015 and 2016 in gpi_raw 
 dataset to the column "Happiness Score" in happy_2016 dataset.
 
@@ -148,6 +150,15 @@ correlated. If not continue to build model than check test model assumptions.
    - Error variance is the same for all observations
   3) Y observations are not correlated with each other
 
+    Model Results
+    Happiness Score = 7.648 + (-1.104)*GPI
+    Type III SS p-value < 0.0001
+    22% of the variability in happiness score is explained by GPI
+
+    Correlation analysis shows -46.911% correlation. Thus, not correlated; go to
+    next step. Test for residual normality shows Shapiro-Wilk 0.2089 >= 0.05;
+    fail to reject Ho, so residuals are normally distributed.
+
 Goal: Find straight line that minimizes sum of squared distances from actual 
 weight to fitted line
 
@@ -157,30 +168,8 @@ only 22% can be explained by GPI.
 
 Follow-up Steps: A possible follow-up is to add additional X variables to
 improve model predictiveness.
-;
-*******************************************************************************;
-/*
-Model Results
-Happiness Score = 7.648 + (-1.104)*GPI
-
-Results:
-Type III SS p-value < 0.0001
-22% of the variability in happiness score is explained by GPI
 */
-*******************************************************************************;
-/*
-proc corr 
-    pearson spearman nomiss
-    data = cotw_2016_analytic_file 
-    plots = scatter (nvar=2 alpha=0.05) 
-    ;
-    var 
-        happiness_score gpi
-    ;
-run;
-*/
-*Results show -46.911% correlation. Thusly, not correlated can go to next step ;
-
+
 proc glm   
     data= cotw_2016_analytic_file 
     ;
@@ -195,18 +184,6 @@ proc glm
 run; 
 quit; 
 
-/* Test of Normality Assumption*/
-/*
-proc univariate 
-    data = resids normal plot
-    ;
-    var 
-       res
-    ;
-run;
-*/
-/* Since Shapiro-Wilk 0.2089 >= 0.05, failed to reject Ho, residuals are normally distributed*/
-
 title;
 footnote;
 
@@ -234,10 +211,10 @@ footnote3
 ;
 
 *******************************************************************************;
-*
+/*
 Note: This compares the column "HDI" from eco_2016 to the column 
 "Life Expectancy" in happy_2016. 
-*
+
 Methodology: Use PROC CORR to compute the Pearson product-moment correlation 
 coefficient between hdi and life_expectancy, as well as Spearman's 
 rank-order correlation, a nonparametric measure of association. PROC CORR 
@@ -251,7 +228,7 @@ Possible Follow-up Steps: More carefully clean the values of the variables
 so that the means computed do not include any possible illegal values. 
 And use proc plot to generate a graph of the variable hdi against 
 life expectancy.
-;
+*/
 *******************************************************************************;
 
 proc corr 
@@ -280,19 +257,3 @@ proc sgplot
     ; 
 quit;
 
-
-/*
-proc glm   
-    data= cotw_2016_analytic_file 
-    ;
-    model 
-        happiness_score = gpi hdi
-        /solution   
-    ;
-    output 
-        out=resids 
-        r  =res
-    ;
-run; 
-quit; 
-*/
diff --git a/STAT660-01_f18-team-3_project2_data_preparation.sas b/STAT660-01_f18-team-3_project2_data_preparation.sas
@@ -122,7 +122,26 @@ proc format;
        -0.014<- 0.000="Q2 Happiness Score %"
         0.000<- 0.021="Q3 Happiness Score %"
         0.021<- high ="Q4 Happiness Score %"
-    ; 
+    ;
+    value $country (default=40) /* recode dataset spellings to one standard */
+	'Palestinian Territorie'   = 'Palestinian Territories'
+	'Somaliland region'        = 'Somalia'
+	'Taiwan Province of China' = 'Taiwan'
+	'Congo, Democratic Republ' = 'Congo (Kinshasa)'
+	'Congo'                    = 'Congo (Brazzaville)'
+	'Iran, Islamic Republic o' = 'Iran'
+	"Lao People's Democratic"  = 'Laos'
+	'Macedonia TFYR'           = 'Macedonia'
+	'Korea, Republic of'       = 'South Korea'
+	'Korea, Democratic People' = 'North Korea'
+	'Syrian Arab Republic'     = 'Syria'
+	'Tanzania, United Republi' = 'Tanzania'
+	'United States of America' = 'United States'
+	'Venezuela, Bolivarian Re' = 'Venezuela'
+	'Viet Nam'                 = 'Vietnam'
+	'Palestine'                = 'Palestinian Territories'
+	'Republic of the Congo'    = 'Congo (Brazzaville)' /* capital: Brazzaville */
+    ; /* default=40: width else = longest label (23), cutting unmatched names */
 run;
 
 * setup environmental parameters;
@@ -181,56 +200,7 @@ run;
                 set 
                     &dsn
 		;
-                if country = 'Palestinian Territorie'   
-                    then country = 'Palestinian Territories'
-		;
-                if country = 'Somaliland region'        
-                    then country = 'Somalia'
-		; 
-                if country = 'Taiwan Province of China' 
-                    then country = 'Taiwan'
-		;
-                if country = 'Congo, Democratic Republ' 
-                    then country = 'Congo (Kinshasa)'
-		;
-                if country = 'Congo'                    
-                    then country = 'Congo (Brazzaville)'
-		;
-                if country = 'Iran, Islamic Republic o' 
-                    then country = 'Iran'
-		;
-                if country = "Lao People's Democratic"  
-                    then country = 'Laos'
-		;
-                if country = 'Macedonia TFYR'           
-                    then country = 'Macedonia'
-		;
-                if country = 'Korea, Republic of'       
-                    then country = 'South Korea'
-		;
-                if country = 'Korea, Democratic People' 
-                    then country = 'North Korea'
-		;
-                if country = 'Syrian Arab Republic'     
-                    then country = 'Syria'
-		;
-                if country = 'Tanzania, United Republi' 
-                    then country = 'Tanzania'
-		;
-                if country = 'United States of America' 
-                    then country = 'United States'
-		;
-                if country = 'Venezuela, Bolivarian Re' 
-                    then country = 'Venezuela'
-		;
-                if country = 'Viet Nam' 				
-                    then country = 'Vietnam'
-		;	
-                if country = 'Palestine'                
-                    then country = 'Palestinian Territories'
-		;
-                if country = 'Republic of the Congo'    
-                    then country = 'Congo (Kinshasa)'  
+		country = put(country, $country40.) /* char format; width 40, no cut */
 		;
             run;
 
@@ -480,8 +450,9 @@ run;
   cotw_2016_analytic_file_sort_hs by descending happiness_score_yoy  
   for largest 20 countries                                                     ;
 *******************************************************************************;
-proc sort nodupkey 
+proc sort 
     data = cotw_2016_analytic_file 
+    out  = cotw_2016_analytic_file_sort_hs
     ; 
     by descending 
         population_mm
@@ -490,7 +461,7 @@ run;
 
 data cotw_2016_analytic_file_sort_hs;
     set 
-        cotw_2016_analytic_file
+        cotw_2016_analytic_file_sort_hs
     ;
     if _n_<=20
     ;