From e275c9f97a9806d3a4be3eb99b5d1f5c86cb7163 Mon Sep 17 00:00:00 2001 From: mphan12-stat660 <42551898+mphan12-stat660@users.noreply.github.com> Date: Sun, 4 Nov 2018 17:10:17 -0800 Subject: [PATCH 1/7] updated data analysis MP wrapped 80 characters deleted unused code --- ...18-team-3_project2_data_analysis_by_MP.sas | 71 +++++-------------- 1 file changed, 18 insertions(+), 53 deletions(-) diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas index 770a237..73ff70c 100755 --- a/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas +++ b/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas @@ -3,7 +3,7 @@ **************** 80-character banner for column width reference ***************; * (set window width to banner width to calibrate line length to 80 characters *; *******************************************************************************; -* +/* This file uses the following analytic dataset to address several research questions regarding country's happiness. @@ -12,7 +12,7 @@ STAT660-01_f18-team-3_project2_data_preparation.sas, which is assumed to be in the same directory as this file See included file for dataset properties -; +*/ * environmental setup; @@ -46,7 +46,7 @@ footnote3 "However, assuming there are no data issues underlying this analysis, possible explanations for such volatilities is due to the small population size. Note China has 1.4B people while U.S. has 320MM people." ; *******************************************************************************; -* +/* Note: This compares the column Happiness Score from happy_2015 to the column of the same name from happy_2016. @@ -61,10 +61,11 @@ Limitations: This methodology does not account for country's with missing data, nor does it attempt to validate data in any way, like filtering for percentages between 0 and 1. 
-Follow-up Steps: More carefully clean values in order to filter out any possible -illegal values, and better handle missing data, e.g., by using a previous -year's data or a rolling average of previous years' data as a proxy. -; +Follow-up Steps: More carefully clean values in order to filter out any +possible illegal values, and better handle missing data, e.g., by using a +previous year's data or a rolling average of previous years' data as a proxy. +*/ +*******************************************************************************; proc print noobs label data = cotw_2016_analytic_file_sort_hs (obs=5) @@ -132,7 +133,7 @@ footnote3 ; *******************************************************************************; -* +/* Note: This compares the change in GPI between 2015 and 2016 in gpi_raw dataset to the column "Happiness Score" in happy_2016 dataset. @@ -157,7 +158,7 @@ only 22% can be explained by GPI. Follow-up Steps: A possible follow-up is add additional X variables to improve model predictiveness. -; +*/ *******************************************************************************; /* Model Results @@ -166,21 +167,13 @@ Happiness Score = 7.648 + (-1.104)*GPI Results: Type III SS p-value < 0.0001 22% of the variability in happiness score is explained by GPI + +Correlation shows show -46.911% correlation. Thusly, not correlated can go +to next step. Test for Residual normality, shows Shapiro-Wilk 0.2089 >= 0.05, +failed to reject Ho, residuals are normally distributed. */ *******************************************************************************; -/* -proc corr - pearson spearman nomiss - data = cotw_2016_analytic_file - plots = scatter (nvar=2 alpha=0.05) - ; - var - happiness_score gpi - ; -run; -*/ -*Results show -46.911% correlation. 
Thusly, not correlated can go to next step ; - + proc glm data= cotw_2016_analytic_file ; @@ -195,18 +188,6 @@ proc glm run; quit; -/* Test of Normality Assumption*/ -/* -proc univariate - data = resids normal plot - ; - var - res - ; -run; -*/ -/* Since Shapiro-Wilk 0.2089 >= 0.05, failed to reject Ho, residuals are normally distributed*/ - title; footnote; @@ -234,10 +215,10 @@ footnote3 ; *******************************************************************************; -* +/* Note: This compares the column "HDI" from eco_2016 to the column "Life Expectancy" in happy_2016. -* + Methodology: Use PROC CORR can to compute Pearson product-moment correlation coefficient between hdi and life_expectancy, as well as Spearman's rank-order correlation, a nonparametric measures of association. PROC CORR @@ -251,7 +232,7 @@ Possible Follow-up Steps: More carefully clean the values of the variables so that the means computed do not include any possible illegal values. And use proc plot to generate a graph of the variable hdi against life expectancy. 
-; +*/ *******************************************************************************; proc corr @@ -280,19 +261,3 @@ proc sgplot ; quit; - -/* -proc glm - data= cotw_2016_analytic_file - ; - model - happiness_score = gpi hdi - /solution - ; - output - out=resids - r =res - ; -run; -quit; -*/ From 43f3bf2f34d6577d02105862afd03cb31327217a Mon Sep 17 00:00:00 2001 From: mphan12 Date: Sun, 4 Nov 2018 17:41:12 -0800 Subject: [PATCH 2/7] updated data prep and data analysis MP 1) Replace if-then with put/format in data prep --- ...18-team-3_project2_data_analysis_by_MP.sas | 36 ++++----- ...1_f18-team-3_project2_data_preparation.sas | 80 ++++++------------- 2 files changed, 42 insertions(+), 74 deletions(-) diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas index 73ff70c..d35d9d1 100755 --- a/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas +++ b/STAT660-01_f18-team-3_project2_data_analysis_by_MP.sas @@ -7,9 +7,10 @@ This file uses the following analytic dataset to address several research questions regarding country's happiness. -Dataset Name: cotw_2016_analytic_file created in external file +Dataset Name: cotw_2016_analytic_file, cotw_2016_analytic_file_sort_hs, +and cotw_2016_analytic_file_sort_hr were created in external file STAT660-01_f18-team-3_project2_data_preparation.sas, which is assumed to be -in the same directory as this file +in the same directory as this file. 
See included file for dataset properties */ @@ -19,12 +20,12 @@ See included file for dataset properties * set relative file import path to current directory (using standard SAS trick); X "cd ""%substr(%sysget(SAS_EXECFILEPATH),1,%eval(%length(%sysget(SAS_EXECFILEPATH))-%length(%sysget(SAS_EXECFILENAME))))"""; -* load external file that generates analytic datasets cde_2014_analytic_file, - cde_2014_analytic_file_sort_frpm, and cde_2014_analytic_file_sort_sat; +* load external file that generates analytic datasets cotw_2016_analytic_file, + cotw_2016_analytic_file_sort_hs , and cotw_2016_analytic_file_sort_hr ; %include '.\STAT660-01_f18-team-3_project2_data_preparation.sas'; *******************************************************************************; -* Research Question Analysis Starting Point; +* Research Question Analysis Starting Point ; *******************************************************************************; title1 "Research Question: For the 20 largest countries, what are the top five countries that experienced the biggest decrease in Happiness Score between 2015 and 2016?" @@ -110,7 +111,7 @@ proc sgplot run; *******************************************************************************; -* Research Question Analysis Starting Point; +* Research Question Analysis Starting Point ; *******************************************************************************; title1 'Research Question: Can "GPI" predict the "Happiness Score" in 2016?' @@ -149,6 +150,15 @@ correlated. If not continue to build model than check test model assumptions. - Error variance is the same for all observations 3) Y observations are not correlated with each other + Model Results + Happiness Score = 7.648 + (-1.104)*GPI + Type III SS p-value < 0.0001 + 22% of the variability in happiness score is explained by GPI + + Correlation shows show -46.911% correlation. Thusly, not correlated can go + to next step. 
Test for Residual normality, shows Shapiro-Wilk 0.2089>=0.05, + failed to reject Ho, residuals are normally distributed. + Goal: Find straight line that minimizes sum of squared distances from actual weight to fitted line @@ -159,20 +169,6 @@ only 22% can be explained by GPI. Follow-up Steps: A possible follow-up is add additional X variables to improve model predictiveness. */ -*******************************************************************************; -/* -Model Results -Happiness Score = 7.648 + (-1.104)*GPI - -Results: -Type III SS p-value < 0.0001 -22% of the variability in happiness score is explained by GPI - -Correlation shows show -46.911% correlation. Thusly, not correlated can go -to next step. Test for Residual normality, shows Shapiro-Wilk 0.2089 >= 0.05, -failed to reject Ho, residuals are normally distributed. -*/ -*******************************************************************************; proc glm data= cotw_2016_analytic_file diff --git a/STAT660-01_f18-team-3_project2_data_preparation.sas b/STAT660-01_f18-team-3_project2_data_preparation.sas index 4a02b93..54832ea 100755 --- a/STAT660-01_f18-team-3_project2_data_preparation.sas +++ b/STAT660-01_f18-team-3_project2_data_preparation.sas @@ -122,7 +122,27 @@ proc format; -0.014<- 0.000="Q2 Happiness Score %" 0.000<- 0.021="Q3 Happiness Score %" 0.021<- high ="Q4 Happiness Score %" - ; + ; + value $ country + 'Palestinian Territorie' = 'Palestinian Territories' + 'Somaliland region' = 'Somalia' + 'Taiwan Province of China' = 'Taiwan' + 'Congo, Democratic Republ' = 'Congo (Kinshasa)' + 'Congo' = 'Congo (Brazzaville)' + 'Iran, Islamic Republic o' = 'Iran' + "Lao People's Democratic" = 'Laos' + 'Macedonia TFYR' = 'Macedonia' + 'Korea, Republic of' = 'South Korea' + 'Korea, Democratic People' = 'North Korea' + 'Syrian Arab Republic' = 'Syria' + 'Tanzania, United Republi' = 'Tanzania' + 'United States of America' = 'United States' + 'Venezuela, Bolivarian Re' = 'Venezuela' + 'Viet Nam' = 
'Vietnam' + 'Palestine' = 'Palestinian Territories' + 'Republic of the Congo' = 'Congo (Kinshasa)' + ; + run; * setup environmental parameters; @@ -181,57 +201,8 @@ run; set &dsn ; - if country = 'Palestinian Territorie' - then country = 'Palestinian Territories' - ; - if country = 'Somaliland region' - then country = 'Somalia' - ; - if country = 'Taiwan Province of China' - then country = 'Taiwan' - ; - if country = 'Congo, Democratic Republ' - then country = 'Congo (Kinshasa)' - ; - if country = 'Congo' - then country = 'Congo (Brazzaville)' - ; - if country = 'Iran, Islamic Republic o' - then country = 'Iran' - ; - if country = "Lao People's Democratic" - then country = 'Laos' - ; - if country = 'Macedonia TFYR' - then country = 'Macedonia' - ; - if country = 'Korea, Republic of' - then country = 'South Korea' - ; - if country = 'Korea, Democratic People' - then country = 'North Korea' - ; - if country = 'Syrian Arab Republic' - then country = 'Syria' - ; - if country = 'Tanzania, United Republi' - then country = 'Tanzania' - ; - if country = 'United States of America' - then country = 'United States' - ; - if country = 'Venezuela, Bolivarian Re' - then country = 'Venezuela' - ; - if country = 'Viet Nam' - then country = 'Vietnam' - ; - if country = 'Palestine' - then country = 'Palestinian Territories' - ; - if country = 'Republic of the Congo' - then country = 'Congo (Kinshasa)' - ; + country=put(country,country.) 
+ ; run; ***************************************************************************; @@ -480,8 +451,9 @@ run; cotw_2016_analytic_file_sort_hs by descending happiness_score_yoy for largest 20 countries ; *******************************************************************************; -proc sort nodupkey +proc sort data = cotw_2016_analytic_file + out = cotw_2016_analytic_file_sort_hs ; by descending population_mm @@ -490,7 +462,7 @@ run; data cotw_2016_analytic_file_sort_hs; set - cotw_2016_analytic_file + cotw_2016_analytic_file_sort_hs ; if _n_<=20 ; From de3b67e9813daa8e84f6ed8dc272d61521ab2e8a Mon Sep 17 00:00:00 2001 From: mphan12-stat660 <42551898+mphan12-stat660@users.noreply.github.com> Date: Sun, 4 Nov 2018 18:54:05 -0800 Subject: [PATCH 3/7] updated data prep fixed indentation for country format --- ...1_f18-team-3_project2_data_preparation.sas | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/STAT660-01_f18-team-3_project2_data_preparation.sas b/STAT660-01_f18-team-3_project2_data_preparation.sas index 54832ea..2463c7c 100755 --- a/STAT660-01_f18-team-3_project2_data_preparation.sas +++ b/STAT660-01_f18-team-3_project2_data_preparation.sas @@ -125,24 +125,23 @@ proc format; ; value $ country 'Palestinian Territorie' = 'Palestinian Territories' - 'Somaliland region' = 'Somalia' - 'Taiwan Province of China' = 'Taiwan' - 'Congo, Democratic Republ' = 'Congo (Kinshasa)' - 'Congo' = 'Congo (Brazzaville)' - 'Iran, Islamic Republic o' = 'Iran' - "Lao People's Democratic" = 'Laos' - 'Macedonia TFYR' = 'Macedonia' - 'Korea, Republic of' = 'South Korea' - 'Korea, Democratic People' = 'North Korea' - 'Syrian Arab Republic' = 'Syria' - 'Tanzania, United Republi' = 'Tanzania' - 'United States of America' = 'United States' - 'Venezuela, Bolivarian Re' = 'Venezuela' - 'Viet Nam' = 'Vietnam' - 'Palestine' = 'Palestinian Territories' + 'Somaliland region' = 'Somalia' + 'Taiwan Province of China' = 'Taiwan' + 'Congo, Democratic Republ' 
= 'Congo (Kinshasa)' + 'Congo' = 'Congo (Brazzaville)' + 'Iran, Islamic Republic o' = 'Iran' + "Lao People's Democratic" = 'Laos' + 'Macedonia TFYR' = 'Macedonia' + 'Korea, Republic of' = 'South Korea' + 'Korea, Democratic People' = 'North Korea' + 'Syrian Arab Republic' = 'Syria' + 'Tanzania, United Republi' = 'Tanzania' + 'United States of America' = 'United States' + 'Venezuela, Bolivarian Re' = 'Venezuela' + 'Viet Nam' = 'Vietnam' + 'Palestine' = 'Palestinian Territories' 'Republic of the Congo' = 'Congo (Kinshasa)' ; - run; * setup environmental parameters; @@ -201,8 +200,8 @@ run; set &dsn ; - country=put(country,country.) - ; + country=put(country,country.) + ; run; ***************************************************************************; From da83bb0b91f2465fe9cdc3518eb5f7f1da552bf4 Mon Sep 17 00:00:00 2001 From: mphan12-stat660 <42551898+mphan12-stat660@users.noreply.github.com> Date: Sun, 4 Nov 2018 18:55:10 -0800 Subject: [PATCH 4/7] Update STAT660-01_f18-team-3_project2_data_preparation.sas --- ...1_f18-team-3_project2_data_preparation.sas | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/STAT660-01_f18-team-3_project2_data_preparation.sas b/STAT660-01_f18-team-3_project2_data_preparation.sas index 2463c7c..43506fe 100755 --- a/STAT660-01_f18-team-3_project2_data_preparation.sas +++ b/STAT660-01_f18-team-3_project2_data_preparation.sas @@ -125,21 +125,21 @@ proc format; ; value $ country 'Palestinian Territorie' = 'Palestinian Territories' - 'Somaliland region' = 'Somalia' - 'Taiwan Province of China' = 'Taiwan' - 'Congo, Democratic Republ' = 'Congo (Kinshasa)' - 'Congo' = 'Congo (Brazzaville)' - 'Iran, Islamic Republic o' = 'Iran' - "Lao People's Democratic" = 'Laos' - 'Macedonia TFYR' = 'Macedonia' - 'Korea, Republic of' = 'South Korea' - 'Korea, Democratic People' = 'North Korea' - 'Syrian Arab Republic' = 'Syria' - 'Tanzania, United Republi' = 'Tanzania' - 'United States of America' = 'United States' - 
'Venezuela, Bolivarian Re' = 'Venezuela' - 'Viet Nam' = 'Vietnam' - 'Palestine' = 'Palestinian Territories' + 'Somaliland region' = 'Somalia' + 'Taiwan Province of China' = 'Taiwan' + 'Congo, Democratic Republ' = 'Congo (Kinshasa)' + 'Congo' = 'Congo (Brazzaville)' + 'Iran, Islamic Republic o' = 'Iran' + "Lao People's Democratic" = 'Laos' + 'Macedonia TFYR' = 'Macedonia' + 'Korea, Republic of' = 'South Korea' + 'Korea, Democratic People' = 'North Korea' + 'Syrian Arab Republic' = 'Syria' + 'Tanzania, United Republi' = 'Tanzania' + 'United States of America' = 'United States' + 'Venezuela, Bolivarian Re' = 'Venezuela' + 'Viet Nam' = 'Vietnam' + 'Palestine' = 'Palestinian Territories' 'Republic of the Congo' = 'Congo (Kinshasa)' ; run; From 8c96d627134737aac72f7eff6592a4f16aebbf37 Mon Sep 17 00:00:00 2001 From: anguyen152-stat660 <42627974+anguyen152-stat660@users.noreply.github.com> Date: Sat, 10 Nov 2018 22:50:29 -0800 Subject: [PATCH 5/7] Update STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas - Update for consistent indentation - Blank lines are used to separate comment paragraph - Unused footnote is deleted. 
--- ...18-team-3_project2_data_analysis_by_AN.sas | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas index b2f3633..ad766d5 100644 --- a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas +++ b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas @@ -5,8 +5,8 @@ * This file uses the following analytic dataset to address several research -questions regarding happiness, peace index and biocapacity / -ecological-footprint +questions regarding happiness,peace index and biocapacity,ecological-footprint + Dataset Name: cotw_2016_analytic_file created in external file STAT660-01 f18-team-3 project2 data preparation.sas, which is assumed to be in the same directory as this file @@ -50,12 +50,12 @@ footnote3 ; * -Note:This compares the column “Happiness Rank” from happy_2015 to the column +Note:This compares the column “Happiness Rank” from happy_2015 to the column of the same name from happy_2016. Methodology: When combining happy_2015 with happy_2016 during data preparation, take the difference of values of "happiness_rank" for each school and create -a new variable called happiness_rank_yoy . Then, use proc sort to create a +a new variable called happiness_rank_yoy. Then, use proc sort to create a temporary sorted table in descending by happiness_rank_yoy. Finally, use proc print here to display the first five rows of the sorted dataset. @@ -88,11 +88,11 @@ footnote; *******************************************************************************; title1 -"Research Question: Is "GPI" in a relationship with "Happiness Score" ? " +'Research Question: Is "GPI" in a relationship with "Happiness Score" ?' 
; title2 -"Rationale: This will help determine if peace status of a country makes people happier" +'Rationale: This will help determine if peace status of a country makes people happier' ; footnote1 @@ -108,7 +108,7 @@ footnote3 ; * -Note:This compares the column “gpi” from gpi_2008-2016 to the column +Note:This compares the column “gpi” from gpi_2008-2016 to the column happiness_score from happy_2016. Methodology:Use proc means to compute 5-number summaries of gpi @@ -130,7 +130,7 @@ proc freq table gpi *happiness_score - / missing norow nocol nopercent + / missing norow nocol nopercent ; where not(missing(happiness_score)) @@ -150,11 +150,11 @@ footnote; *******************************************************************************; title1 -'Is there a correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”? ' +'Is there a correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”? ' ; title2 -"Rationale: Rationale: Biocapacity Deficit or Reserve reveals the sustainability status of a country, which can lead to a happier fulfilled life" +'Rationale:Biocapacity Deficit or Reserve reveals the sustainability status of a country, which can lead to a happier fulfilled life' ; footnote1 @@ -162,11 +162,7 @@ footnote1 ; footnote2 -"However, Pearson Chi-Sq Test shows p-value > 0.05, therefore there is not enough evidence to show the correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”" -; - -footnote3 -"" +"However, Pearson Chi-Sq Test shows p-value > 0.05, therefore there is not enough evidence to show the correlation between “Biocapacity Deficit or Reserve” and “Happiness Score”" ; * @@ -193,9 +189,9 @@ proc corr ; var biocapacity_deficit_or_reserve - happiness_score + happiness_score ; run; + title; footnote; - From 2ede0d56bb58b574eb1daa9023dd3b2a14b459ab Mon Sep 17 00:00:00 2001 From: anguyen152-stat660 <42627974+anguyen152-stat660@users.noreply.github.com> Date: Sat, 10 Nov 2018 22:53:45 -0800 Subject: 
[PATCH 6/7] Update STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas --- STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas index ad766d5..af3f599 100644 --- a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas +++ b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas @@ -130,7 +130,7 @@ proc freq table gpi *happiness_score - / missing norow nocol nopercent + / missing norow nocol nopercent ; where not(missing(happiness_score)) From c640eed7dd4e19f9fe42e43c8c0337c02d3246f8 Mon Sep 17 00:00:00 2001 From: anguyen152-stat660 <42627974+anguyen152-stat660@users.noreply.github.com> Date: Sat, 10 Nov 2018 22:57:24 -0800 Subject: [PATCH 7/7] Update STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas --- STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas index af3f599..fe3c9b5 100644 --- a/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas +++ b/STAT660-01_f18-team-3_project2_data_analysis_by_AN.sas @@ -189,7 +189,7 @@ proc corr ; var biocapacity_deficit_or_reserve - happiness_score + happiness_score ; run;