forked from stat660-f18/team-1_project1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
STAT660-01_f18-team-1_project1_data_preparation.sas
executable file
·177 lines (155 loc) · 3.76 KB
/
STAT660-01_f18-team-1_project1_data_preparation.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
*******************************************************************************;
**************** 80-character banner for column width reference ***************;
* (set window width to banner width to calibrate line length to 80 characters *;
*******************************************************************************;
*
This file prepares the dataset described below for analysis.
[Dataset Name] FIFA 18 More Complete Player Dataset
[Experimental Units] FIFA 18 Players
[Number of Observations] 9,758
[Number of Features] 185
[Data Source]
https://www.kaggle.com/kevinmh/fifa-18-more-complete-player-dataset
data found in kaggle
[Data Dictionary]
https://www.kaggle.com/kevinmh/fifa-18-more-complete-player-dataset
[Unique ID Schema] The column "Player ID" is a primary key.
;
* setup environmental parameters;
%let inputDatasetURL =
https://github.com/stat660/team-1_project1/blob/master/FIFA_Player_Data.xls?raw=true
;
* load raw FRPM dataset over the wire;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
%put &=dsn;
%put &=url;
%put &=filetype;
%if
%sysfunc(exist(&dsn.)) = 0
%then
%do;
%put Loading dataset &dsn. over the wire now...;
filename tempfile TEMP;
proc http
method="get"
url="&url."
out=tempfile
;
run;
proc import
file=tempfile
out=&dsn.
dbms=&filetype.;
run;
filename tempfile clear;
%end;
%else
%do;
%put Dataset &dsn. already exists. Please delete and try again.;
%end;
%mend;
%loadDataIfNotAlreadyAvailable(
fifa18_raw,
&inputDatasetURL.,
xls
)
* check raw fifa18 dataset for duplicates with respect to its composite key;
proc sort
noduprecs
data=fifa18_raw
dupout=fifa18_raw_dups
out=fifa18_nodups
;
by
ID
;
run;
* build analytic dataset from fifa18 dataset with the least number of
columns and minimal cleaning/transformation needed to address research
questions in corresponding data-analysis files;
data fifa18_analytic_file;
retain
Club
Special
Age
League
birth_date
height_cm
weight_kg
body_type
nationality
eur_value
eur_wage
overall
;
keep
Club
Special
Age
League
birth_date
height_cm
weight_kg
body_type
nationality
eur_value
eur_wage
overall
;
set fifa18_nodups;
run;
*
Use PROC MEANS to compute the mean of eur_wage for
League, and output the results to a temporary dataset, and use PROC SORT
to extract and sort just the means the temporary dateset, which will be
used as part of data analysis by LL.
;
proc means
mean
noprint
data=fifa18_analytic_file
;
class
League
;
var
eur_wage
;
output
out=fifa18_League_EurWage
;
run;
proc sort
data=fifa18_LEague_EurWage(where=(_stat_="mean"))
;
by
descending eur_wage
;
run;
*
To answering the third question in JD data-analysis file, use PROC MEANS
to compute the mean of eur_value for user club, and output the results to
dataset "fifa18_Club_EurVal". Use PROC SORT extract and sort just the
means in the dataset "fifa18_Club_EurVal" by descending oreder;
proc means
mean
noprint
data=fifa18_analytic_file
;
class
club
;
var
eur_value
;
output
out=fifa18_Club_EurVal
;
run;
proc sort
data=fifa18_Club_EurVal(where=(_STAT_="MEAN"))
;
by
descending eur_value
;
run;