forked from stat660-f18/team-2_project1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
STAT660-01_f18-team-2_project1_data_preparation.sas
151 lines (136 loc) · 3.76 KB
/
STAT660-01_f18-team-2_project1_data_preparation.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
*******************************************************************************;
**************** 80-character banner for column width reference ***************;
* (set window width to banner width to calibrate line length to 80 characters *;
*******************************************************************************;
*
This file prepares the dataset described below for analysis.
[Dataset Name] Absenteeism at work
[Experimental Units] working staff
[Number of Observations] 740
[Number of Features] 21
[Data Source] The file
https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work was
downloaded from the UCI machine learning repository.
[Data Dictionary] https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work#
[Unique ID] ID, Absenteeism_time_in_hours form a composite key
;
* environmental setup;
*create output formats;
proc format;
value reasonofabsence
0 = 'NA'
1 = 'Infectious and parasitic disease'
2 = 'neoplasm'
3 = 'blood disease'
4 = 'endocrine disease'
5 = 'mental and behaviour disorder'
6 = 'nervous disease'
7 = 'eye'
8 = 'ear'
9 = 'circulatory'
10= 'respiratory'
11= 'digestive'
12= 'skin'
13= 'muscle'
14= 'genitourinary'
15= 'pregnancy'
16= 'perinatal'
17= 'congenital'
18= 'clinical'
19= 'injury'
20= 'morbidity and mortality'
21= 'factors'
22= 'followup'
23= 'medical consultation'
24= 'blodd donation'
25= 'lab'
26= 'unjustified'
27= 'physiotherapy'
28= 'dental'
;
run;
*setup environmental parameters;
%let inputDatasetURL =
https://github.com/stat660/team-2_project1/blob/master/Absenteeism_at_work.xls?raw=true
;
*load raw absenteeism_at_work dataset over the wire;
%macro loadDataIfNotAlreadyAvailable(dsn,url,filetype);
%put &=dsn;
%put &=url;
%put &=filetype;
%if
%sysfunc(exist(&dsn.)) = 0
%then
%do;
%put Loading dataset &dsn. over the wire now...;
filename tempfile "%sysfunc(getoption(work))/tempfile.xlsx";
proc http
method="get"
url="&url."
out=tempfile
;
run;
proc import
file=tempfile
out=&dsn.
dbms=&filetype.;
run;
filename tempfile clear;
%end;
%else
%do;
%put Dataset &dsn. already exists. Please delete and try again.;
%end;
%mend;
%loadDataIfNotAlreadyAvailable(
Absenteeism_at_work_raw,
&inputDatasetURL.,
xls
)
*check raw absenteeism_at_work dataset for duplicate records;
proc sort
noduprecs
data=Absenteeism_at_work_raw
dupout=Absenteeism_at_work_dups
out=Absenteeism_at_work_noduprecs
;
by
id
;
run;
* build analytic dataset from Absenteeism_at_work_noduprecs dataset with the
least number of columns and minimal cleaning/transformation needed to
address research questions in corresponding data-analysis files. And then
create a new variable which shows 1 when absenteeism_time_in_hours >0, and
0 otherwise;
data Absenteeism_analytic_file;
retain
ID
Reason_for_absence
Month_of_absence
Day_of_the_week
Absenteeism_time_in_hours
Work_load_Average_day
absence
;
keep
ID
Reason_for_absence
Month_of_absence
Day_of_the_week
Absenteeism_time_in_hours
Work_load_Average_day
absence
;
set Absenteeism_at_work_noduprecs;
if
absenteeism_time_in_hours >0
then
do;
absence = 1;
end;
else
do;
absence = 0;
end;
run;