RunExamples/rCh07.html

<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c07_Linear_and_logistic_regression&#39;</span>,
      <span class="st">&#39;../PUMS&#39;</span>,<span class="dt">last=</span><span class="dv">111</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  100 Fri Jun 17 10:40:33 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00100_example_7.1_of_section_7.1.1.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 7.1 of section 7.1.1 
&gt; # (example 7.1 of section 7.1.1)  : Linear and logistic regression : Using linear regression : Understanding linear regression 
&gt; # Title: Loading the PUMS data 
&gt; 
&gt; load(&quot;psub.RData&quot;)

&gt; dtrain &lt;- subset(psub,ORIGRANDGROUP &gt;= 500)

&gt; dtest &lt;- subset(psub,ORIGRANDGROUP &lt; 500)

&gt; model &lt;- lm(log(PINCP,base=10) ~ AGEP + SEX + COW + SCHL,data=dtrain)

&gt; dtest$predLogPINCP &lt;- predict(model,newdata=dtest)

&gt; dtrain$predLogPINCP &lt;- predict(model,newdata=dtrain)
[1] &quot;############################### end  100 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  101 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00101_example_7.2_of_section_7.1.3.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 7.2 of section 7.1.3 
&gt; # (example 7.2 of section 7.1.3)  : Linear and logistic regression : Using linear regression : Making predictions 
&gt; # Title: Plotting log income as a function of predicted log income 
&gt; 
&gt; library(&#39;ggplot2&#39;)

&gt; ggplot(data=dtest,aes(x=predLogPINCP,y=log(PINCP,base=10))) +
    geom_point(alpha=0.2,color=&quot;black&quot;) +
    geom_smooth(aes(x=predLogPINCP,
       y=log(PINCP,base=10)),color=&quot;black&quot;) +
    geom_line(aes(x=log(PINCP,base=10),
       y=log(PINCP,base=10)),color=&quot;blue&quot;,linetype=2) +
    scale_x_continuous(limits=c(4,5)) +
    scale_y_continuous(limits=c(3.5,5.5))

Warning: Removed 9 rows containing non-finite values (stat_smooth).

Warning: Removed 9 rows containing missing values (geom_point).

Warning: Removed 67 rows containing missing values (geom_path).</code></pre>
<div class="figure">
<img src="rCh07_files/figure-markdown_github/ch7ex1-1.png" alt="" />

</div>
<pre><code>[1] &quot;############################### end  101 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  102 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00102_example_7.3_of_section_7.1.3.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 7.3 of section 7.1.3 
&gt; # (example 7.3 of section 7.1.3)  : Linear and logistic regression : Using linear regression : Making predictions 
&gt; # Title: Plotting residuals as a function of predicted log income 
&gt; 
&gt; ggplot(data=dtest,aes(x=predLogPINCP,
                      y=predLogPINCP-log(PINCP,base=10))) +
   geom_point(alpha=0.2,color=&quot;black&quot;) +
   geom_smooth(aes(x=predLogPINCP,
                   y=predLogPINCP-log(PINCP,base=10)),
                   color=&quot;black&quot;)</code></pre>
<div class="figure">
<img src="rCh07_files/figure-markdown_github/ch7ex1-2.png" alt="" />

</div>
<pre><code>[1] &quot;############################### end  102 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  103 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00103_example_7.4_of_section_7.1.3.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 7.4 of section 7.1.3 
&gt; # (example 7.4 of section 7.1.3)  : Linear and logistic regression : Using linear regression : Making predictions 
&gt; # Title: Computing R-squared 
&gt; 
&gt; rsq &lt;- function(y,f) { 1 - sum((y-f)^2)/sum((y-mean(y))^2) }

&gt; rsq(log(dtrain$PINCP,base=10),predict(model,newdata=dtrain))
[1] 0.3382568

&gt; rsq(log(dtest$PINCP,base=10),predict(model,newdata=dtest))
[1] 0.2605496
[1] &quot;############################### end  103 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  104 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00104_example_7.5_of_section_7.1.3.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 7.5 of section 7.1.3 
&gt; # (example 7.5 of section 7.1.3)  : Linear and logistic regression : Using linear regression : Making predictions 
&gt; # Title: Calculating root mean square error 
&gt; 
&gt; rmse &lt;- function(y, f) { sqrt(mean( (y-f)^2 )) }

&gt; rmse(log(dtrain$PINCP,base=10),predict(model,newdata=dtrain))
[1] 0.2651856

&gt; rmse(log(dtest$PINCP,base=10),predict(model,newdata=dtest))
[1] 0.2752171
[1] &quot;############################### end  104 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  107 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00107_example_7.6_of_section_7.1.5.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # example 7.6 of section 7.1.5 
&gt; # (example 7.6 of section 7.1.5)  : Linear and logistic regression : Using linear regression : Reading the model summary and characterizing coefficient quality 
&gt; # Title: Summarizing residuals 
&gt; 
&gt; summary(log(dtrain$PINCP,base=10) - predict(model,newdata=dtrain))
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-1.29200 -0.14150  0.02458  0.00000  0.17630  0.62530 

&gt; ##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
&gt; ## -1.29200 -0.14150  0.02458  0.00000  0.17630  0.62530
&gt; summary(log(dtest$PINCP,base=10) - predict(model,newdata=dtest))
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-1.494000 -0.165300  0.018920 -0.004637  0.175500  0.868100 

&gt; ##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
&gt; ## -1.494000 -0.165300  0.018920 -0.004637  0.175500  0.868100
&gt; 
[1] &quot;############################### end  107 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  109 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00109_informalexample_7.9_of_section_7.1.5.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # informalexample 7.9 of section 7.1.5 
&gt; # (informalexample 7.9 of section 7.1.5)  : Linear and logistic regression : Using linear regression : Reading the model summary and characterizing coefficient quality 
&gt; 
&gt; df &lt;- dim(dtrain)[1] - dim(summary(model)$coefficients)[1]
[1] &quot;############################### end  109 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  110 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00110_informalexample_7.10_of_section_7.1.5.R&quot;
[1] &quot;#####   in directory ../PUMS&quot;

&gt; # informalexample 7.10 of section 7.1.5 
&gt; # (informalexample 7.10 of section 7.1.5)  : Linear and logistic regression : Using linear regression : Reading the model summary and characterizing coefficient quality 
&gt; 
&gt; modelResidualError &lt;- sqrt(sum(residuals(model)^2)/df)
[1] &quot;############################### end  110 Fri Jun 17 10:40:34 2016&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c07_Linear_and_logistic_regression&#39;</span>,
      <span class="st">&#39;../CDC&#39;</span>,<span class="dt">first=</span><span class="dv">112</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  112 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00112_example_7.7_of_section_7.2.1.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.7 of section 7.2.1 
&gt; # (example 7.7 of section 7.2.1)  : Linear and logistic regression : Using logistic regression : Understanding logistic regression 
&gt; # Title: Loading the CDC data 
&gt; 
&gt; load(&quot;NatalRiskData.rData&quot;)

&gt; train &lt;- sdata[sdata$ORIGRANDGROUP&lt;=5,]

&gt; test &lt;- sdata[sdata$ORIGRANDGROUP&gt;5,]
[1] &quot;############################### end  112 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  113 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00113_example_7.8_of_section_7.2.2.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.8 of section 7.2.2 
&gt; # (example 7.8 of section 7.2.2)  : Linear and logistic regression : Using logistic regression : Building a logistic regression model 
&gt; # Title: Building the model formula 
&gt; 
&gt; complications &lt;- c(&quot;ULD_MECO&quot;,&quot;ULD_PRECIP&quot;,&quot;ULD_BREECH&quot;)

&gt;  riskfactors &lt;- c(&quot;URF_DIAB&quot;, &quot;URF_CHYPER&quot;, &quot;URF_PHYPER&quot;,
                 &quot;URF_ECLAM&quot;)

&gt; y &lt;- &quot;atRisk&quot;

&gt; x &lt;- c(&quot;PWGT&quot;,
       &quot;UPREVIS&quot;,
       &quot;CIG_REC&quot;,
       &quot;GESTREC3&quot;,
       &quot;DPLURAL&quot;,
       complications,
       riskfactors)

&gt; fmla &lt;- paste(y, paste(x, collapse=&quot;+&quot;), sep=&quot;~&quot;)
[1] &quot;############################### end  113 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;############################### start  114 Fri Jun 17 10:40:34 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00114_example_7.9_of_section_7.2.2.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.9 of section 7.2.2 
&gt; # (example 7.9 of section 7.2.2)  : Linear and logistic regression : Using logistic regression : Building a logistic regression model 
&gt; # Title: Fitting the logistic regression model 
&gt; 
&gt; print(fmla)
[1] &quot;atRisk~PWGT+UPREVIS+CIG_REC+GESTREC3+DPLURAL+ULD_MECO+ULD_PRECIP+ULD_BREECH+URF_DIAB+URF_CHYPER+URF_PHYPER+URF_ECLAM&quot;

&gt; ## [1] &quot;atRisk ~ PWGT+UPREVIS+CIG_REC+GESTREC3+DPLURAL+ULD_MECO+ULD_PRECIP+
&gt; ##                    ULD_BREECH+URF_DIAB+URF_CHYPER+URF_PHYPER+URF_ECLAM&quot;
&gt; 
&gt; model &lt;- glm(fmla, data=train, family=binomial(link=&quot;logit&quot;))
[1] &quot;############################### end  114 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  115 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00115_example_7.10_of_section_7.2.3.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.10 of section 7.2.3 
&gt; # (example 7.10 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions 
&gt; # Title: Applying the logistic regression model 
&gt; 
&gt; train$pred &lt;- predict(model, newdata=train, type=&quot;response&quot;)

&gt; test$pred &lt;- predict(model, newdata=test, type=&quot;response&quot;)
[1] &quot;############################### end  115 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  116 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00116_example_7.11_of_section_7.2.3.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.11 of section 7.2.3 
&gt; # (example 7.11 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions 
&gt; # Title: Plotting distribution of prediction score grouped by known outcome 
&gt; 
&gt; library(&#39;ggplot2&#39;)

&gt; ggplot(train, aes(x=pred, color=atRisk, linetype=atRisk)) +
        geom_density()

[1] &quot;############################### end  116 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  117 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00117_example_7.12_of_section_7.2.3.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.12 of section 7.2.3 
&gt; # (example 7.12 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions 
&gt; # Title: Exploring modeling trade-offs 
&gt; 
&gt; library(ROCR)                                         # Note: 1 

Loading required package: gplots


Attaching package: &#39;gplots&#39;

The following object is masked from &#39;package:stats&#39;:

    lowess</code></pre>
<div class="figure">
<img src="rCh07_files/figure-markdown_github/ch7ex2-1.png" alt="" />

</div>
<pre><code>&gt; library(grid)                                         # Note: 2 

&gt; predObj &lt;- prediction(train$pred, train$atRisk)       # Note: 3 

&gt; precObj &lt;- performance(predObj, measure=&quot;prec&quot;)       # Note: 4 

&gt; recObj &lt;- performance(predObj, measure=&quot;rec&quot;)         # Note: 5 

&gt; precision &lt;- (precObj@y.values)[[1]]                  # Note: 6 

&gt; prec.x &lt;- (precObj@x.values)[[1]]                     # Note: 7 

&gt; recall &lt;- (recObj@y.values)[[1]]

&gt; rocFrame &lt;- data.frame(threshold=prec.x, precision=precision,
                       recall=recall)                   # Note: 8 

&gt; nplot &lt;- function(plist) {                            # Note: 9 
   n &lt;- length(plist)
   grid.newpage()
   pushViewport(viewport(layout=grid.layout(n,1)))
   vplayout=function(x,y) {viewport(layout.pos.row=x, layout.pos.col=y)}
   for(i in 1:n) {
     print(plist[[i]], vp=vplayout(i,1))
   }
 }

&gt; pnull &lt;- mean(as.numeric(train$atRisk))               # Note: 10 

&gt; p1 &lt;- ggplot(rocFrame, aes(x=threshold)) +            # Note: 11 
   geom_line(aes(y=precision/pnull)) +
   coord_cartesian(xlim = c(0,0.05), ylim=c(0,10) )

&gt; p2 &lt;- ggplot(rocFrame, aes(x=threshold)) +            # Note: 12 
   geom_line(aes(y=recall)) +
   coord_cartesian(xlim = c(0,0.05) )

&gt; nplot(list(p1, p2))                                   # Note: 13

Warning: Removed 1 rows containing missing values (geom_path).</code></pre>
<div class="figure">
<img src="rCh07_files/figure-markdown_github/ch7ex2-2.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Load ROCR library. 
&gt; 
&gt; # Note 2: 
&gt; #   Load grid library (you’ll need this for the 
&gt; #   nplot function below). 
&gt; 
&gt; # Note 3: 
&gt; #   Create ROCR prediction object. 
&gt; 
&gt; # Note 4: 
&gt; #   Create ROCR object to calculate precision as 
&gt; #   a function of threshold. 
&gt; 
&gt; # Note 5: 
&gt; #   Create ROCR object to calculate recall as a 
&gt; #   function of threshold. 
&gt; 
&gt; # Note 6: 
&gt; #   ROCR objects are what R calls S4 objects; 
&gt; #   the slots (or fields) of an S4 object are stored 
&gt; #   as lists within the object. You extract the slots 
&gt; #   from an S4 object using @ notation. 
&gt; 
&gt; # Note 7: 
&gt; #   The x values (thresholds) are the same in 
&gt; #   both precObj and recObj, so you only need to 
&gt; #   extract them once. 
&gt; 
&gt; # Note 8: 
&gt; #   Build data frame with thresholds, precision, 
&gt; #   and recall. 
&gt; 
&gt; # Note 9: 
&gt; #   Function to plot multiple plots on one page 
&gt; #   (stacked). 
&gt; 
&gt; # Note 10: 
&gt; #   Calculate rate of at-risk births in the 
&gt; #   training set. 
&gt; 
&gt; # Note 11: 
&gt; #   Plot enrichment rate as a function of 
&gt; #   threshold. 
&gt; 
&gt; # Note 12: 
&gt; #   Plot recall as a function of 
&gt; #   threshold. 
&gt; 
&gt; # Note 13: 
&gt; #   Show both plots simultaneously. 
&gt; 
[1] &quot;############################### end  117 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  118 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00118_example_7.13_of_section_7.2.3.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.13 of section 7.2.3 
&gt; # (example 7.13 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions 
&gt; # Title: Evaluating our chosen model 
&gt; 
&gt; ctab.test &lt;- table(pred=test$pred&gt;0.02, atRisk=test$atRisk)   # Note: 1 

&gt; ctab.test                                                         # Note: 2 
       atRisk
pred    FALSE TRUE
  FALSE  9487   93
  TRUE   2405  116

&gt; ##        atRisk
&gt; ## pred    FALSE TRUE
&gt; ##   FALSE  9487   93
&gt; ##   TRUE   2405  116
&gt; precision &lt;- ctab.test[2,2]/sum(ctab.test[2,])

&gt; precision
[1] 0.04601349

&gt; ## [1] 0.04601349
&gt; recall &lt;- ctab.test[2,2]/sum(ctab.test[,2])

&gt; recall
[1] 0.5550239

&gt; ## [1] 0.5550239
&gt; enrich &lt;- precision/mean(as.numeric(test$atRisk))

&gt; enrich
[1] 2.664159

&gt; ## [1] 2.664159
&gt; 
&gt; # Note 1: 
&gt; #   Build confusion matrix. 
&gt; 
&gt; # Note 2: 
&gt; #   Rows contain predicted negatives and 
&gt; #   positives; columns contain actual negatives and 
&gt; #   positives. 
&gt; 
[1] &quot;############################### end  118 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  119 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00119_example_7.14_of_section_7.2.4.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.14 of section 7.2.4 
&gt; # (example 7.14 of section 7.2.4)  : Linear and logistic regression : Using logistic regression : Finding relations and extracting advice from logistic models 
&gt; # Title: The model coefficients 
&gt; 
&gt; coefficients(model)
             (Intercept)                     PWGT                  UPREVIS 
             -4.41218940               0.00376166              -0.06328943 
             CIG_RECTRUE       GESTREC3&lt; 37 weeks DPLURALtriplet or higher 
              0.31316930               1.54518311               1.39419294 
             DPLURALtwin             ULD_MECOTRUE           ULD_PRECIPTRUE 
              0.31231871               0.81842627               0.19172008 
          ULD_BREECHTRUE             URF_DIABTRUE           URF_CHYPERTRUE 
              0.74923672              -0.34646672               0.56002503 
          URF_PHYPERTRUE            URF_ECLAMTRUE 
              0.16159872               0.49806435 

&gt; ##              (Intercept)                     PWGT
&gt; ##              -4.41218940               0.00376166
&gt; ##                  UPREVIS              CIG_RECTRUE
&gt; ##              -0.06328943               0.31316930
&gt; ##       GESTREC3&lt; 37 weeks DPLURALtriplet or higher
&gt; ##               1.54518311               1.39419294
&gt; ##              DPLURALtwin             ULD_MECOTRUE
&gt; ##               0.31231871               0.81842627
&gt; ##           ULD_PRECIPTRUE           ULD_BREECHTRUE
&gt; ##               0.19172008               0.74923672
&gt; ##             URF_DIABTRUE           URF_CHYPERTRUE
&gt; ##              -0.34646672               0.56002503
&gt; ##           URF_PHYPERTRUE            URF_ECLAMTRUE
&gt; ##               0.16159872               0.49806435
&gt; 
[1] &quot;############################### end  119 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  120 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00120_example_7.15_of_section_7.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.15 of section 7.2.5 
&gt; # (example 7.15 of section 7.2.5)  : Linear and logistic regression : Using logistic regression : Reading the model summary and characterizing coefficients 
&gt; # Title: The model summary 
&gt; 
&gt; summary(model)

Call:
glm(formula = fmla, family = binomial(link = &quot;logit&quot;), data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9732  -0.1818  -0.1511  -0.1358   3.2641  

Coefficients:
                          Estimate Std. Error z value Pr(&gt;|z|)    
(Intercept)              -4.412189   0.289352 -15.249  &lt; 2e-16 ***
PWGT                      0.003762   0.001487   2.530 0.011417 *  
UPREVIS                  -0.063289   0.015252  -4.150 3.33e-05 ***
CIG_RECTRUE               0.313169   0.187230   1.673 0.094398 .  
GESTREC3&lt; 37 weeks        1.545183   0.140795  10.975  &lt; 2e-16 ***
DPLURALtriplet or higher  1.394193   0.498866   2.795 0.005194 ** 
DPLURALtwin               0.312319   0.241088   1.295 0.195163    
ULD_MECOTRUE              0.818426   0.235798   3.471 0.000519 ***
ULD_PRECIPTRUE            0.191720   0.357680   0.536 0.591951    
ULD_BREECHTRUE            0.749237   0.178129   4.206 2.60e-05 ***
URF_DIABTRUE             -0.346467   0.287514  -1.205 0.228187    
URF_CHYPERTRUE            0.560025   0.389678   1.437 0.150676    
URF_PHYPERTRUE            0.161599   0.250003   0.646 0.518029    
URF_ECLAMTRUE             0.498064   0.776948   0.641 0.521489    
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2698.7  on 14211  degrees of freedom
Residual deviance: 2463.0  on 14198  degrees of freedom
AIC: 2491

Number of Fisher Scoring iterations: 7


&gt; ## Call:
&gt; ## glm(formula = fmla, family = binomial(link = &quot;logit&quot;), data = train)
&gt; ## 
&gt; ## Deviance Residuals:
&gt; ##     Min       1Q   Median       3Q      Max
&gt; ## -0.9732  -0.1818  -0.1511  -0.1358   3.2641
&gt; ## 
&gt; ## Coefficients:
&gt; ##                           Estimate Std. Error z value Pr(&gt;|z|)
&gt; ## (Intercept)              -4.412189   0.289352 -15.249  &lt; 2e-16 ***
&gt; ## PWGT                      0.003762   0.001487   2.530 0.011417 *
&gt; ## UPREVIS                  -0.063289   0.015252  -4.150 3.33e-05 ***
&gt; ## CIG_RECTRUE               0.313169   0.187230   1.673 0.094398 .
&gt; ## GESTREC3&lt; 37 weeks        1.545183   0.140795  10.975  &lt; 2e-16 ***
&gt; ## DPLURALtriplet or higher  1.394193   0.498866   2.795 0.005194 **
&gt; ## DPLURALtwin               0.312319   0.241088   1.295 0.195163
&gt; ## ULD_MECOTRUE              0.818426   0.235798   3.471 0.000519 ***
&gt; ## ULD_PRECIPTRUE            0.191720   0.357680   0.536 0.591951
&gt; ## ULD_BREECHTRUE            0.749237   0.178129   4.206 2.60e-05 ***
&gt; ## URF_DIABTRUE             -0.346467   0.287514  -1.205 0.228187
&gt; ## URF_CHYPERTRUE            0.560025   0.389678   1.437 0.150676
&gt; ## URF_PHYPERTRUE            0.161599   0.250003   0.646 0.518029
&gt; ## URF_ECLAMTRUE             0.498064   0.776948   0.641 0.521489
&gt; ## ---
&gt; ## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
&gt; ## 
&gt; ## (Dispersion parameter for binomial family taken to be 1)
&gt; ## 
&gt; ##    Null deviance: 2698.7  on 14211  degrees of freedom
&gt; ## Residual deviance: 2463.0  on 14198  degrees of freedom
&gt; ## AIC: 2491
&gt; ## 
&gt; ## Number of Fisher Scoring iterations: 7
&gt; 
[1] &quot;############################### end  120 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  123 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00123_example_7.16_of_section_7.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.16 of section 7.2.5 
&gt; # (example 7.16 of section 7.2.5)  : Linear and logistic regression : Using logistic regression : Reading the model summary and characterizing coefficients 
&gt; # Title: Calculating deviance residuals 
&gt; 
&gt; pred &lt;- predict(model, newdata=train, type=&quot;response&quot;)    # Note: 1 

&gt; llcomponents &lt;- function(y, py) {                         # Note: 2 
   y*log(py) + (1-y)*log(1-py)
 }

&gt; edev &lt;- sign(as.numeric(train$atRisk) - pred) *           # Note: 3 
   sqrt(-2*llcomponents(as.numeric(train$atRisk), pred))

&gt; summary(edev)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.9732 -0.1818 -0.1511 -0.1244 -0.1358  3.2640 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ## -0.9732 -0.1818 -0.1511 -0.1244 -0.1358  3.2640
&gt; 
&gt; # Note 1: 
&gt; #   Create vector of predictions for training 
&gt; #   data. 
&gt; 
&gt; # Note 2: 
&gt; #   Function to return the log likelihoods for 
&gt; #   each data point. Argument y is the true outcome 
&gt; #   (as a numeric variable, 0/1); argument py is the 
&gt; #   predicted probability. 
&gt; 
&gt; # Note 3: 
&gt; #   Calculate deviance residuals. 
&gt; 
[1] &quot;############################### end  123 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;############################### start  126 Fri Jun 17 10:40:35 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00126_example_7.17_of_section_7.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.17 of section 7.2.5 
&gt; # (example 7.17 of section 7.2.5)  : Linear and logistic regression : Using logistic regression : Reading the model summary and characterizing coefficients 
&gt; # Title: Computing deviance 
&gt; 
&gt; loglikelihood &lt;- function(y, py) {                                    # Note: 1 
   sum(y * log(py) + (1-y)*log(1 - py))
 }

&gt; pnull &lt;- mean(as.numeric(train$atRisk))                               # Note: 2 

&gt; null.dev &lt;- -2*loglikelihood(as.numeric(train$atRisk), pnull)         # Note: 3 

&gt; pnull
[1] 0.01920912

&gt; ## [1] 0.01920912
&gt; null.dev
[1] 2698.716

&gt; ## [1] 2698.716
&gt; model$null.deviance                                                   # Note: 4 
[1] 2698.716

&gt; ## [1] 2698.716
&gt; 
&gt; pred &lt;- predict(model, newdata=train, type=&quot;response&quot;)    # Note: 5 

&gt; resid.dev &lt;- -2*loglikelihood(as.numeric(train$atRisk), pred)         # Note: 6 

&gt; resid.dev
[1] 2462.992

&gt; ## [1] 2462.992
&gt; model$deviance                                                        # Note: 7 
[1] 2462.992

&gt; ## [1] 2462.992
&gt; 
&gt; testy &lt;- as.numeric(test$atRisk)                                      # Note: 8 

&gt; testpred &lt;- predict(model, newdata=test,
                         type=&quot;response&quot;)

&gt; pnull.test &lt;- mean(testy)

&gt; null.dev.test &lt;- -2*loglikelihood(testy, pnull.test)

&gt; resid.dev.test &lt;- -2*loglikelihood(testy, testpred)

&gt; pnull.test
[1] 0.0172713

&gt; ## [1] 0.0172713
&gt; null.dev.test
[1] 2110.91

&gt; ## [1] 2110.91
&gt; resid.dev.test
[1] 1947.094

&gt; ## [1] 1947.094
&gt; 
&gt; # Note 1: 
&gt; #   Function to calculate the log likelihood of 
&gt; #   a dataset. Variable y is the outcome 
&gt; #   in numeric form (1 for positive examples, 0 for 
&gt; #   negative). Variable py is the 
&gt; #   predicted probability that 
&gt; #   y==1. 
&gt; 
&gt; # Note 2: 
&gt; #   Calculate rate of positive examples in 
&gt; #   dataset. 
&gt; 
&gt; # Note 3: 
&gt; #   Calculate null deviance. 
&gt; 
&gt; # Note 4: 
&gt; #   For training data, the null deviance is 
&gt; #   stored in the component model$null.deviance. 
&gt; 
&gt; # Note 5: 
&gt; #   Predict probabilities for training 
&gt; #   data. 
&gt; 
&gt; # Note 6: 
&gt; #   Calculate deviance of model for training 
&gt; #   data. 
&gt; 
&gt; # Note 7: 
&gt; #   For training data, model deviance is stored 
&gt; #   in the component model$deviance. 
&gt; 
&gt; # Note 8: 
&gt; #   Calculate null deviance and residual 
&gt; #   deviance for test data. 
&gt; 
[1] &quot;############################### end  126 Fri Jun 17 10:40:36 2016&quot;
[1] &quot;############################### start  127 Fri Jun 17 10:40:36 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00127_example_7.18_of_section_7.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.18 of section 7.2.5 
&gt; # (example 7.18 of section 7.2.5)  : Linear and logistic regression : Using logistic regression : Reading the model summary and characterizing coefficients 
&gt; # Title: Calculating the significance of the observed fit 
&gt; 
&gt; df.null &lt;- dim(train)[[1]] - 1                                # Note: 1 

&gt; df.model &lt;- dim(train)[[1]] - length(model$coefficients)      # Note: 2 

&gt; df.null
[1] 14211

&gt; ## [1] 14211
&gt; df.model
[1] 14198

&gt; ## [1] 14198
&gt; 
&gt; delDev &lt;- null.dev - resid.dev                                # Note: 3 

&gt; deldf &lt;- df.null - df.model

&gt; p &lt;- pchisq(delDev, deldf, lower.tail=F)                      # Note: 4 

&gt; delDev
[1] 235.724

&gt; ## [1] 235.724
&gt; deldf
[1] 13

&gt; ## [1] 13
&gt; p
[1] 5.84896e-43

&gt; ## [1] 5.84896e-43
&gt; 
&gt; # Note 1: 
&gt; #   Null model has (number of data points - 1) 
&gt; #   degrees of freedom. 
&gt; 
&gt; # Note 2: 
&gt; #   Fitted model has (number of data points - 
&gt; #   number of coefficients) degrees of freedom. 
&gt; 
&gt; # Note 3: 
&gt; #   Compute difference in deviances and 
&gt; #   difference in degrees of freedom. 
&gt; 
&gt; # Note 4: 
&gt; #   Estimate probability of seeing the observed 
&gt; #   difference in deviances under null model (the 
&gt; #   p-value) using chi-squared distribution. 
&gt; 
[1] &quot;############################### end  127 Fri Jun 17 10:40:36 2016&quot;
[1] &quot;############################### start  128 Fri Jun 17 10:40:36 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00128_example_7.19_of_section_7.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.19 of section 7.2.5 
&gt; # (example 7.19 of section 7.2.5)  : Linear and logistic regression : Using logistic regression : Reading the model summary and characterizing coefficients 
&gt; # Title: Calculating the pseudo R-squared 
&gt; 
&gt; pr2 &lt;- 1-(resid.dev/null.dev)

&gt; print(pr2)
[1] 0.08734674

&gt; ## [1] 0.08734674
&gt; pr2.test &lt;- 1-(resid.dev.test/null.dev.test)

&gt; print(pr2.test)
[1] 0.07760427

&gt; ## [1] 0.07760427
&gt; 
[1] &quot;############################### end  128 Fri Jun 17 10:40:36 2016&quot;
[1] &quot;############################### start  129 Fri Jun 17 10:40:36 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c07_Linear_and_logistic_regression/00129_example_7.20_of_section_7.2.5.R&quot;
[1] &quot;#####   in directory ../CDC&quot;

&gt; # example 7.20 of section 7.2.5 
&gt; # (example 7.20 of section 7.2.5)  : Linear and logistic regression : Using logistic regression : Reading the model summary and characterizing coefficients 
&gt; # Title: Calculating the Akaike information criterion 
&gt; 
&gt; aic &lt;- 2*(length(model$coefficients) -
          loglikelihood(as.numeric(train$atRisk), pred))

&gt; aic
[1] 2490.992

&gt; ## [1] 2490.992
&gt; 
[1] &quot;############################### end  129 Fri Jun 17 10:40:36 2016&quot;</code></pre>