RunExamples/rCh01.html

<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c01_The_data_science_process&#39;</span>,
       <span class="st">&#39;../Statlog&#39;</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  1 Fri Jun 17 10:32:19 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c01_The_data_science_process/00001_example_1.1_of_section_1.2.3.R&quot;
[1] &quot;#####   in directory ../Statlog&quot;

&gt; # example 1.1 of section 1.2.3 
&gt; # (example 1.1 of section 1.2.3)  : The data science process : Stages of a data science project : Modeling 
&gt; # Title: Building a decision tree 
&gt; 
&gt; library(&#39;rpart&#39;)

&gt; load(&#39;GCDData.RData&#39;)

&gt; model &lt;- rpart(Good.Loan ~
    Duration.in.month +
    Installment.rate.in.percentage.of.disposable.income +
    Credit.amount  +
    Other.installment.plans,
    data=d,
    control=rpart.control(maxdepth=4),
    method=&quot;class&quot;)
[1] &quot;############################### end  1 Fri Jun 17 10:32:20 2016&quot;
[1] &quot;############################### start  2 Fri Jun 17 10:32:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c01_The_data_science_process/00002_example_1.2_of_section_1.2.4.R&quot;
[1] &quot;#####   in directory ../Statlog&quot;

&gt; # example 1.2 of section 1.2.4 
&gt; # (example 1.2 of section 1.2.4)  : The data science process : Stages of a data science project : Model evaluation and critique 
&gt; # Title: Plotting the confusion matrix 
&gt; 
&gt; creditdata &lt;- d

&gt; resultframe &lt;- data.frame(Good.Loan=creditdata$Good.Loan,
                            pred=predict(model, type=&quot;class&quot;))

&gt; rtab &lt;- table(resultframe)    # Note: 1 

&gt; rtab
          pred
Good.Loan  BadLoan GoodLoan
  BadLoan       41      259
  GoodLoan      13      687

&gt; ##           pred
&gt; ## Good.Loan  BadLoan GoodLoan
&gt; ##   BadLoan       41      259
&gt; ##   GoodLoan      13      687
&gt; 
&gt; sum(diag(rtab))/sum(rtab)     # Note: 2 
[1] 0.728

&gt; ## [1] 0.728
&gt; sum(rtab[1,1])/sum(rtab[,1])  # Note: 3 
[1] 0.7592593

&gt; ## [1] 0.7592593
&gt; sum(rtab[1,1])/sum(rtab[1,])  # Note: 4 
[1] 0.1366667

&gt; ## [1] 0.1366667
&gt; sum(rtab[2,1])/sum(rtab[2,])  # Note: 5 
[1] 0.01857143

&gt; ## [1] 0.01857143
&gt; 
&gt; # Note 1: 
&gt; #   Create the confusion matrix. Rows represent 
&gt; #   actual loan status; columns represent predicted 
&gt; #   loan status. The diagonal entries represent 
&gt; #   correct predictions. 
&gt; 
&gt; # Note 2: 
&gt; #   Overall model accuracy: 73% of the predictions 
&gt; #   were correct. 
&gt; 
&gt; # Note 3: 
&gt; #   Model precision: 76% of the applicants 
&gt; #   predicted as bad really did default. 
&gt; 
&gt; # Note 4: 
&gt; #   Model recall: the model found 14% of the 
&gt; #   defaulting loans. 
&gt; 
&gt; # Note 5: 
&gt; #   False positive rate: 2% of the good applicants 
&gt; #   were mistakenly identified as bad. 
&gt; 
[1] &quot;############################### end  2 Fri Jun 17 10:32:20 2016&quot;
[1] &quot;############################### start  3 Fri Jun 17 10:32:20 2016&quot;
[1] &quot;#####  running  ../CodeExamples/c01_The_data_science_process/00003_example_1.3_of_section_1.3.1.R&quot;
[1] &quot;#####   in directory ../Statlog&quot;

&gt; # example 1.3 of section 1.3.1 
&gt; # (example 1.3 of section 1.3.1)  : The data science process : Setting expectations : Determining lower and upper bounds on model performance 
&gt; # Title: Plotting the relation between disposable income and loan outcome 
&gt; 
&gt; tab1 &lt;- as.table(matrix(data=c(50,6,0,44),nrow=2,ncol=2))

&gt; dimnames(tab1) &lt;- list(&#39;loan.as.pct.disposable.income&#39;=
       c(&#39;LT.15pct&#39;,&#39;GT.15pct&#39;),
    &#39;loan.quality.pop1&#39;=
       c(&#39;goodloan&#39;,&#39;badloan&#39;))

&gt; tab2 &lt;- as.table(matrix(data=c(34,18,16,32),nrow=2,ncol=2))

&gt; dimnames(tab2) &lt;- list(&#39;loan.as.pct.disposable.income&#39;=
       c(&#39;LT.15pct&#39;,&#39;GT.15pct&#39;),
    &#39;loan.quality.pop2&#39;=
       c(&#39;goodloan&#39;,&#39;badloan&#39;))

&gt; tab1
                             loan.quality.pop1
loan.as.pct.disposable.income goodloan badloan
                     LT.15pct       50       0
                     GT.15pct        6      44

&gt; ##                              loan.quality.pop1     # Note: 1 
&gt; ## loan.as.pct.disposable.income goodloan badloan
&gt; ##                      LT.15pct       50       0
&gt; ##                      GT.15pct        6      44
&gt; sum(diag(tab1))/sum(tab1)                     # Note: 2 
[1] 0.94

&gt; ## [1] 0.94
&gt; 
&gt; tab2
                             loan.quality.pop2
loan.as.pct.disposable.income goodloan badloan
                     LT.15pct       34      16
                     GT.15pct       18      32

&gt; ##                              loan.quality.pop2     # Note: 3 
&gt; ## loan.as.pct.disposable.income goodloan badloan
&gt; ##                      LT.15pct       34      16
&gt; ##                      GT.15pct       18      32
&gt; sum(diag(tab2))/sum(tab2)
[1] 0.66

&gt; ## [1] 0.66                                                           # Note: 4
&gt; 
&gt; # Note 1: 
&gt; #   The count of correct predictions is on the 
&gt; #   diagonal of tab1. In this first population, all 
&gt; #   the loans that were less than 15% of disposable 
&gt; #   income were good loans, and all but six of the 
&gt; #   loans that were greater than 15% of disposable 
&gt; #   income defaulted. So you know that 
&gt; #   loan.as.pct.disposable.income models loan quality 
&gt; #   well in this population. Or as statisticians might 
&gt; #   say, loan.as.pct.disposable.income “explains” the 
&gt; #   output (loan quality). 
&gt; 
&gt; # Note 2: 
&gt; #   In fact, it’s 94% accurate. 
&gt; 
&gt; # Note 3: 
&gt; #   In the second population, about a third of 
&gt; #   the loans that were less than 15% of disposable 
&gt; #   income defaulted, and over a third of the loans that 
&gt; #   were greater than 15% of disposable income were 
&gt; #   good. So you know that 
&gt; #   loan.as.pct.disposable.income doesn’t model loan 
&gt; #   quality well in this population. 
&gt; 
&gt; # Note 4: 
&gt; #   The rule of thumb is only 66% 
&gt; #   accurate. 
&gt; 
[1] &quot;############################### end  3 Fri Jun 17 10:32:20 2016&quot;</code></pre>