RunExamples/rCh08.html

<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c08_Unsupervised_methods&#39;</span>,
      <span class="st">&#39;../Protein&#39;</span>,<span class="dt">last=</span><span class="dv">151</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  136 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00136_example_8.1_of_section_8.1.2.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.1 of section 8.1.2 
&gt; # (example 8.1 of section 8.1.2)  : Unsupervised methods : Cluster analysis : Preparing the data 
&gt; # Title: Reading the protein data 
&gt; 
&gt; protein &lt;- read.table(&quot;protein.txt&quot;, sep=&quot;\t&quot;, header=TRUE)

&gt; summary(protein)
           Country      RedMeat         WhiteMeat           Eggs      
 Albania       : 1   Min.   : 4.400   Min.   : 1.400   Min.   :0.500  
 Austria       : 1   1st Qu.: 7.800   1st Qu.: 4.900   1st Qu.:2.700  
 Belgium       : 1   Median : 9.500   Median : 7.800   Median :2.900  
 Bulgaria      : 1   Mean   : 9.828   Mean   : 7.896   Mean   :2.936  
 Czechoslovakia: 1   3rd Qu.:10.600   3rd Qu.:10.800   3rd Qu.:3.700  
 Denmark       : 1   Max.   :18.000   Max.   :14.000   Max.   :4.700  
 (Other)       :19                                                    
      Milk            Fish           Cereals          Starch     
 Min.   : 4.90   Min.   : 0.200   Min.   :18.60   Min.   :0.600  
 1st Qu.:11.10   1st Qu.: 2.100   1st Qu.:24.30   1st Qu.:3.100  
 Median :17.60   Median : 3.400   Median :28.00   Median :4.700  
 Mean   :17.11   Mean   : 4.284   Mean   :32.25   Mean   :4.276  
 3rd Qu.:23.30   3rd Qu.: 5.800   3rd Qu.:40.10   3rd Qu.:5.700  
 Max.   :33.70   Max.   :14.200   Max.   :56.70   Max.   :6.500  
                                                                 
      Nuts           Fr.Veg     
 Min.   :0.700   Min.   :1.400  
 1st Qu.:1.500   1st Qu.:2.900  
 Median :2.400   Median :3.800  
 Mean   :3.072   Mean   :4.136  
 3rd Qu.:4.700   3rd Qu.:4.900  
 Max.   :7.800   Max.   :7.900  
                                

&gt; ##            Country      RedMeat         WhiteMeat           Eggs
&gt; ##  Albania       : 1   Min.   : 4.400   Min.   : 1.400   Min.   :0.500
&gt; ##  Austria       : 1   1st Qu.: 7.800   1st Qu.: 4.900   1st Qu.:2.700
&gt; ##  Belgium       : 1   Median : 9.500   Median : 7.800   Median :2.900
&gt; ##  Bulgaria      : 1   Mean   : 9.828   Mean   : 7.896   Mean   :2.936
&gt; ##  Czechoslovakia: 1   3rd Qu.:10.600   3rd Qu.:10.800   3rd Qu.:3.700
&gt; ##  Denmark       : 1   Max.   :18.000   Max.   :14.000   Max.   :4.700
&gt; ##  (Other)       :19
&gt; ##       Milk            Fish           Cereals          Starch
&gt; ##  Min.   : 4.90   Min.   : 0.200   Min.   :18.60   Min.   :0.600
&gt; ##  1st Qu.:11.10   1st Qu.: 2.100   1st Qu.:24.30   1st Qu.:3.100
&gt; ##  Median :17.60   Median : 3.400   Median :28.00   Median :4.700
&gt; ##  Mean   :17.11   Mean   : 4.284   Mean   :32.25   Mean   :4.276
&gt; ##  3rd Qu.:23.30   3rd Qu.: 5.800   3rd Qu.:40.10   3rd Qu.:5.700
&gt; ##  Max.   :33.70   Max.   :14.200   Max.   :56.70   Max.   :6.500
&gt; ##
&gt; ##       Nuts           Fr.Veg
&gt; ##  Min.   :0.700   Min.   :1.400
&gt; ##  1st Qu.:1.500   1st Qu.:2.900
&gt; ##  Median :2.400   Median :3.800
&gt; ##  Mean   :3.072   Mean   :4.136
&gt; ##  3rd Qu.:4.700   3rd Qu.:4.900
&gt; ##  Max.   :7.800   Max.   :7.900
&gt; 
[1] &quot;############################### end  136 Tue May  2 20:40:45 2017&quot;
[1] &quot;############################### start  137 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00137_example_8.2_of_section_8.1.2.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.2 of section 8.1.2 
&gt; # (example 8.2 of section 8.1.2)  : Unsupervised methods : Cluster analysis : Preparing the data 
&gt; # Title: Rescaling the dataset 
&gt; 
&gt; vars.to.use &lt;- colnames(protein)[-1]          # Note: 1 

&gt; pmatrix &lt;- scale(protein[,vars.to.use])       # Note: 2 

&gt; pcenter &lt;- attr(pmatrix, &quot;scaled:center&quot;)     # Note: 3 

&gt; pscale &lt;- attr(pmatrix, &quot;scaled:scale&quot;)

&gt; attr(pmatrix, &quot;scaled:center&quot;) &lt;- NULL

&gt; attr(pmatrix, &quot;scaled:scale&quot;) &lt;- NULL

&gt; # Note 1: 
&gt; #   Use all the columns except the first 
&gt; #   (Country). 
&gt; 
&gt; # Note 2: 
&gt; #   The output of scale() is a matrix. For the 
&gt; #   purposes of this chapter, you can think of a 
&gt; #   matrix as a data frame with all numeric columns 
&gt; #   (this isn’t strictly true, but it’s close enough). 
&gt; 
&gt; # Note 3: 
&gt; #   The scale() function annotates its output 
&gt; #   with two attributes—scaled:center returns the mean 
&gt; #   values of all the columns, and scaled:scale 
&gt; #   returns the standard deviations. You’ll store 
&gt; #   these away so you can “unscale” the data 
&gt; #   later. 
&gt; 
[1] &quot;############################### end  137 Tue May  2 20:40:45 2017&quot;
[1] &quot;############################### start  138 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00138_example_8.3_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.3 of section 8.1.3 
&gt; # (example 8.3 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: Hierarchical clustering 
&gt; 
&gt; d &lt;- dist(pmatrix, method=&quot;euclidean&quot;)    # Note: 1 

&gt; pfit &lt;- hclust(d, method=&quot;ward.D&quot;)            # Note: 2 

&gt; plot(pfit, labels=protein$Country)        # Note: 3</code></pre>
<div class="figure">
<img src="rCh08_files/figure-markdown_github/ch8ex1-1.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Create the distance matrix. 
&gt; 
&gt; # Note 2: 
&gt; #   Do the clustering. 
&gt; 
&gt; # Note 3: 
&gt; #   Plot the dendrogram. 
&gt; 
[1] &quot;############################### end  138 Tue May  2 20:40:45 2017&quot;
[1] &quot;############################### start  139 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00139_informalexample_8.5_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # informalexample 8.5 of section 8.1.3 
&gt; # (informalexample 8.5 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; 
&gt; rect.hclust(pfit, k=5)
[1] &quot;############################### end  139 Tue May  2 20:40:45 2017&quot;
[1] &quot;############################### start  140 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00140_example_8.4_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.4 of section 8.1.3 
&gt; # (example 8.4 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: Extracting the clusters found by hclust() 
&gt; 
&gt; groups &lt;- cutree(pfit, k=5)

&gt; print_clusters &lt;- function(labels, k) {               # Note: 1 
   for(i in 1:k) {
     print(paste(&quot;cluster&quot;, i))
     print(protein[labels==i,c(&quot;Country&quot;,&quot;RedMeat&quot;,&quot;Fish&quot;,&quot;Fr.Veg&quot;)])
   }
 }

&gt; print_clusters(groups, 5)
[1] &quot;cluster 1&quot;
      Country RedMeat Fish Fr.Veg
1     Albania    10.1  0.2    1.7
4    Bulgaria     7.8  1.2    4.2
18    Romania     6.2  1.0    2.8
25 Yugoslavia     4.4  0.6    3.2
[1] &quot;cluster 2&quot;
       Country RedMeat Fish Fr.Veg
2      Austria     8.9  2.1    4.3
3      Belgium    13.5  4.5    4.0
9       France    18.0  5.7    6.5
12     Ireland    13.9  2.2    2.9
14 Netherlands     9.5  2.5    3.7
21 Switzerland    13.1  2.3    4.9
22          UK    17.4  4.3    3.3
24   W Germany    11.4  3.4    3.8
[1] &quot;cluster 3&quot;
          Country RedMeat Fish Fr.Veg
5  Czechoslovakia     9.7  2.0    4.0
7       E Germany     8.4  5.4    3.6
11        Hungary     5.3  0.3    4.2
16         Poland     6.9  3.0    6.6
23           USSR     9.3  3.0    2.9
[1] &quot;cluster 4&quot;
   Country RedMeat Fish Fr.Veg
6  Denmark    10.6  9.9    2.4
8  Finland     9.5  5.8    1.4
15  Norway     9.4  9.7    2.7
20  Sweden     9.9  7.5    2.0
[1] &quot;cluster 5&quot;
    Country RedMeat Fish Fr.Veg
10   Greece    10.2  5.9    6.5
13    Italy     9.0  3.4    6.7
17 Portugal     6.2 14.2    7.9
19    Spain     7.1  7.0    7.2

&gt; ## [1] &quot;cluster 1&quot;
&gt; ##       Country RedMeat Fish Fr.Veg
&gt; ## 1     Albania    10.1  0.2    1.7
&gt; ## 4    Bulgaria     7.8  1.2    4.2
&gt; ## 18    Romania     6.2  1.0    2.8
&gt; ## 25 Yugoslavia     4.4  0.6    3.2
&gt; ## [1] &quot;cluster 2&quot;
&gt; ##        Country RedMeat Fish Fr.Veg
&gt; ## 2      Austria     8.9  2.1    4.3
&gt; ## 3      Belgium    13.5  4.5    4.0
&gt; ## 9       France    18.0  5.7    6.5
&gt; ## 12     Ireland    13.9  2.2    2.9
&gt; ## 14 Netherlands     9.5  2.5    3.7
&gt; ## 21 Switzerland    13.1  2.3    4.9
&gt; ## 22          UK    17.4  4.3    3.3
&gt; ## 24   W Germany    11.4  3.4    3.8
&gt; ## [1] &quot;cluster 3&quot;
&gt; ##           Country RedMeat Fish Fr.Veg
&gt; ## 5  Czechoslovakia     9.7  2.0    4.0
&gt; ## 7       E Germany     8.4  5.4    3.6
&gt; ## 11        Hungary     5.3  0.3    4.2
&gt; ## 16         Poland     6.9  3.0    6.6
&gt; ## 23           USSR     9.3  3.0    2.9
&gt; ## [1] &quot;cluster 4&quot;
&gt; ##    Country RedMeat Fish Fr.Veg
&gt; ## 6  Denmark    10.6  9.9    2.4
&gt; ## 8  Finland     9.5  5.8    1.4
&gt; ## 15  Norway     9.4  9.7    2.7
&gt; ## 20  Sweden     9.9  7.5    2.0
&gt; ## [1] &quot;cluster 5&quot;
&gt; ##     Country RedMeat Fish Fr.Veg
&gt; ## 10   Greece    10.2  5.9    6.5
&gt; ## 13    Italy     9.0  3.4    6.7
&gt; ## 17 Portugal     6.2 14.2    7.9
&gt; ## 19    Spain     7.1  7.0    7.2
&gt; 
&gt; # Note 1: 
&gt; #   A convenience function for printing out the 
&gt; #   countries in each cluster, along with the values 
&gt; #   for red meat, fish, and fruit/vegetable 
&gt; #   consumption. We’ll use this function throughout 
&gt; #   this section. Note that the function is hardcoded 
&gt; #   for the protein dataset. 
&gt; 
[1] &quot;############################### end  140 Tue May  2 20:40:45 2017&quot;
[1] &quot;############################### start  141 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00141_example_8.5_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.5 of section 8.1.3 
&gt; # (example 8.5 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: Projecting the clusters on the first two principal components 
&gt; 
&gt; library(ggplot2)

&gt; princ &lt;- prcomp(pmatrix)      # Note: 1 

&gt; nComp &lt;- 2

&gt; project &lt;- (pmatrix %*% princ$rotation)[,1:nComp]             # Note: 2 

&gt; project.plus &lt;- cbind(as.data.frame(project),                 # Note: 3 
                      cluster=as.factor(groups),
                      country=protein$Country)

&gt; ggplot(project.plus, aes(x=PC1, y=PC2)) +                 # Note: 4 
   geom_point(aes(shape=cluster)) +
   geom_text(aes(label=country),
             hjust=0, vjust=1)</code></pre>
<div class="figure">
<img src="rCh08_files/figure-markdown_github/ch8ex1-2.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Calculate the principal components of the 
&gt; #   data. 
&gt; 
&gt; # Note 2: 
&gt; #   Multiplying the data by princ$rotation rotates 
&gt; #   it into the space described by the principal 
&gt; #   components. We only want the projection on the 
&gt; #   first two axes. 
&gt; 
&gt; # Note 3: 
&gt; #   Create a data frame with the transformed 
&gt; #   data, along with the cluster label and country 
&gt; #   label of each point. 
&gt; 
&gt; # Note 4: 
&gt; #   Plot it. 
&gt; 
[1] &quot;############################### end  141 Tue May  2 20:40:45 2017&quot;
[1] &quot;############################### start  142 Tue May  2 20:40:45 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00142_example_8.6_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.6 of section 8.1.3 
&gt; # (example 8.6 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: Running clusterboot() on the protein data 
&gt; 
&gt; library(fpc)                                      # Note: 1 

&gt; kbest.p&lt;-5                                                    # Note: 2 

&gt; cboot.hclust &lt;- clusterboot(pmatrix,clustermethod=hclustCBI,  # Note: 3 
                            method=&quot;ward.D&quot;, k=kbest.p)
boot 1 
boot 2 
boot 3 
boot 4 
boot 5 
boot 6 
boot 7 
boot 8 
boot 9 
boot 10 
boot 11 
boot 12 
boot 13 
boot 14 
boot 15 
boot 16 
boot 17 
boot 18 
boot 19 
boot 20 
boot 21 
boot 22 
boot 23 
boot 24 
boot 25 
boot 26 
boot 27 
boot 28 
boot 29 
boot 30 
boot 31 
boot 32 
boot 33 
boot 34 
boot 35 
boot 36 
boot 37 
boot 38 
boot 39 
boot 40 
boot 41 
boot 42 
boot 43 
boot 44 
boot 45 
boot 46 
boot 47 
boot 48 
boot 49 
boot 50 
boot 51 
boot 52 
boot 53 
boot 54 
boot 55 
boot 56 
boot 57 
boot 58 
boot 59 
boot 60 
boot 61 
boot 62 
boot 63 
boot 64 
boot 65 
boot 66 
boot 67 
boot 68 
boot 69 
boot 70 
boot 71 
boot 72 
boot 73 
boot 74 
boot 75 
boot 76 
boot 77 
boot 78 
boot 79 
boot 80 
boot 81 
boot 82 
boot 83 
boot 84 
boot 85 
boot 86 
boot 87 
boot 88 
boot 89 
boot 90 
boot 91 
boot 92 
boot 93 
boot 94 
boot 95 
boot 96 
boot 97 
boot 98 
boot 99 
boot 100 

&gt; summary(cboot.hclust$result)                                  # Note: 4 
              Length Class  Mode     
result         7     hclust list     
noise          1     -none- logical  
nc             1     -none- numeric  
clusterlist    5     -none- list     
partition     25     -none- numeric  
clustermethod  1     -none- character
nccl           1     -none- numeric  

&gt; ##               Length Class  Mode
&gt; ## result         7     hclust list
&gt; ## noise          1     -none- logical
&gt; ## nc             1     -none- numeric
&gt; ## clusterlist    5     -none- list
&gt; ## partition     25     -none- numeric
&gt; ## clustermethod  1     -none- character
&gt; ## nccl           1     -none- numeric
&gt; 
&gt; groups&lt;-cboot.hclust$result$partition                         # Note: 5 

&gt; print_clusters(groups, kbest.p)                               # Note: 6 
[1] &quot;cluster 1&quot;
      Country RedMeat Fish Fr.Veg
1     Albania    10.1  0.2    1.7
4    Bulgaria     7.8  1.2    4.2
18    Romania     6.2  1.0    2.8
25 Yugoslavia     4.4  0.6    3.2
[1] &quot;cluster 2&quot;
       Country RedMeat Fish Fr.Veg
2      Austria     8.9  2.1    4.3
3      Belgium    13.5  4.5    4.0
9       France    18.0  5.7    6.5
12     Ireland    13.9  2.2    2.9
14 Netherlands     9.5  2.5    3.7
21 Switzerland    13.1  2.3    4.9
22          UK    17.4  4.3    3.3
24   W Germany    11.4  3.4    3.8
[1] &quot;cluster 3&quot;
          Country RedMeat Fish Fr.Veg
5  Czechoslovakia     9.7  2.0    4.0
7       E Germany     8.4  5.4    3.6
11        Hungary     5.3  0.3    4.2
16         Poland     6.9  3.0    6.6
23           USSR     9.3  3.0    2.9
[1] &quot;cluster 4&quot;
   Country RedMeat Fish Fr.Veg
6  Denmark    10.6  9.9    2.4
8  Finland     9.5  5.8    1.4
15  Norway     9.4  9.7    2.7
20  Sweden     9.9  7.5    2.0
[1] &quot;cluster 5&quot;
    Country RedMeat Fish Fr.Veg
10   Greece    10.2  5.9    6.5
13    Italy     9.0  3.4    6.7
17 Portugal     6.2 14.2    7.9
19    Spain     7.1  7.0    7.2

&gt; ## [1] &quot;cluster 1&quot;
&gt; ##       Country RedMeat Fish Fr.Veg
&gt; ## 1     Albania    10.1  0.2    1.7
&gt; ## 4    Bulgaria     7.8  1.2    4.2
&gt; ## 18    Romania     6.2  1.0    2.8
&gt; ## 25 Yugoslavia     4.4  0.6    3.2
&gt; ## [1] &quot;cluster 2&quot;
&gt; ##        Country RedMeat Fish Fr.Veg
&gt; ## 2      Austria     8.9  2.1    4.3
&gt; ## 3      Belgium    13.5  4.5    4.0
&gt; ## 9       France    18.0  5.7    6.5
&gt; ## 12     Ireland    13.9  2.2    2.9
&gt; ## 14 Netherlands     9.5  2.5    3.7
&gt; ## 21 Switzerland    13.1  2.3    4.9
&gt; ## 22          UK    17.4  4.3    3.3
&gt; ## 24   W Germany    11.4  3.4    3.8
&gt; ## [1] &quot;cluster 3&quot;
&gt; ##           Country RedMeat Fish Fr.Veg
&gt; ## 5  Czechoslovakia     9.7  2.0    4.0
&gt; ## 7       E Germany     8.4  5.4    3.6
&gt; ## 11        Hungary     5.3  0.3    4.2
&gt; ## 16         Poland     6.9  3.0    6.6
&gt; ## 23           USSR     9.3  3.0    2.9
&gt; ## [1] &quot;cluster 4&quot;
&gt; ##    Country RedMeat Fish Fr.Veg
&gt; ## 6  Denmark    10.6  9.9    2.4
&gt; ## 8  Finland     9.5  5.8    1.4
&gt; ## 15  Norway     9.4  9.7    2.7
&gt; ## 20  Sweden     9.9  7.5    2.0
&gt; ## [1] &quot;cluster 5&quot;
&gt; ##     Country RedMeat Fish Fr.Veg
&gt; ## 10   Greece    10.2  5.9    6.5
&gt; ## 13    Italy     9.0  3.4    6.7
&gt; ## 17 Portugal     6.2 14.2    7.9
&gt; ## 19    Spain     7.1  7.0    7.2
&gt; cboot.hclust$bootmean                                     # Note: 7 
[1] 0.7958333 0.7882421 0.6631746 0.9151190 0.7596667

&gt; ## [1] 0.7905000 0.7990913 0.6173056 0.9312857 0.7560000
&gt; cboot.hclust$bootbrd                                      # Note: 8 
[1] 23 15 41 11 36

&gt; ## [1] 25 11 47  8 35
&gt; 
&gt; # Note 1: 
&gt; #   Load the fpc package. You may have to 
&gt; #   install it first. We’ll discuss installing R 
&gt; #   packages in the appendix. 
&gt; 
&gt; # Note 2: 
&gt; #   Set the desired number of clusters. 
&gt; 
&gt; # Note 3: 
&gt; #   Run clusterboot() with hclust 
&gt; #   (&#39;clustermethod=hclustCBI&#39;) using Ward’s method 
&gt; #   (&#39;method=&quot;ward.D&quot;&#39;) and kbest.p clusters 
&gt; #   (&#39;k=kbest.p&#39;). Return the results in an object 
&gt; #   called cboot.hclust. 
&gt; 
&gt; # Note 4: 
&gt; #   The results of the clustering are in 
&gt; #   cboot.hclust$result. The output of the hclust() 
&gt; #   function is in cboot.hclust$result$result. 
&gt; 
&gt; # Note 5: 
&gt; #   cboot.hclust$result$partition returns a 
&gt; #   vector of cluster labels. 
&gt; 
&gt; # Note 6: 
&gt; #   The clusters are the same as those produced 
&gt; #   by a direct call to hclust(). 
&gt; 
&gt; # Note 7: 
&gt; #   The vector of cluster stabilities. 
&gt; 
&gt; # Note 8: 
&gt; #   The count of how many times each cluster was 
&gt; #   dissolved. By default clusterboot() runs 100 
&gt; #   bootstrap iterations. 
&gt; 
[1] &quot;############################### end  142 Tue May  2 20:40:46 2017&quot;
[1] &quot;############################### start  143 Tue May  2 20:40:46 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00143_example_8.7_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.7 of section 8.1.3 
&gt; # (example 8.7 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: Calculating total within sum of squares 
&gt; 
&gt; sqr_edist &lt;- function(x, y) {                 # Note: 1 
   sum((x-y)^2)
 }

&gt; wss.cluster &lt;- function(clustermat) {         # Note: 2 
   c0 &lt;- apply(clustermat, 2, FUN=mean)     # Note: 3 
   sum(apply(clustermat, 1, FUN=function(row){sqr_edist(row,c0)}))      # Note: 4 
 }

&gt; wss.total &lt;- function(dmatrix, labels) {                                  # Note: 5 
   wsstot &lt;- 0
   k &lt;- length(unique(labels))
   for(i in 1:k)
     wsstot &lt;- wsstot + wss.cluster(subset(dmatrix, labels==i))             # Note: 6 
   wsstot
 }

&gt; # Note 1: 
&gt; #   Function to calculate squared distance 
&gt; #   between two vectors. 
&gt; 
&gt; # Note 2: 
&gt; #   Function to calculate the WSS for a single 
&gt; #   cluster, which is represented as a matrix (one row 
&gt; #   for every point). 
&gt; 
&gt; # Note 3: 
&gt; #   Calculate the centroid of the cluster (the 
&gt; #   mean of all the points). 
&gt; 
&gt; # Note 4: 
&gt; #   Calculate the squared distance of every 
&gt; #   point in the cluster from the centroid, and sum 
&gt; #   all the squared distances. 
&gt; 
&gt; # Note 5: 
&gt; #   Function to compute the total WSS from a set 
&gt; #   of data points and cluster labels. 
&gt; 
&gt; # Note 6: 
&gt; #   Extract each cluster, calculate the 
&gt; #   cluster’s WSS, and sum all the values. 
&gt; 
[1] &quot;############################### end  143 Tue May  2 20:40:46 2017&quot;
[1] &quot;############################### start  144 Tue May  2 20:40:46 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00144_example_8.8_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.8 of section 8.1.3 
&gt; # (example 8.8 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: The Calinski-Harabasz index 
&gt; 
&gt; totss &lt;- function(dmatrix) {                  # Note: 1 
   grandmean &lt;- apply(dmatrix, 2, FUN=mean)
   sum(apply(dmatrix, 1, FUN=function(row){sqr_edist(row, grandmean)}))
 }

&gt; ch_criterion &lt;- function(dmatrix, kmax, method=&quot;kmeans&quot;) {        # Note: 2 
   if(!(method %in% c(&quot;kmeans&quot;, &quot;hclust&quot;))) {
     stop(&quot;method must be one of c(&#39;kmeans&#39;, &#39;hclust&#39;)&quot;)
   }
   npts &lt;- dim(dmatrix)[1]  # number of rows.
 
   totss &lt;- totss(dmatrix)                                          # Note: 3 
 
   wss &lt;- numeric(kmax)
   crit &lt;- numeric(kmax)
   wss[1] &lt;- (npts-1)*sum(apply(dmatrix, 2, var))                   # Note: 4 
   for(k in 2:kmax) {                                               # Note: 5 
     if(method==&quot;kmeans&quot;) {
       clustering&lt;-kmeans(dmatrix, k, nstart=10, iter.max=100)
       wss[k] &lt;- clustering$tot.withinss
     }else {  # hclust                                              # Note: 6 
       d &lt;- dist(dmatrix, method=&quot;euclidean&quot;)
       pfit &lt;- hclust(d, method=&quot;ward.D&quot;)
       labels &lt;- cutree(pfit, k=k)
       wss[k] &lt;- wss.total(dmatrix, labels)
     }
   }
   bss &lt;- totss - wss                                               # Note: 7 
   crit.num &lt;- bss/(0:(kmax-1))                                     # Note: 8 
   crit.denom &lt;- wss/(npts - 1:kmax)                                # Note: 9 
   list(crit = crit.num/crit.denom, wss = wss, totss = totss)       # Note: 10 
 }

&gt; # Note 1: 
&gt; #   Convenience function to calculate the total 
&gt; #   sum of squares. 
&gt; 
&gt; # Note 2: 
&gt; #   A function to calculate the CH index for a 
&gt; #   number of clusters from 1 to kmax. 
&gt; 
&gt; # Note 3: 
&gt; #   The total sum of squares is independent of 
&gt; #   the clustering. 
&gt; 
&gt; # Note 4: 
&gt; #   Calculate WSS for k=1 (which is really just 
&gt; #   total sum of squares). 
&gt; 
&gt; # Note 5: 
&gt; #   Calculate WSS for k from 2 to kmax. kmeans() 
&gt; #   returns the total WSS as one of its 
&gt; #   outputs. 
&gt; 
&gt; # Note 6: 
&gt; #   For hclust(), calculate total WSS by 
&gt; #   hand. 
&gt; 
&gt; # Note 7: 
&gt; #   Calculate BSS for k from 1 to kmax. 
&gt; 
&gt; # Note 8: 
&gt; #   Normalize BSS by k-1. 
&gt; 
&gt; # Note 9: 
&gt; #   Normalize WSS by npts - k. 
&gt; 
&gt; # Note 10: 
&gt; #   Return a vector of CH indices and of WSS for 
&gt; #   k from 1 to kmax. Also return total sum of 
&gt; #   squares. 
&gt; 
[1] &quot;############################### end  144 Tue May  2 20:40:46 2017&quot;
[1] &quot;############################### start  145 Tue May  2 20:40:46 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00145_example_8.9_of_section_8.1.3.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.9 of section 8.1.3 
&gt; # (example 8.9 of section 8.1.3)  : Unsupervised methods : Cluster analysis : Hierarchical clustering with hclust 
&gt; # Title: Evaluating clusterings with different numbers of clusters 
&gt; 
&gt; library(reshape2)                                             # Note: 1 

&gt; clustcrit &lt;- ch_criterion(pmatrix, 10, method=&quot;hclust&quot;)       # Note: 2 

&gt; critframe &lt;- data.frame(k=1:10, ch=scale(clustcrit$crit),     # Note: 3 
                        wss=scale(clustcrit$wss))

&gt; critframe &lt;- melt(critframe, id.vars=c(&quot;k&quot;),                  # Note: 4 
                  variable.name=&quot;measure&quot;,
                  value.name=&quot;score&quot;)

&gt; ggplot(critframe, aes(x=k, y=score, color=measure)) +         # Note: 5 
   geom_point(aes(shape=measure)) + geom_line(aes(linetype=measure)) +
   scale_x_continuous(breaks=1:10, labels=1:10)

Warning: Removed 1 rows containing missing values (geom_point).

Warning: Removed 1 rows containing missing values (geom_path).</code></pre>
<div class="figure">
<img src="rCh08_files/figure-markdown_github/ch8ex1-3.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Load the reshape2 package (for the melt() 
&gt; #   function). 
&gt; 
&gt; # Note 2: 
&gt; #   Calculate both criteria for 1–10 
&gt; #   clusters. 
&gt; 
&gt; # Note 3: 
&gt; #   Create a data frame with the number of 
&gt; #   clusters, the CH criterion, and the WSS criterion. 
&gt; #   We’ll scale both the CH and WSS criteria to 
&gt; #   similar ranges so that we can plot them both on 
&gt; #   the same graph. 
&gt; 
&gt; # Note 4: 
&gt; #   Use the melt() function to put the data 
&gt; #   frame in a shape suitable for ggplot. 
&gt; 
&gt; # Note 5: 
&gt; #   Plot it. 
&gt; 
[1] &quot;############################### end  145 Tue May  2 20:40:46 2017&quot;
[1] &quot;############################### start  146 Tue May  2 20:40:46 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00146_example_8.10_of_section_8.1.4.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.10 of section 8.1.4 
&gt; # (example 8.10 of section 8.1.4)  : Unsupervised methods : Cluster analysis : The k-means algorithm 
&gt; # Title: Running k-means with k=5 
&gt; 
&gt; pclusters &lt;- kmeans(pmatrix, kbest.p, nstart=100, iter.max=100)       # Note: 1 

&gt; summary(pclusters)                                                # Note: 2 
             Length Class  Mode   
cluster      25     -none- numeric
centers      45     -none- numeric
totss         1     -none- numeric
withinss      5     -none- numeric
tot.withinss  1     -none- numeric
betweenss     1     -none- numeric
size          5     -none- numeric
iter          1     -none- numeric
ifault        1     -none- numeric

&gt; ##              Length Class  Mode
&gt; ## cluster      25     -none- numeric
&gt; ## centers      45     -none- numeric
&gt; ## totss         1     -none- numeric
&gt; ## withinss      5     -none- numeric
&gt; ## tot.withinss  1     -none- numeric
&gt; ## betweenss     1     -none- numeric
&gt; ## size          5     -none- numeric
&gt; 
&gt; pclusters$centers                                                 # Note: 3 
       RedMeat  WhiteMeat        Eggs       Milk       Fish    Cereals
1  1.011180399  0.7421332  0.94084150  0.5700581 -0.2671539 -0.6877583
2  0.006572897 -0.2290150  0.19147892  1.3458748  1.1582546 -0.8722721
3 -0.570049402  0.5803879 -0.08589708 -0.4604938 -0.4537795  0.3181839
4 -0.807569986 -0.8719354 -1.55330561 -1.0783324 -1.0386379  1.7200335
5 -0.508801956 -1.1088009 -0.41248496 -0.8320414  0.9819154  0.1300253
      Starch       Nuts      Fr.Veg
1  0.2288743 -0.5083895  0.02161979
2  0.1676780 -0.9553392 -1.11480485
3  0.7857609 -0.2679180  0.06873983
4 -1.4234267  0.9961313 -0.64360439
5 -0.1842010  1.3108846  1.62924487

&gt; ##        RedMeat  WhiteMeat        Eggs       Milk       Fish
&gt; ## 1 -0.807569986 -0.8719354 -1.55330561 -1.0783324 -1.0386379
&gt; ## 2  0.006572897 -0.2290150  0.19147892  1.3458748  1.1582546
&gt; ## 3 -0.570049402  0.5803879 -0.08589708 -0.4604938 -0.4537795
&gt; ## 4  1.011180399  0.7421332  0.94084150  0.5700581 -0.2671539
&gt; ## 5 -0.508801956 -1.1088009 -0.41248496 -0.8320414  0.9819154
&gt; ##      Cereals     Starch       Nuts      Fr.Veg
&gt; ## 1  1.7200335 -1.4234267  0.9961313 -0.64360439
&gt; ## 2 -0.8722721  0.1676780 -0.9553392 -1.11480485
&gt; ## 3  0.3181839  0.7857609 -0.2679180  0.06873983
&gt; ## 4 -0.6877583  0.2288743 -0.5083895  0.02161979
&gt; ## 5  0.1300253 -0.1842010  1.3108846  1.62924487
&gt; pclusters$size                                                    # Note: 4 
[1] 8 4 5 4 4

&gt; ## [1] 4 4 5 8 4
&gt; 
&gt; groups &lt;- pclusters$cluster                                       # Note: 5 

&gt; print_clusters(groups, kbest.p)                                   # Note: 6 
[1] &quot;cluster 1&quot;
       Country RedMeat Fish Fr.Veg
2      Austria     8.9  2.1    4.3
3      Belgium    13.5  4.5    4.0
9       France    18.0  5.7    6.5
12     Ireland    13.9  2.2    2.9
14 Netherlands     9.5  2.5    3.7
21 Switzerland    13.1  2.3    4.9
22          UK    17.4  4.3    3.3
24   W Germany    11.4  3.4    3.8
[1] &quot;cluster 2&quot;
   Country RedMeat Fish Fr.Veg
6  Denmark    10.6  9.9    2.4
8  Finland     9.5  5.8    1.4
15  Norway     9.4  9.7    2.7
20  Sweden     9.9  7.5    2.0
[1] &quot;cluster 3&quot;
          Country RedMeat Fish Fr.Veg
5  Czechoslovakia     9.7  2.0    4.0
7       E Germany     8.4  5.4    3.6
11        Hungary     5.3  0.3    4.2
16         Poland     6.9  3.0    6.6
23           USSR     9.3  3.0    2.9
[1] &quot;cluster 4&quot;
      Country RedMeat Fish Fr.Veg
1     Albania    10.1  0.2    1.7
4    Bulgaria     7.8  1.2    4.2
18    Romania     6.2  1.0    2.8
25 Yugoslavia     4.4  0.6    3.2
[1] &quot;cluster 5&quot;
    Country RedMeat Fish Fr.Veg
10   Greece    10.2  5.9    6.5
13    Italy     9.0  3.4    6.7
17 Portugal     6.2 14.2    7.9
19    Spain     7.1  7.0    7.2

&gt; ## [1] &quot;cluster 1&quot;
&gt; ##       Country RedMeat Fish Fr.Veg
&gt; ## 1     Albania    10.1  0.2    1.7
&gt; ## 4    Bulgaria     7.8  1.2    4.2
&gt; ## 18    Romania     6.2  1.0    2.8
&gt; ## 25 Yugoslavia     4.4  0.6    3.2
&gt; ## [1] &quot;cluster 2&quot;
&gt; ##    Country RedMeat Fish Fr.Veg
&gt; ## 6  Denmark    10.6  9.9    2.4
&gt; ## 8  Finland     9.5  5.8    1.4
&gt; ## 15  Norway     9.4  9.7    2.7
&gt; ## 20  Sweden     9.9  7.5    2.0
&gt; ## [1] &quot;cluster 3&quot;
&gt; ##           Country RedMeat Fish Fr.Veg
&gt; ## 5  Czechoslovakia     9.7  2.0    4.0
&gt; ## 7       E Germany     8.4  5.4    3.6
&gt; ## 11        Hungary     5.3  0.3    4.2
&gt; ## 16         Poland     6.9  3.0    6.6
&gt; ## 23           USSR     9.3  3.0    2.9
&gt; ## [1] &quot;cluster 4&quot;
&gt; ##        Country RedMeat Fish Fr.Veg
&gt; ## 2      Austria     8.9  2.1    4.3
&gt; ## 3      Belgium    13.5  4.5    4.0
&gt; ## 9       France    18.0  5.7    6.5
&gt; ## 12     Ireland    13.9  2.2    2.9
&gt; ## 14 Netherlands     9.5  2.5    3.7
&gt; ## 21 Switzerland    13.1  2.3    4.9
&gt; ## 22          UK    17.4  4.3    3.3
&gt; ## 24   W Germany    11.4  3.4    3.8
&gt; ## [1] &quot;cluster 5&quot;
&gt; ##     Country RedMeat Fish Fr.Veg
&gt; ## 10   Greece    10.2  5.9    6.5
&gt; ## 13    Italy     9.0  3.4    6.7
&gt; ## 17 Portugal     6.2 14.2    7.9
&gt; ## 19    Spain     7.1  7.0    7.2
&gt; 
&gt; # Note 1: 
&gt; #   Run kmeans() with five clusters (kbest.p=5), 
&gt; #   100 random starts, and 100 maximum iterations per 
&gt; #   run. 
&gt; 
&gt; # Note 2: 
&gt; #   kmeans() returns all the sum of squares 
&gt; #   measures. 
&gt; 
&gt; # Note 3: 
&gt; #   pclusters$centers is a matrix whose rows are 
&gt; #   the centroids of the clusters. Note that 
&gt; #   pclusters$centers is in the scaled coordinates, 
&gt; #   not the original protein coordinates. 
&gt; 
&gt; # Note 4: 
&gt; #   pclusters$size returns the number of points 
&gt; #   in each cluster. Generally (though not always) a 
&gt; #   good clustering will be fairly well balanced: no 
&gt; #   extremely small clusters and no extremely large 
&gt; #   ones. 
&gt; 
&gt; # Note 5: 
&gt; #   pclusters$cluster is a vector of cluster 
&gt; #   labels. 
&gt; 
&gt; # Note 6: 
&gt; #   In this case, kmeans() and hclust() returned 
&gt; #   the same clustering. This won’t always be 
&gt; #   true. 
&gt; 
[1] &quot;############################### end  146 Tue May  2 20:40:46 2017&quot;
[1] &quot;############################### start  147 Tue May  2 20:40:46 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00147_example_8.11_of_section_8.1.4.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.11 of section 8.1.4 
&gt; # (example 8.11 of section 8.1.4)  : Unsupervised methods : Cluster analysis : The k-means algorithm 
&gt; # Title: Plotting cluster criteria 
&gt; 
&gt; clustering.ch &lt;- kmeansruns(pmatrix, krange=1:10, criterion=&quot;ch&quot;)     # Note: 1 

&gt; clustering.ch$bestk                                                   # Note: 2 
[1] 2

&gt; ##  [1] 2
&gt; clustering.asw &lt;- kmeansruns(pmatrix, krange=1:10, criterion=&quot;asw&quot;)   # Note: 3 

&gt; clustering.asw$bestk
[1] 3

&gt; ##  [1] 3
&gt; 
&gt; clustering.ch$crit                                                    # Note: 4 
 [1]  0.000000 14.094814 11.417985 10.418801 10.011797  9.964967  9.861682
 [8]  9.412089  9.166676  9.075569

&gt; ##  [1]  0.000000 14.094814 11.417985 10.418801 10.011797  9.964967
&gt; ##  [7]  9.861682  9.412089  9.166676  9.075569
&gt; clustcrit$crit                                                        # Note: 5 
 [1]       NaN 12.215107 10.359587  9.690891 10.011797  9.964967  9.506978
 [8]  9.092065  8.822406  8.695065

&gt; ##  [1]       NaN 12.215107 10.359587  9.690891 10.011797  9.964967
&gt; ##  [7]  9.506978  9.092065  8.822406  8.695065
&gt; 
&gt; critframe &lt;- data.frame(k=1:10, ch=scale(clustering.ch$crit),         # Note: 6 
              asw=scale(clustering.asw$crit))

&gt; critframe &lt;- melt(critframe, id.vars=c(&quot;k&quot;),
                  variable.name=&quot;measure&quot;,
                   value.name=&quot;score&quot;)

&gt; ggplot(critframe, aes(x=k, y=score, color=measure)) +
    geom_point(aes(shape=measure)) + geom_line(aes(linetype=measure)) +
    scale_x_continuous(breaks=1:10, labels=1:10)</code></pre>
<div class="figure">
<img src="rCh08_files/figure-markdown_github/ch8ex1-4.png" alt="" />

</div>
<pre><code>&gt; summary(clustering.ch)                                                # Note: 7 
             Length Class  Mode   
cluster      25     -none- numeric
centers      18     -none- numeric
totss         1     -none- numeric
withinss      2     -none- numeric
tot.withinss  1     -none- numeric
betweenss     1     -none- numeric
size          2     -none- numeric
iter          1     -none- numeric
ifault        1     -none- numeric
crit         10     -none- numeric
bestk         1     -none- numeric

&gt; ##              Length Class  Mode
&gt; ## cluster      25     -none- numeric
&gt; ## centers      18     -none- numeric
&gt; ## totss         1     -none- numeric
&gt; ## withinss      2     -none- numeric
&gt; ## tot.withinss  1     -none- numeric
&gt; ## betweenss     1     -none- numeric
&gt; ## size          2     -none- numeric
&gt; ## crit         10     -none- numeric
&gt; ## bestk         1     -none- numeric
&gt; 
&gt; # Note 1: 
&gt; #   Run kmeansruns() for 1–10 clusters, using the 
&gt; #   CH criterion. By default, kmeansruns() uses 100 
&gt; #   random starts and 100 maximum iterations per 
&gt; #   run. 
&gt; 
&gt; # Note 2: 
&gt; #   The CH criterion picks two clusters. 
&gt; 
&gt; # Note 3: 
&gt; #   Run kmeansruns() for 1–10 clusters, using the 
&gt; #   average silhouette width criterion. Average 
&gt; #   silhouette width picks 3 clusters. 
&gt; 
&gt; # Note 4: 
&gt; #   The vector of criterion values is called 
&gt; #   crit. 
&gt; 
&gt; # Note 5: 
&gt; #   Compare the CH values for kmeans() and 
&gt; #   hclust(). They’re not quite the same, because the 
&gt; #   two algorithms didn’t pick the same clusters for 
&gt; #   every k (the values above agree at k=5 and k=6). 
&gt; 
&gt; # Note 6: 
&gt; #   Plot the values for the two criteria. 
&gt; 
&gt; # Note 7: 
&gt; #   kmeansruns() also returns the output of 
&gt; #   kmeans for k=bestk. 
&gt; 
[1] &quot;############################### end  147 Tue May  2 20:40:47 2017&quot;
[1] &quot;############################### start  148 Tue May  2 20:40:47 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00148_example_8.12_of_section_8.1.4.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.12 of section 8.1.4 
&gt; # (example 8.12 of section 8.1.4)  : Unsupervised methods : Cluster analysis : The k-means algorithm 
&gt; # Title: Running clusterboot() with k-means 
&gt; 
&gt; kbest.p&lt;-5

&gt; cboot&lt;-clusterboot(pmatrix, clustermethod=kmeansCBI,
             runs=100,iter.max=100,
             krange=kbest.p, seed=15555)                # Note: 1 
boot 1 
boot 2 
boot 3 
boot 4 
boot 5 
boot 6 
boot 7 
boot 8 
boot 9 
boot 10 
boot 11 
boot 12 
boot 13 
boot 14 
boot 15 
boot 16 
boot 17 
boot 18 
boot 19 
boot 20 
boot 21 
boot 22 
boot 23 
boot 24 
boot 25 
boot 26 
boot 27 
boot 28 
boot 29 
boot 30 
boot 31 
boot 32 
boot 33 
boot 34 
boot 35 
boot 36 
boot 37 
boot 38 
boot 39 
boot 40 
boot 41 
boot 42 
boot 43 
boot 44 
boot 45 
boot 46 
boot 47 
boot 48 
boot 49 
boot 50 
boot 51 
boot 52 
boot 53 
boot 54 
boot 55 
boot 56 
boot 57 
boot 58 
boot 59 
boot 60 
boot 61 
boot 62 
boot 63 
boot 64 
boot 65 
boot 66 
boot 67 
boot 68 
boot 69 
boot 70 
boot 71 
boot 72 
boot 73 
boot 74 
boot 75 
boot 76 
boot 77 
boot 78 
boot 79 
boot 80 
boot 81 
boot 82 
boot 83 
boot 84 
boot 85 
boot 86 
boot 87 
boot 88 
boot 89 
boot 90 
boot 91 
boot 92 
boot 93 
boot 94 
boot 95 
boot 96 
boot 97 
boot 98 
boot 99 
boot 100 

&gt; groups &lt;- cboot$result$partition

&gt; print_clusters(cboot$result$partition, kbest.p)
[1] &quot;cluster 1&quot;
      Country RedMeat Fish Fr.Veg
1     Albania    10.1  0.2    1.7
4    Bulgaria     7.8  1.2    4.2
18    Romania     6.2  1.0    2.8
25 Yugoslavia     4.4  0.6    3.2
[1] &quot;cluster 2&quot;
   Country RedMeat Fish Fr.Veg
6  Denmark    10.6  9.9    2.4
8  Finland     9.5  5.8    1.4
15  Norway     9.4  9.7    2.7
20  Sweden     9.9  7.5    2.0
[1] &quot;cluster 3&quot;
          Country RedMeat Fish Fr.Veg
5  Czechoslovakia     9.7  2.0    4.0
7       E Germany     8.4  5.4    3.6
11        Hungary     5.3  0.3    4.2
16         Poland     6.9  3.0    6.6
23           USSR     9.3  3.0    2.9
[1] &quot;cluster 4&quot;
       Country RedMeat Fish Fr.Veg
2      Austria     8.9  2.1    4.3
3      Belgium    13.5  4.5    4.0
9       France    18.0  5.7    6.5
12     Ireland    13.9  2.2    2.9
14 Netherlands     9.5  2.5    3.7
21 Switzerland    13.1  2.3    4.9
22          UK    17.4  4.3    3.3
24   W Germany    11.4  3.4    3.8
[1] &quot;cluster 5&quot;
    Country RedMeat Fish Fr.Veg
10   Greece    10.2  5.9    6.5
13    Italy     9.0  3.4    6.7
17 Portugal     6.2 14.2    7.9
19    Spain     7.1  7.0    7.2

&gt; ## [1] &quot;cluster 1&quot;
&gt; ##       Country RedMeat Fish Fr.Veg
&gt; ## 1     Albania    10.1  0.2    1.7
&gt; ## 4    Bulgaria     7.8  1.2    4.2
&gt; ## 18    Romania     6.2  1.0    2.8
&gt; ## 25 Yugoslavia     4.4  0.6    3.2
&gt; ## [1] &quot;cluster 2&quot;
&gt; ##    Country RedMeat Fish Fr.Veg
&gt; ## 6  Denmark    10.6  9.9    2.4
&gt; ## 8  Finland     9.5  5.8    1.4
&gt; ## 15  Norway     9.4  9.7    2.7
&gt; ## 20  Sweden     9.9  7.5    2.0
&gt; ## [1] &quot;cluster 3&quot;
&gt; ##           Country RedMeat Fish Fr.Veg
&gt; ## 5  Czechoslovakia     9.7  2.0    4.0
&gt; ## 7       E Germany     8.4  5.4    3.6
&gt; ## 11        Hungary     5.3  0.3    4.2
&gt; ## 16         Poland     6.9  3.0    6.6
&gt; ## 23           USSR     9.3  3.0    2.9
&gt; ## [1] &quot;cluster 4&quot;
&gt; ##        Country RedMeat Fish Fr.Veg
&gt; ## 2      Austria     8.9  2.1    4.3
&gt; ## 3      Belgium    13.5  4.5    4.0
&gt; ## 9       France    18.0  5.7    6.5
&gt; ## 12     Ireland    13.9  2.2    2.9
&gt; ## 14 Netherlands     9.5  2.5    3.7
&gt; ## 21 Switzerland    13.1  2.3    4.9
&gt; ## 22          UK    17.4  4.3    3.3
&gt; ## 24   W Germany    11.4  3.4    3.8
&gt; ## [1] &quot;cluster 5&quot;
&gt; ##     Country RedMeat Fish Fr.Veg
&gt; ## 10   Greece    10.2  5.9    6.5
&gt; ## 13    Italy     9.0  3.4    6.7
&gt; ## 17 Portugal     6.2 14.2    7.9
&gt; ## 19    Spain     7.1  7.0    7.2
&gt; cboot$bootmean
[1] 0.8670000 0.8420714 0.6147024 0.7647341 0.7508333

&gt; ## [1] 0.8670000 0.8420714 0.6147024 0.7647341 0.7508333
&gt; cboot$bootbrd
[1] 15 20 49 17 32

&gt; ## [1] 15 20 49 17 32
&gt; 
&gt; # Note 1: 
&gt; #   We’ve set the seed for the random generator 
&gt; #   so the results are reproducible. 
&gt; 
[1] &quot;############################### end  148 Tue May  2 20:40:51 2017&quot;
[1] &quot;############################### start  149 Tue May  2 20:40:51 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00149_example_8.13_of_section_8.1.5.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.13 of section 8.1.5 
&gt; # (example 8.13 of section 8.1.5)  : Unsupervised methods : Cluster analysis : Assigning new points to clusters 
&gt; # Title: A function to assign points to a cluster 
&gt; 
&gt; assign_cluster &lt;- function(newpt, centers, xcenter=0, xscale=1) {     # Note: 1 
    xpt &lt;- (newpt - xcenter)/xscale                                 # Note: 2 
    dists &lt;- apply(centers, 1, FUN=function(c0){sqr_edist(c0, xpt)})    # Note: 3 
    which.min(dists)                                                    # Note: 4 
  }

&gt; # Note 1: 
&gt; #   A function to assign a new data point newpt to 
&gt; #   a clustering described by centers, a matrix where 
&gt; #   each row is a cluster centroid. If the data was 
&gt; #   scaled (using scale()) before clustering, then 
&gt; #   xcenter and xscale are the scaled:center and 
&gt; #   scaled:scale attributes, respectively. 
&gt; 
&gt; # Note 2: 
&gt; #   Center and scale the new data point. 
&gt; 
&gt; # Note 3: 
&gt; #   Calculate how far the new data point is from 
&gt; #   each of the cluster centers. 
&gt; 
&gt; # Note 4: 
&gt; #   Return the cluster number of the closest 
&gt; #   centroid. 
&gt; 
[1] &quot;############################### end  149 Tue May  2 20:40:51 2017&quot;
[1] &quot;############################### start  150 Tue May  2 20:40:51 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00150_example_8.14_of_section_8.1.5.R&quot;
[1] &quot;#####   in directory ../Protein&quot;

&gt; # example 8.14 of section 8.1.5 
&gt; # (example 8.14 of section 8.1.5)  : Unsupervised methods : Cluster analysis : Assigning new points to clusters 
&gt; # Title: An example of assigning points to clusters 
&gt; 
&gt; rnorm.multidim &lt;- function(n, mean, sd, colstr=&quot;x&quot;) {     # Note: 1 
    ndim &lt;- length(mean)
    data &lt;- NULL
    for(i in 1:ndim) {
      col &lt;- rnorm(n, mean=mean[[i]], sd=sd[[i]])
      data&lt;-cbind(data, col)
    }
    cnames &lt;- paste(colstr, 1:ndim, sep=&#39;&#39;)
    colnames(data) &lt;- cnames
    data
  }

&gt; mean1 &lt;- c(1, 1, 1)                       # Note: 2 

&gt; sd1 &lt;- c(1, 2, 1)

&gt; mean2 &lt;- c(10, -3, 5)

&gt; sd2 &lt;- c(2, 1, 2)

&gt; mean3 &lt;- c(-5, -5, -5)

&gt; sd3 &lt;- c(1.5, 2, 1)

&gt; clust1 &lt;- rnorm.multidim(100, mean1, sd1)             # Note: 3 

&gt; clust2 &lt;- rnorm.multidim(100, mean2, sd2)

&gt; clust3 &lt;- rnorm.multidim(100, mean3, sd3)

&gt; toydata &lt;- rbind(clust3, rbind(clust1, clust2))

&gt; tmatrix &lt;- scale(toydata)                             # Note: 4 

&gt; tcenter &lt;- attr(tmatrix, &quot;scaled:center&quot;)         # Note: 5 

&gt; tscale&lt;-attr(tmatrix, &quot;scaled:scale&quot;)

&gt; kbest.t &lt;- 3

&gt; tclusters &lt;- kmeans(tmatrix, kbest.t, nstart=100, iter.max=100)       # Note: 6 

&gt; tclusters$size                # Note: 7 
[1] 101  99 100

&gt; ## [1] 100 101  99
&gt; 
&gt; unscale &lt;- function(scaledpt, centervec, scalevec) {      # Note: 8 
    scaledpt*scalevec + centervec
 }

&gt; unscale(tclusters$centers[1,], tcenter, tscale)       # Note: 9 
       x1        x2        x3 
 9.630898 -3.084541  4.949446 

&gt; ##        x1        x2        x3
&gt; ##  9.978961 -3.097584  4.864689
&gt; mean2
[1] 10 -3  5

&gt; ## [1] 10 -3  5
&gt; 
&gt; unscale(tclusters$centers[2,], tcenter, tscale)       # Note: 10 
       x1        x2        x3 
-4.833429 -5.018246 -5.064261 

&gt; ##        x1        x2        x3
&gt; ## -4.979523 -4.927404 -4.908949
&gt; mean3
[1] -5 -5 -5

&gt; ## [1] -5 -5 -5
&gt; 
&gt; unscale(tclusters$centers[3,], tcenter, tscale)       # Note: 11 
       x1        x2        x3 
1.0029535 0.7651268 0.8445897 

&gt; ##        x1        x2        x3
&gt; ## 1.0003356 1.3037825 0.9571058
&gt; mean1
[1] 1 1 1

&gt; ## [1] 1 1 1
&gt; 
&gt; assign_cluster(rnorm.multidim(1, mean1, sd1),     # Note: 12 
                 tclusters$centers,
                 tcenter, tscale)
3 
3 

&gt; ## 3                                                  # Note: 13 
&gt; ## 3
&gt; 
&gt; assign_cluster(rnorm.multidim(1, mean2, sd1),     # Note: 14 
                 tclusters$centers,
                 tcenter, tscale)
1 
1 

&gt; ## 1                                                  # Note: 15 
&gt; ## 1
&gt; 
&gt; assign_cluster(rnorm.multidim(1, mean3, sd1),         # Note: 16 
                 tclusters$centers,
                 tcenter, tscale)
2 
2 

&gt; ## 2                                              # Note: 17 
&gt; ## 2
&gt; 
&gt; # Note 1: 
&gt; #   A function to generate n points drawn from a 
&gt; #   multidimensional Gaussian distribution with 
&gt; #   centroid mean and standard deviation sd. The 
&gt; #   dimension of the distribution is given by the 
&gt; #   length of the vector mean. 
&gt; 
&gt; # Note 2: 
&gt; #   The parameters for three Gaussian 
&gt; #   distributions. 
&gt; 
&gt; # Note 3: 
&gt; #   Create a dataset with 100 points each drawn 
&gt; #   from the above distributions. 
&gt; 
&gt; # Note 4: 
&gt; #   Scale the dataset. 
&gt; 
&gt; # Note 5: 
&gt; #   Store the centering and scaling parameters for 
&gt; #   future use. 
&gt; 
&gt; # Note 6: 
&gt; #   Cluster the dataset, using k-means with three 
&gt; #   clusters. 
&gt; 
&gt; # Note 7: 
&gt; #   The resulting clusters are about the right 
&gt; #   size. 
&gt; 
&gt; # Note 8: 
&gt; #   A function to “unscale” data points (put them 
&gt; #   back in the coordinates of the original 
&gt; #   dataset). 
&gt; 
&gt; # Note 9: 
&gt; #   Unscale the first centroid. It corresponds to 
&gt; #   our original distribution 2. 
&gt; 
&gt; # Note 10: 
&gt; #   The second centroid corresponds to the 
&gt; #   original distribution 3. 
&gt; 
&gt; # Note 11: 
&gt; #   The third centroid corresponds to the original 
&gt; #   distribution 1. 
&gt; 
&gt; # Note 12: 
&gt; #   Generate a random point from the original 
&gt; #   distribution 1 and assign it to one of the 
&gt; #   discovered clusters. 
&gt; 
&gt; # Note 13: 
&gt; #   It’s assigned to cluster 3, as we would 
&gt; #   expect. 
&gt; 
&gt; # Note 14: 
&gt; #   Generate a random point centered on the mean 
&gt; #   of the original distribution 2 (the call passes 
&gt; #   sd1, not sd2, as the spread) and assign it. 
&gt; 
&gt; # Note 15: 
&gt; #   It’s assigned to cluster 1. 
&gt; 
&gt; # Note 16: 
&gt; #   Generate a random point centered on the mean 
&gt; #   of the original distribution 3 (the call passes 
&gt; #   sd1, not sd3, as the spread) and assign it. 
&gt; 
&gt; # Note 17: 
&gt; #   It’s assigned to cluster 2. 
&gt; 
[1] &quot;############################### end  150 Tue May  2 20:40:51 2017&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(<span class="dt">list=</span><span class="kw">ls</span>())
<span class="kw">source</span>(<span class="st">&#39;runDir.R&#39;</span>)</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(<span class="st">&#39;ggplot2&#39;</span>)
<span class="kw">runDir</span>(<span class="st">&#39;../CodeExamples/c08_Unsupervised_methods&#39;</span>,
      <span class="st">&#39;../Bookdata&#39;</span>,<span class="dt">first=</span><span class="dv">152</span>)</code></pre></div>
<pre><code>[1] &quot;############################### start  152 Tue May  2 20:40:51 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00152_example_8.15_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.15 of section 8.2.3 
&gt; # (example 8.15 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Reading in the book data 
&gt; 
&gt; library(arules)   # Note: 1 

Loading required package: Matrix


Attaching package: &#39;arules&#39;

The following objects are masked from &#39;package:base&#39;:

    abbreviate, write


&gt; bookbaskets &lt;- read.transactions(&quot;bookdata.tsv.gz&quot;, format=&quot;single&quot;,      # Note: 2 
                                  sep=&quot;\t&quot;,                     # Note: 3 
                                  cols=c(&quot;userid&quot;, &quot;title&quot;),        # Note: 4 
                                  rm.duplicates=T)          # Note: 5
distribution of transactions with duplicates:
items
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
701 222 106  68  43  39  23  24  18  18  16  10   7   7  13   7   8   5 
 19  20  21  22  23  25  26  27  28  29  30  31  33  34  35  38  39  42 
  3   9   4   4   3   2   2   5   4   5   4   4   1   2   1   1   1   2 
 44  45  47  48  49  52  56  57  59  61  63  71  73  80  84  86  91  93 
  1   1   1   1   1   1   2   1   2   1   2   1   1   1   1   1   1   1 
 95  96  99 103 158 206 260 891 
  1   1   1   1   1   2   1   1 

&gt; # Note 1: 
&gt; #   Load the arules package. 
&gt; 
&gt; # Note 2: 
&gt; #   Specify the file and the file format. 
&gt; 
&gt; # Note 3: 
&gt; #   Specify the column separator (a tab). 
&gt; 
&gt; # Note 4: 
&gt; #   Specify the column of transaction IDs and of 
&gt; #   item IDs, respectively. 
&gt; 
&gt; # Note 5: 
&gt; #   Tell the function to look for and remove 
&gt; #   duplicate entries (for example, multiple entries 
&gt; #   for “The Hobbit” by the same user). 
&gt; 
[1] &quot;############################### end  152 Tue May  2 20:41:06 2017&quot;
[1] &quot;############################### start  153 Tue May  2 20:41:06 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00153_example_8.16_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.16 of section 8.2.3 
&gt; # (example 8.16 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Examining the transaction data 
&gt; 
&gt; class(bookbaskets)                # Note: 1 
[1] &quot;transactions&quot;
attr(,&quot;package&quot;)
[1] &quot;arules&quot;

&gt; ## [1] &quot;transactions&quot;
&gt; ## attr(,&quot;package&quot;)
&gt; ## [1] &quot;arules&quot;
&gt; bookbaskets                       # Note: 2 
transactions in sparse format with
 92108 transactions (rows) and
 220447 items (columns)

&gt; ## transactions in sparse format with
&gt; ##  92108 transactions (rows) and
&gt; ##  220447 items (columns)
&gt; dim(bookbaskets)                  # Note: 3 
[1]  92108 220447

&gt; ## [1]  92108 220447
&gt; colnames(bookbaskets)[1:5]        # Note: 4 
[1] &quot; A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861&quot;
[2] &quot; Always Have Popsicles&quot;                                                                     
[3] &quot; Apple Magic&quot;                                                                               
[4] &quot; Ask Lily&quot;                                                                                  
[5] &quot; Beyond IBM: Leadership Marketing and Finance for the 1990s&quot;                                

&gt; ## [1] &quot; A Light in the Storm:[...]&quot;
&gt; ## [2] &quot; Always Have Popsicles&quot;
&gt; ## [3] &quot; Apple Magic&quot;
&gt; ## [4] &quot; Ask Lily&quot;
&gt; ## [5] &quot; Beyond IBM: Leadership Marketing and Finance for the 1990s&quot;
&gt; rownames(bookbaskets)[1:5]            # Note: 5 
[1] &quot;10&quot;     &quot;1000&quot;   &quot;100001&quot; &quot;100002&quot; &quot;100004&quot;

&gt; ## [1] &quot;10&quot;     &quot;1000&quot;   &quot;100001&quot; &quot;100002&quot; &quot;100004&quot;
&gt; 
&gt; # Note 1: 
&gt; #   The object is of class transactions. 
&gt; 
&gt; # Note 2: 
&gt; #   Printing the object tells you its 
&gt; #   dimensions. 
&gt; 
&gt; # Note 3: 
&gt; #   You can also use dim() to see the dimensions 
&gt; #   of the matrix. 
&gt; 
&gt; # Note 4: 
&gt; #   The columns are labeled by book 
&gt; #   title. 
&gt; 
&gt; # Note 5: 
&gt; #   The rows are labeled by customer. 
&gt; 
[1] &quot;############################### end  153 Tue May  2 20:41:06 2017&quot;
[1] &quot;############################### start  154 Tue May  2 20:41:06 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00154_informalexample_8.7_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # informalexample 8.7 of section 8.2.3 
&gt; # (informalexample 8.7 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; 
&gt; basketSizes &lt;- size(bookbaskets)

&gt; summary(basketSizes)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0     1.0     1.0    11.1     4.0 10250.0 

&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##     1.0     1.0     1.0    11.1     4.0 10250.0
&gt; 
[1] &quot;############################### end  154 Tue May  2 20:41:06 2017&quot;
[1] &quot;############################### start  155 Tue May  2 20:41:06 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00155_example_8.17_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.17 of section 8.2.3 
&gt; # (example 8.17 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Examining the size distribution 
&gt; 
&gt; quantile(basketSizes, probs=seq(0,1,0.1))         # Note: 1 
   0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100% 
    1     1     1     1     1     1     2     3     5    13 10253 

&gt; ##    0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100%
&gt; ##     1     1     1     1     1     1     2     3     5    13 10253
&gt; library(ggplot2)                                  # Note: 2 

&gt; ggplot(data.frame(count=basketSizes)) +
   geom_density(aes(x=count)) +
   scale_x_log10()</code></pre>
<div class="figure">
<img src="rCh08_files/figure-markdown_github/ch8ex2-1.png" alt="" />

</div>
<pre><code>&gt; # Note 1: 
&gt; #   Look at the basket size distribution, in 10% 
&gt; #   increments. 
&gt; 
&gt; # Note 2: 
&gt; #   Plot the distribution to get a better 
&gt; #   look. 
&gt; 
[1] &quot;############################### end  155 Tue May  2 20:41:06 2017&quot;
[1] &quot;############################### start  156 Tue May  2 20:41:06 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00156_informalexample_8.8_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # informalexample 8.8 of section 8.2.3 
&gt; # (informalexample 8.8 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; 
&gt; bookFreq &lt;- itemFrequency(bookbaskets)

&gt; ## summary(bookFreq)
&gt; ##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
&gt; ## 1.086e-05 1.086e-05 1.086e-05 5.035e-05 3.257e-05 2.716e-02
&gt; 
&gt; sum(bookFreq)
[1] 11.09909

&gt; ## [1] 11.09909
&gt; 
[1] &quot;############################### end  156 Tue May  2 20:41:06 2017&quot;
[1] &quot;############################### start  157 Tue May  2 20:41:06 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00157_example_8.18_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.18 of section 8.2.3 
&gt; # (example 8.18 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Finding the ten most frequent books 
&gt; 
&gt; bookCount &lt;- (bookFreq/sum(bookFreq))*sum(basketSizes)        # Note: 1 

&gt; summary(bookCount)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   1.000    1.000    1.000    4.637    3.000 2502.000 

&gt; ##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
&gt; ##    1.000    1.000    1.000    4.637    3.000 2502.000
&gt; orderedBooks &lt;- sort(bookCount, decreasing=T)     # Note: 2 

&gt; orderedBooks[1:10]
                                    Wild Animus 
                                           2502 
                      The Lovely Bones: A Novel 
                                           1295 
                              She&#39;s Come Undone 
                                            934 
                              The Da Vinci Code 
                                            905 
          Harry Potter and the Sorcerer&#39;s Stone 
                                            832 
                     The Nanny Diaries: A Novel 
                                            821 
                                A Painted House 
                                            819 
                          Bridget Jones&#39;s Diary 
                                            772 
                        The Secret Life of Bees 
                                            762 
Divine Secrets of the Ya-Ya Sisterhood: A Novel 
                                            737 

&gt; ##                                     Wild Animus
&gt; ##                                            2502
&gt; ##                       The Lovely Bones: A Novel
&gt; ##                                            1295
&gt; ##                               She&#39;s Come Undone
&gt; ##                                             934
&gt; ##                               The Da Vinci Code
&gt; ##                                             905
&gt; ##           Harry Potter and the Sorcerer&#39;s Stone
&gt; ##                                             832
&gt; ##                      The Nanny Diaries: A Novel
&gt; ##                                             821
&gt; ##                                 A Painted House
&gt; ##                                             819
&gt; ##                           Bridget Jones&#39;s Diary
&gt; ##                                             772
&gt; ##                         The Secret Life of Bees
&gt; ##                                             762
&gt; ## Divine Secrets of the Ya-Ya Sisterhood: A Novel
&gt; ##                                             737
&gt; orderedBooks[1]/dim(bookbaskets)[1]                   # Note: 3 
Wild Animus 
 0.02716376 

&gt; ## Wild Animus
&gt; ##  0.02716376
&gt; 
&gt; # Note 1: 
&gt; #   Get the absolute count of book 
&gt; #   occurrences. 
&gt; 
&gt; # Note 2: 
&gt; #   Sort the count and list the 10 most popular 
&gt; #   books. 
&gt; 
&gt; # Note 3: 
&gt; #   The most popular book in the dataset 
&gt; #   occurred in fewer than 3% of the baskets. 
&gt; 
[1] &quot;############################### end  157 Tue May  2 20:41:07 2017&quot;
[1] &quot;############################### start  158 Tue May  2 20:41:07 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00158_informalexample_8.9_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # informalexample 8.9 of section 8.2.3 
&gt; # (informalexample 8.9 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; 
&gt; bookbaskets_use &lt;- bookbaskets[basketSizes &gt; 1]

&gt; dim(bookbaskets_use)
[1]  40822 220447

&gt; ## [1]  40822 220447
&gt; 
[1] &quot;############################### end  158 Tue May  2 20:41:07 2017&quot;
[1] &quot;############################### start  159 Tue May  2 20:41:07 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00159_example_8.19_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.19 of section 8.2.3 
&gt; # (example 8.19 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Finding the association rules 
&gt; 
&gt; rules &lt;- apriori(bookbaskets_use,                                     # Note: 1 
                 parameter =list(support = 0.002, confidence=0.75))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
       0.75    0.1    1 none FALSE            TRUE       5   0.002      1
 maxlen target   ext
     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 81 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[216031 item(s), 40822 transaction(s)] done [0.45s].
sorting and recoding items ... [1256 item(s)] done [0.03s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 5 done [0.05s].
writing ... [191 rule(s)] done [0.00s].
creating S4 object  ... done [0.05s].

&gt; summary(rules)
set of 191 rules

rule length distribution (lhs + rhs):sizes
  2   3   4   5 
 11 100  66  14 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   3.000   3.000   3.435   4.000   5.000 

summary of quality measures:
    support           confidence          lift       
 Min.   :0.002009   Min.   :0.7500   Min.   : 40.89  
 1st Qu.:0.002131   1st Qu.:0.8113   1st Qu.: 86.44  
 Median :0.002278   Median :0.8468   Median :131.36  
 Mean   :0.002593   Mean   :0.8569   Mean   :129.68  
 3rd Qu.:0.002695   3rd Qu.:0.9065   3rd Qu.:158.77  
 Max.   :0.005830   Max.   :0.9882   Max.   :321.89  

mining info:
            data ntransactions support confidence
 bookbaskets_use         40822   0.002       0.75

&gt; ## set of 191 rules                               # Note: 2 
&gt; ##
&gt; ## rule length distribution (lhs + rhs):sizes             # Note: 3 
&gt; ##   2   3   4   5
&gt; ##  11 100  66  14
&gt; ##
&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##   2.000   3.000   3.000   3.435   4.000   5.000
&gt; ##
&gt; ## summary of quality measures:                               # Note: 4 
&gt; ##     support           confidence          lift
&gt; ##  Min.   :0.002009   Min.   :0.7500   Min.   : 40.89
&gt; ##  1st Qu.:0.002131   1st Qu.:0.8113   1st Qu.: 86.44
&gt; ##  Median :0.002278   Median :0.8468   Median :131.36
&gt; ##  Mean   :0.002593   Mean   :0.8569   Mean   :129.68
&gt; ##  3rd Qu.:0.002695   3rd Qu.:0.9065   3rd Qu.:158.77
&gt; ##  Max.   :0.005830   Max.   :0.9882   Max.   :321.89
&gt; ##
&gt; ## mining info:                                               # Note: 5 
&gt; ##             data ntransactions support confidence
&gt; ##  bookbaskets_use         40822   0.002       0.75
&gt; 
&gt; # Note 1: 
&gt; #   Call apriori() with a minimum support of 
&gt; #   0.002 and a minimum confidence of 0.75. 
&gt; 
&gt; # Note 2: 
&gt; #   The summary of the apriori() output reports 
&gt; #   the number of rules found;... 
&gt; 
&gt; # Note 3: 
&gt; #   ...the distribution of rule lengths (in this 
&gt; #   example, most rules contain three items—two on the 
&gt; #   left side, X (lhs), and one on the right side, Y 
&gt; #   (rhs));... 
&gt; 
&gt; # Note 4: 
&gt; #   ...a summary of rule quality measures, 
&gt; #   including support and confidence;... 
&gt; 
&gt; # Note 5: 
&gt; #   ...and some information on how apriori() was 
&gt; #   called. 
&gt; 
[1] &quot;############################### end  159 Tue May  2 20:41:08 2017&quot;
[1] &quot;############################### start  160 Tue May  2 20:41:08 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00160_example_8.20_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.20 of section 8.2.3 
&gt; # (example 8.20 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Scoring rules 
&gt; 
&gt; measures &lt;- interestMeasure(rules,                                # Note: 1 
                  measure=c(&quot;coverage&quot;, &quot;fishersExactTest&quot;),        # Note: 2 
                  transactions=bookbaskets_use)                 # Note: 3 

&gt; summary(measures)
    coverage        fishersExactTest    
 Min.   :0.002082   Min.   : 0.000e+00  
 1st Qu.:0.002511   1st Qu.: 0.000e+00  
 Median :0.002719   Median : 0.000e+00  
 Mean   :0.003039   Mean   :5.080e-138  
 3rd Qu.:0.003160   3rd Qu.: 0.000e+00  
 Max.   :0.006982   Max.   :9.702e-136  

&gt; ##     coverage        fishersExactTest
&gt; ##  Min.   :0.002082   Min.   : 0.000e+00
&gt; ##  1st Qu.:0.002511   1st Qu.: 0.000e+00
&gt; ##  Median :0.002719   Median : 0.000e+00
&gt; ##  Mean   :0.003039   Mean   :5.080e-138
&gt; ##  3rd Qu.:0.003160   3rd Qu.: 0.000e+00
&gt; ##  Max.   :0.006982   Max.   :9.702e-136
&gt; 
&gt; # Note 1: 
&gt; #   The call to interestMeasure() takes as 
&gt; #   arguments the discovered rules,... 
&gt; 
&gt; # Note 2: 
&gt; #   ...a list of interest measures to 
&gt; #   apply,... 
&gt; 
&gt; # Note 3: 
&gt; #   ...and a dataset to evaluate the interest 
&gt; #   measures over. This is usually the same set used 
&gt; #   to mine the rules, but it needn’t be. For 
&gt; #   instance, you can evaluate the rules over the full 
&gt; #   dataset, bookbaskets, to get coverage estimates 
&gt; #   that reflect all the customers, not just the ones 
&gt; #   who showed interest in more than one book. 
&gt; 
[1] &quot;############################### end  160 Tue May  2 20:41:08 2017&quot;
[1] &quot;############################### start  161 Tue May  2 20:41:08 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00161_informalexample_8.10_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # informalexample 8.10 of section 8.2.3 
&gt; # (informalexample 8.10 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; 
&gt; inspect(head((sort(rules, by=&quot;confidence&quot;)), n=5))
    lhs                                               rhs                                                support confidence      lift
[1] {Four to Score,                                                                                                                  
     High Five,                                                                                                                      
     Seven Up,                                                                                                                       
     Two for the Dough}                            =&gt; {Three To Get Deadly : A Stephanie Plum Novel} 0.002057714  0.9882353 165.33500
[2] {Harry Potter and the Order of the Phoenix,                                                                                      
     Harry Potter and the Prisoner of Azkaban,                                                                                       
     Harry Potter and the Sorcerer&#39;s Stone}        =&gt; {Harry Potter and the Chamber of Secrets}      0.002866102  0.9669421  72.82751
[3] {Four to Score,                                                                                                                  
     High Five,                                                                                                                      
     One for the Money,                                                                                                              
     Two for the Dough}                            =&gt; {Three To Get Deadly : A Stephanie Plum Novel} 0.002082211  0.9659091 161.59976
[4] {Four to Score,                                                                                                                  
     Seven Up,                                                                                                                       
     Three To Get Deadly : A Stephanie Plum Novel,                                                                                   
     Two for the Dough}                            =&gt; {High Five}                                    0.002057714  0.9655172 180.79975
[5] {High Five,                                                                                                                      
     Seven Up,                                                                                                                       
     Three To Get Deadly : A Stephanie Plum Novel,                                                                                   
     Two for the Dough}                            =&gt; {Four to Score}                                0.002057714  0.9655172 167.72062
[1] &quot;############################### end  161 Tue May  2 20:41:08 2017&quot;
[1] &quot;############################### start  162 Tue May  2 20:41:08 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00162_example_8.21_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.21 of section 8.2.3 
&gt; # (example 8.21 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Finding rules with restrictions 
&gt; 
&gt; brules &lt;- apriori(bookbaskets_use,
                 parameter =list(support = 0.001,       # Note: 1 
                                 confidence=0.6),
                 appearance=list(rhs=c(&quot;The Lovely Bones: A Novel&quot;),    # Note: 2 
                                 default=&quot;lhs&quot;))                        # Note: 3 
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.6    0.1    1 none FALSE            TRUE       5   0.001      1
 maxlen target   ext
     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 40 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[216031 item(s), 40822 transaction(s)] done [0.42s].
sorting and recoding items ... [3172 item(s)] done [0.03s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.22s].
writing ... [46 rule(s)] done [0.05s].
creating S4 object  ... done [0.07s].

&gt; summary(brules)
set of 46 rules

rule length distribution (lhs + rhs):sizes
 3  4 
44  2 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.000   3.000   3.000   3.043   3.000   4.000 

summary of quality measures:
    support           confidence          lift      
 Min.   :0.001004   Min.   :0.6000   Min.   :21.81  
 1st Qu.:0.001029   1st Qu.:0.6118   1st Qu.:22.24  
 Median :0.001102   Median :0.6258   Median :22.75  
 Mean   :0.001132   Mean   :0.6365   Mean   :23.14  
 3rd Qu.:0.001219   3rd Qu.:0.6457   3rd Qu.:23.47  
 Max.   :0.001396   Max.   :0.7455   Max.   :27.10  

mining info:
            data ntransactions support confidence
 bookbaskets_use         40822   0.001        0.6

&gt; ## set of 46 rules
&gt; ##
&gt; ## rule length distribution (lhs + rhs):sizes
&gt; ##  3  4
&gt; ## 44  2
&gt; ##
&gt; ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
&gt; ##   3.000   3.000   3.000   3.043   3.000   4.000
&gt; ##
&gt; ## summary of quality measures:
&gt; ##     support           confidence          lift
&gt; ##  Min.   :0.001004   Min.   :0.6000   Min.   :21.81
&gt; ##  1st Qu.:0.001029   1st Qu.:0.6118   1st Qu.:22.24
&gt; ##  Median :0.001102   Median :0.6258   Median :22.75
&gt; ##  Mean   :0.001132   Mean   :0.6365   Mean   :23.14
&gt; ##  3rd Qu.:0.001219   3rd Qu.:0.6457   3rd Qu.:23.47
&gt; ##  Max.   :0.001396   Max.   :0.7455   Max.   :27.10
&gt; ##
&gt; ## mining info:
&gt; ##             data ntransactions support confidence
&gt; ##  bookbaskets_use         40822   0.001        0.6
&gt; 
&gt; # Note 1: 
&gt; #   Relax the minimum support to 0.001 and the 
&gt; #   minimum confidence to 0.6. 
&gt; 
&gt; # Note 2: 
&gt; #   Only &quot;The Lovely Bones: A Novel&quot; is allowed 
&gt; #   to appear on the right side of the rules. 
&gt; 
&gt; # Note 3: 
&gt; #   By default, all the books can go into the 
&gt; #   left side of the rules. 
&gt; 
[1] &quot;############################### end  162 Tue May  2 20:41:09 2017&quot;
[1] &quot;############################### start  163 Tue May  2 20:41:09 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00163_example_8.22_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.22 of section 8.2.3 
&gt; # (example 8.22 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Inspecting rules 
&gt; 
&gt; brulesConf &lt;- sort(brules, by=&quot;confidence&quot;)   # Note: 1 

&gt; inspect(head(lhs(brulesConf), n=5))       # Note: 2 
    items                                                       
[1] {Divine Secrets of the Ya-Ya Sisterhood: A Novel,           
     Lucky : A Memoir}                                          
[2] {Lucky : A Memoir,                                          
     The Notebook}                                              
[3] {Lucky : A Memoir,                                          
     Wild Animus}                                               
[4] {Midwives: A Novel,                                         
     Wicked: The Life and Times of the Wicked Witch of the West}
[5] {Lucky : A Memoir,                                          
     Summer Sisters}                                            

&gt; ##   items
&gt; ## 1 {Divine Secrets of the Ya-Ya Sisterhood: A Novel,
&gt; ##    Lucky : A Memoir}
&gt; ## 2 {Lucky : A Memoir,
&gt; ##    The Notebook}
&gt; ## 3 {Lucky : A Memoir,
&gt; ##    Wild Animus}
&gt; ## 4 {Midwives: A Novel,
&gt; ##    Wicked: The Life and Times of the Wicked Witch of the West}
&gt; ## 5 {Lucky : A Memoir,
&gt; ##    Summer Sisters}
&gt; 
&gt; # Note 1: 
&gt; #   Sort the rules by confidence. 
&gt; 
&gt; # Note 2: 
&gt; #   Use the lhs() function to get the left 
&gt; #   itemsets of each rule; then inspect the top 
&gt; #   five. 
&gt; 
[1] &quot;############################### end  163 Tue May  2 20:41:10 2017&quot;
[1] &quot;############################### start  164 Tue May  2 20:41:10 2017&quot;
[1] &quot;#####  running  ../CodeExamples/c08_Unsupervised_methods/00164_example_8.23_of_section_8.2.3.R&quot;
[1] &quot;#####   in directory ../Bookdata&quot;

&gt; # example 8.23 of section 8.2.3 
&gt; # (example 8.23 of section 8.2.3)  : Unsupervised methods : Association rules : Mining association rules with the arules package 
&gt; # Title: Inspecting rules with restrictions 
&gt; 
&gt; brulesSub &lt;- subset(brules, subset=!(lhs %in% &quot;Lucky : A Memoir&quot;))    # Note: 1 

&gt; brulesConf &lt;- sort(brulesSub, by=&quot;confidence&quot;)

&gt; inspect(head(lhs(brulesConf), n=5))
    items                                                       
[1] {Midwives: A Novel,                                         
     Wicked: The Life and Times of the Wicked Witch of the West}
[2] {She&#39;s Come Undone,                                         
     The Secret Life of Bees,                                   
     Wild Animus}                                               
[3] {A Walk to Remember,                                        
     The Nanny Diaries: A Novel}                                
[4] {Beloved,                                                   
     The Red Tent}                                              
[5] {The Da Vinci Code,                                         
     The Reader}                                                

&gt; ##   items
&gt; ## 1 {Midwives: A Novel,
&gt; ##    Wicked: The Life and Times of the Wicked Witch of the West}
&gt; ## 2 {She&#39;s Come Undone,
&gt; ##    The Secret Life of Bees,
&gt; ##    Wild Animus}
&gt; ## 3 {A Walk to Remember,
&gt; ##    The Nanny Diaries: A Novel}
&gt; ## 4 {Beloved,
&gt; ##    The Red Tent}
&gt; ## 5 {The Da Vinci Code,
&gt; ##    The Reader}
&gt; 
&gt; # Note 1: 
&gt; #   Restrict to the subset of rules where 
&gt; #   &quot;Lucky : A Memoir&quot; is not on the left 
&gt; #   side. 
&gt; 
[1] &quot;############################### end  164 Tue May  2 20:41:10 2017&quot;</code></pre>