Skip to content

Latest commit

 

History

History
513 lines (460 loc) · 20.7 KB

rCh02.md

File metadata and controls

513 lines (460 loc) · 20.7 KB
source('runDir.R')
runDir('../CodeExamples/c02_Loading_data_into_R',
       '../Statlog',last=12)
[1] "############################### start  5 Fri Jun 17 10:32:27 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00005_example_2.1_of_section_2.1.1.R"
[1] "#####   in directory ../Statlog"

> # example 2.1 of section 2.1.1 
> # (example 2.1 of section 2.1.1)  : Loading data into R : Working with data from files : Working with well-structured data from files or URLs 
> # Title: Reading the UCI car data 
> 
> uciCar <- read.table(     # Note: 1 
    'http://www.win-vector.com/dfiles/car.data.csv',    # Note: 2 
    sep=',',    # Note: 3 
    header=T    # Note: 4 
    )

> # Note 1: 
> #   Command to read from a file or URL and store the result in a new data frame object 
> #   called 
> #   uciCar. 
> 
> # Note 2: 
> #   Filename or URL to get the data from. 
> 
> # Note 3: 
> #   Specify the column or field separator as a 
> #   comma. 
> 
> # Note 4: 
> #   Tell R to expect a header line that defines 
> #   the data column names. 
> 
[1] "############################### end  5 Fri Jun 17 10:32:27 2016"
[1] "############################### start  6 Fri Jun 17 10:32:27 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00006_example_2.2_of_section_2.1.1.R"
[1] "#####   in directory ../Statlog"

> # example 2.2 of section 2.1.1 
> # (example 2.2 of section 2.1.1)  : Loading data into R : Working with data from files : Working with well-structured data from files or URLs 
> # Title: Exploring the car data 
> 
> class(uciCar)
[1] "data.frame"

> ## [1] "data.frame"   # Note: 1 
> summary(uciCar)
   buying      maint       doors     persons     lug_boot    safety   
 high :432   high :432   2    :432   2   :576   big  :576   high:576  
 low  :432   low  :432   3    :432   4   :576   med  :576   low :576  
 med  :432   med  :432   4    :432   more:576   small:576   med :576  
 vhigh:432   vhigh:432   5more:432                                    
   rating    
 acc  : 384  
 good :  69  
 unacc:1210  
 vgood:  65  

> ##    buying      maint       doors
> ##  high :432   high :432   2    :432
> ##  low  :432   low  :432   3    :432
> ##  med  :432   med  :432   4    :432
> ##  vhigh:432   vhigh:432   5more:432
> ##
> ##  persons     lug_boot    safety
> ##  2   :576   big  :576   high:576
> ##  4   :576   med  :576   low :576
> ##  more:576   small:576   med :576
> ##
> ##    rating
> ##  acc  : 384
> ##  good :  69
> ##  unacc:1210
> ##  vgood:  65
> 
> dim(uciCar)
[1] 1728    7

> ## [1] 1728    7      # Note: 2
> 
> # Note 1: 
> #   The loaded object uciCar is of type data.frame. 
> 
> # Note 2: 
> #   The [1] is just an output sequence 
> #   marker. The actual information is this: uciCar has 
> #   1728 rows and 7 columns. Always try to confirm you 
> #   got a good parse by at least checking that the 
> #   number of rows is exactly one fewer than the 
> #   number of lines of text in the original file. The 
> #   difference of one is because the column header 
> #   counts as a line, but not as a data row. 
> 
[1] "############################### end  6 Fri Jun 17 10:32:27 2016"
[1] "############################### start  8 Fri Jun 17 10:32:27 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00008_example_2.3_of_section_2.1.2.R"
[1] "#####   in directory ../Statlog"

> # example 2.3 of section 2.1.2 
> # (example 2.3 of section 2.1.2)  : Loading data into R : Working with data from files : Using R on less-structured data 
> # Title: Loading the credit dataset 
> 
> d <- read.table(paste('http://archive.ics.uci.edu/ml/',
    'machine-learning-databases/statlog/german/german.data',sep=''),
    stringsAsFactors=F,header=F)

> print(d[1:3,])
   V1 V2  V3  V4   V5  V6  V7 V8  V9  V10 V11  V12 V13  V14  V15 V16  V17
1 A11  6 A34 A43 1169 A65 A75  4 A93 A101   4 A121  67 A143 A152   2 A173
2 A12 48 A32 A43 5951 A61 A73  2 A92 A101   2 A121  22 A143 A152   1 A173
3 A14 12 A34 A46 2096 A61 A74  2 A93 A101   3 A121  49 A143 A152   1 A172
  V18  V19  V20 V21
1   1 A192 A201   1
2   1 A191 A201   2
3   2 A191 A201   1
[1] "############################### end  8 Fri Jun 17 10:32:28 2016"
[1] "############################### start  9 Fri Jun 17 10:32:28 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00009_example_2.4_of_section_2.1.2.R"
[1] "#####   in directory ../Statlog"

> # example 2.4 of section 2.1.2 
> # (example 2.4 of section 2.1.2)  : Loading data into R : Working with data from files : Using R on less-structured data 
> # Title: Setting column names 
> 
> colnames(d) <- c('Status.of.existing.checking.account',
    'Duration.in.month',  'Credit.history', 'Purpose',
    'Credit.amount', 'Savings account/bonds',
    'Present.employment.since',
    'Installment.rate.in.percentage.of.disposable.income',
    'Personal.status.and.sex', 'Other.debtors/guarantors',
    'Present.residence.since', 'Property', 'Age.in.years',
    'Other.installment.plans', 'Housing',
    'Number.of.existing.credits.at.this.bank', 'Job',
    'Number.of.people.being.liable.to.provide.maintenance.for',
    'Telephone', 'foreign.worker', 'Good.Loan')

> d$Good.Loan <- as.factor(ifelse(d$Good.Loan==1,'GoodLoan','BadLoan'))

> print(d[1:3,])
  Status.of.existing.checking.account Duration.in.month Credit.history
1                                 A11                 6            A34
2                                 A12                48            A32
3                                 A14                12            A34
  Purpose Credit.amount Savings account/bonds Present.employment.since
1     A43          1169                   A65                      A75
2     A43          5951                   A61                      A73
3     A46          2096                   A61                      A74
  Installment.rate.in.percentage.of.disposable.income
1                                                   4
2                                                   2
3                                                   2
  Personal.status.and.sex Other.debtors/guarantors Present.residence.since
1                     A93                     A101                       4
2                     A92                     A101                       2
3                     A93                     A101                       3
  Property Age.in.years Other.installment.plans Housing
1     A121           67                    A143    A152
2     A121           22                    A143    A152
3     A121           49                    A143    A152
  Number.of.existing.credits.at.this.bank  Job
1                                       2 A173
2                                       1 A173
3                                       1 A172
  Number.of.people.being.liable.to.provide.maintenance.for Telephone
1                                                        1      A192
2                                                        1      A191
3                                                        2      A191
  foreign.worker Good.Loan
1           A201  GoodLoan
2           A201   BadLoan
3           A201  GoodLoan
[1] "############################### end  9 Fri Jun 17 10:32:28 2016"
[1] "############################### start  10 Fri Jun 17 10:32:28 2016"
[1] "#####  running  /Users/johnmount/Documents/work/PracticalDataScienceWithR/zmPDSwR/RunExamples/replacements/00010_example_2.5_of_section_2.1.2.R"
[1] "#####   in directory ../Statlog"

> # The example in the book was purposely ellipsized; here is the complete mapping table
> 
> mapping <- list('A11'='... < 0 DM',
  'A12'='0 <= ... < 200 DM',
  'A13'='... >= 200 DM / salary assignments for at least 1 year',
  'A14'='no checking account',
  'A30'='no credits taken/all credits paid back duly',
  'A31'='all credits at this bank paid back duly',
  'A32'='existing credits paid back duly till now',
  'A33'='delay in paying off in the past',
  'A34'='critical account/other credits existing (not at this bank)',
  'A40'='car (new)',
  'A41'='car (used)',
  'A42'='furniture/equipment',
  'A43'='radio/television',
  'A44'='domestic appliances',
  'A45'='repairs',
  'A46'='education',
  'A47'='(vacation - does not exist?)',
  'A48'='retraining',
  'A49'='business',
  'A410'='others',
  'A61'='... < 100 DM',
  'A62'='100 <= ... < 500 DM',
  'A63'='500 <= ... < 1000 DM',
  'A64'='.. >= 1000 DM',
  'A65'='unknown/ no savings account',
  'A71'='unemployed',
  'A72'='... < 1 year',
  'A73'='1 <= ... < 4 years',
  'A74'='4 <= ... < 7 years',
  'A75'='.. >= 7 years',
  'A91'='male : divorced/separated',
  'A92'='female : divorced/separated/married',
  'A93'='male : single',
  'A94'='male : married/widowed',
  'A95'='female : single',
  'A101'='none',
  'A102'='co-applicant',
  'A103'='guarantor',
  'A121'='real estate',
  'A122'='if not A121 : building society savings agreement/life insurance',
  'A123'='if not A121/A122 : car or other, not in attribute 6',
  'A124'='unknown / no property',
  'A141'='bank',
  'A142'='stores',
  'A143'='none',
  'A151'='rent',
  'A152'='own',
  'A153'='for free',
  'A171'='unemployed/ unskilled - non-resident',
  'A172'='unskilled - resident',
  'A173'='skilled employee / official',
  'A174'='management/ self-employed/highly qualified employee/ officer',
  'A191'='none',
  'A192'='yes, registered under the customers name',
  'A201'='yes',
  'A202'='no')
[1] "############################### end  10 Fri Jun 17 10:32:28 2016"
[1] "############################### start  11 Fri Jun 17 10:32:28 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00011_example_2.6_of_section_2.1.2.R"
[1] "#####   in directory ../Statlog"

> # example 2.6 of section 2.1.2 
> # (example 2.6 of section 2.1.2)  : Loading data into R : Working with data from files : Using R on less-structured data 
> # Title: Transforming the credit data 
> 
> for(i in 1:(dim(d))[2]) {                 # Note: 1 
    if(class(d[,i])=='character') {
       d[,i] <- as.factor(as.character(mapping[d[,i]]))     # Note: 2 
    }
 }

> # Note 1: 
> #   (dim(d))[2] is the number of columns 
> #   in the data frame d. 
> 
> # Note 2: 
> #   Note that the indexing operator [] is vectorized. Each step in the for loop remaps an 
> #   entire column of data through our list. 
> 
[1] "############################### end  11 Fri Jun 17 10:32:28 2016"
[1] "############################### start  12 Fri Jun 17 10:32:28 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00012_example_2.7_of_section_2.1.2.R"
[1] "#####   in directory ../Statlog"

> # example 2.7 of section 2.1.2 
> # (example 2.7 of section 2.1.2)  : Loading data into R : Working with data from files : Using R on less-structured data 
> # Title: Summary of Good.Loan and Purpose 
> 
> table(d$Purpose,d$Good.Loan) 
                     
                      BadLoan GoodLoan
  business                 34       63
  car (new)                89      145
  car (used)               17       86
  domestic appliances       4        8
  education                22       28
  furniture/equipment      58      123
  others                    5        7
  radio/television         62      218
  repairs                   8       14
  retraining                1        8

> ##                       BadLoan GoodLoan
> ##   business                 34       63
> ##   car (new)                89      145
> ##   car (used)               17       86
> ##   domestic appliances       4        8
> ##   education                22       28
> ##   furniture/equipment      58      123
> ##   others                    5        7
> ##   radio/television         62      218
> ##   repairs                   8       14
> ##   retraining                1        8
> 
[1] "############################### end  12 Fri Jun 17 10:32:28 2016"

Step 00016_example_2.11_of_section_2.2.2.R requires large files that we don't have the rights to redistribute in their entirety. So we are either running that step, or skipping it and just loading phsample.RData.

runDir('../../CodeExamples/c02_Loading_data_into_R',
       '../PUMS/loadExample',first=13,last=16)
[1] "############################### start  16 Fri Jun 17 10:32:28 2016"
[1] "#####  running  ../../CodeExamples/c02_Loading_data_into_R/00016_example_2.11_of_section_2.2.2.R"
[1] "#####   in directory ../PUMS/loadExample"

> # example 2.11 of section 2.2.2 
> # (example 2.11 of section 2.2.2)  : Loading data into R : Working with relational databases : Loading data from a database into R 
> # Title: Loading data into R from a relational database 
> 
> options( java.parameters = "-Xmx2g" )     # Note: 1 

> library(RJDBC)

Loading required package: DBI

Loading required package: rJava


> drv <- JDBC("org.h2.Driver",  # Note: 2 
    "h2-1.3.176.jar",   # Note: 3 
    identifier.quote="'")   # Note: 4 

> options<-";LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0"

> conn <- dbConnect(drv,paste("jdbc:h2:./H2DB",options,sep=''),"u","u")

> dhus <- dbGetQuery(conn,"SELECT * FROM hus WHERE ORIGRANDGROUP<=1")   # Note: 5 

> dpus <- dbGetQuery(conn,"SELECT pus.* FROM pus WHERE pus.SERIALNO IN \
    (SELECT DISTINCT hus.SERIALNO FROM hus \
    WHERE hus.ORIGRANDGROUP<=1)")   # Note: 6 

> dbDisconnect(conn)    # Note: 7 
[1] TRUE

> save(dhus,dpus,file='phsample.RData')     # Note: 8

> # Note 1: 
> #   Set Java option for extra memory before DB 
> #   drivers are loaded. 
> 
> # Note 2: 
> #   Specify the name of the database driver, same 
> #   as in our XML database configuration. 
> 
> # Note 3: 
> #   Specify where to find the implementation of 
> #   the database driver. 
> 
> # Note 4: 
> #   SQL column names with mixed-case 
> #   capitalization, special characters, or that 
> #   collide with reserved words must be quoted. We 
> #   specify single-quote as the quote we’ll use when 
> #   quoting column names, which may be different than the 
> #   quote we use for SQL literals. 
> 
> # Note 5: 
> #   Create a data frame called dhus from * 
> #   (everything) from the database table hus, taking 
> #   only rows where ORIGRANDGROUP <= 1. The 
> #   ORIGRANDGROUP column is a random integer from 0 
> #   through 999 that SQL Screwdriver adds to the rows 
> #   during data load to facilitate sampling. In this 
> #   case, we’re taking 2/1000 of the data rows to get 
> #   a small sample. 
> 
> # Note 6: 
> #   Create a data frame called dpus from the 
> #   database table pus, taking only records that have 
> #   a household ID in the set of household IDs we 
> #   selected from households table hus. 
> 
> # Note 7: 
> #   Disconnect from the database. 
> 
> # Note 8: 
> #   Save the two data frames into a file named 
> #   phsample.RData, which can be read in with load(). 
> #   Try help("save") or help("load") for more 
> #   details. 
> 
[1] "############################### end  16 Fri Jun 17 10:32:31 2016"
# load('../PUMS/phsample.RData')
runDir('../CodeExamples/c02_Loading_data_into_R',
       '../PUMS',first=17)
[1] "############################### start  17 Fri Jun 17 10:32:31 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00017_example_2.12_of_section_2.2.3.R"
[1] "#####   in directory ../PUMS"

> # example 2.12 of section 2.2.3 
> # (example 2.12 of section 2.2.3)  : Loading data into R : Working with relational databases : Working with the PUMS data 
> # Title: Selecting a subset of the Census data 
> 
> load('phsample.RData')

> psub = subset(dpus,with(dpus,(PINCP>1000)&(ESR==1)&
    (PINCP<=250000)&(PERNP>1000)&(PERNP<=250000)&
    (WKHP>=40)&(AGEP>=20)&(AGEP<=50)&
    (PWGTP1>0)&(COW %in% (1:7))&(SCHL %in% (1:24))))    # Note: 1

> # Note 1: 
> #   Subset of data rows matching detailed 
> #   employment conditions 
> 
[1] "############################### end  17 Fri Jun 17 10:32:31 2016"
[1] "############################### start  18 Fri Jun 17 10:32:31 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00018_example_2.13_of_section_2.2.3.R"
[1] "#####   in directory ../PUMS"

> # example 2.13 of section 2.2.3 
> # (example 2.13 of section 2.2.3)  : Loading data into R : Working with relational databases : Working with the PUMS data 
> # Title: Recoding variables 
> 
> psub$SEX = as.factor(ifelse(psub$SEX==1,'M','F'))     # Note: 1 

> psub$SEX = relevel(psub$SEX,'M')  # Note: 2 

> cowmap <- c("Employee of a private for-profit",
    "Private not-for-profit employee",
    "Local government employee",
    "State government employee",
    "Federal government employee",
    "Self-employed not incorporated",
    "Self-employed incorporated")

> psub$COW = as.factor(cowmap[psub$COW])    # Note: 3 

> psub$COW = relevel(psub$COW,cowmap[1])

> schlmap = c(      # Note: 4 
    rep("no high school diploma",15),
   "Regular high school diploma",
   "GED or alternative credential",
   "some college credit, no degree",
   "some college credit, no degree",
   "Associate's degree",
   "Bachelor's degree",
   "Master's degree",
   "Professional degree",
   "Doctorate degree")

> psub$SCHL = as.factor(schlmap[psub$SCHL])

> psub$SCHL = relevel(psub$SCHL,schlmap[1])

> dtrain = subset(psub,ORIGRANDGROUP >= 500)    # Note: 5 

> dtest = subset(psub,ORIGRANDGROUP < 500)      # Note: 6

> # Note 1: 
> #   Reencode sex from 1/2 to M/F. 
> 
> # Note 2: 
> #   Make the reference sex M, so F encodes a 
> #   difference from M in models. 
> 
> # Note 3: 
> #   Reencode class of worker info into a more 
> #   readable form. 
> 
> # Note 4: 
> #   Reencode education info into a more readable 
> #   form and fewer levels (merge all levels below high 
> #   school into same encoding). 
> 
> # Note 5: 
> #   Subset of data rows used for model 
> #   training. 
> 
> # Note 6: 
> #   Subset of data rows used for model 
> #   testing. 
> 
[1] "############################### end  18 Fri Jun 17 10:32:31 2016"
[1] "############################### start  19 Fri Jun 17 10:32:31 2016"
[1] "#####  running  ../CodeExamples/c02_Loading_data_into_R/00019_example_2.14_of_section_2.2.3.R"
[1] "#####   in directory ../PUMS"

> # example 2.14 of section 2.2.3 
> # (example 2.14 of section 2.2.3)  : Loading data into R : Working with relational databases : Working with the PUMS data 
> # Title: Summarizing the classifications of work 
> 
> summary(dtrain$COW)
Employee of a private for-profit      Federal government employee 
                             423                               21 
       Local government employee  Private not-for-profit employee 
                              39                               55 
      Self-employed incorporated   Self-employed not incorporated 
                              17                               16 
       State government employee 
                              24 

> ## Employee of a private for-profit      Federal government employee
> ##                              423                               21
> ##        Local government employee  Private not-for-profit employee
> ##                               39                               55
> ##       Self-employed incorporated   Self-employed not incorporated
> ##                               17                               16
> ##        State government employee
> ##                               24
> 
[1] "############################### end  19 Fri Jun 17 10:32:31 2016"