quantargo · BalintKomjati · May 8, 2019 · May 8, 2019 · May 8, 2019 · May 8, 2019
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,3 @@
+^test\.Rproj$
+^\.Rproj\.user$
+^README\.Rmd$
diff --git a/.Rhistory b/.Rhistory
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: bmarketing
+Title: Tools for Analyzing Datasets of Banking Customers
+Version: 0.0.0.9000
+Authors@R: person("First", "Last", email = "[email protected]", role = c("aut", "cre"))
+Description: Provides functions for cleaning, transforming, modelling and reporting on datasets of banking customers.
+License: What license is it under?
+Encoding: UTF-8
+LazyData: true
+Imports: tidyverse, 
+         rpart,
+         rpart.plot,
+         dplyr,
+         nortest
+RoxygenNote: 6.1.1
+
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+export(calcPerformance)
+export(clean)
+export(fitModel)
+export(predictByModel)
+export(transform)
diff --git a/R/Transform.R b/R/Transform.R
@@ -0,0 +1,45 @@
+#' Log-transform and standardize the numeric columns of a data frame
+#'
+#' Every numeric column whose non-missing values are all strictly positive is
+#' processed in two steps: \cr
+#' - if \code{logTransform} is \code{TRUE} and an Anderson-Darling test
+#'   (\code{nortest::ad.test}) rejects normality at the 5\% level, the column
+#'   is replaced by its natural logarithm and named in a warning; \cr
+#' - the column is then centred and scaled with \code{scale}. \cr
+#' All other columns are returned unchanged. Note that this function masks
+#' \code{base::transform} while the package is attached.
+#'
+#' @param input A data frame.
+#' @param logTransform Logical; log-transform non-normal columns before
+#'   standardizing? Defaults to \code{TRUE}.
+#' @return A data frame with the same columns as \code{input}.
+#' @examples
+#' df <- data.frame(a = c(-1, 0, 1), b = c(10, 20, 60), c = c("x", "y", "z"))
+#' transform(df)
+#' @export
+transform <- function(input, logTransform = TRUE) {
+  columns <- as.list(input)
+  logged <- logical(length(columns))
+  for (i in seq_along(columns)) {
+    x <- columns[[i]]
+    observed <- x[!is.na(x)]
+    # Leave non-numeric, all-NA and non-positive columns untouched. Dropping
+    # NAs first keeps min() from returning NA, which would make `if` fail.
+    if (!is.numeric(x) || length(observed) == 0 || min(observed) <= 0) {
+      next
+    }
+    # ad.test() stops on fewer than 8 observations, so shorter columns are
+    # standardized without a normality test.
+    if (logTransform && length(observed) >= 8 &&
+        nortest::ad.test(observed)$p.value < .05) {
+      x <- log(x)
+      logged[i] <- TRUE
+    }
+    columns[[i]] <- scale(x)
+  }
+  if (any(logged)) {
+    warning("The following columns were log transformed: ",
+            paste(names(columns)[logged], collapse = ", "), call. = FALSE)
+  }
+  as.data.frame(columns)
+}
diff --git a/R/bmarketing.R b/R/bmarketing.R
@@ -0,0 +1,14 @@
+#' bmarketing: A package for analyzing datasets of banking customers.
+#'
+#' The bmarketing package exports five functions:
+#' clean, transform, fitModel, predictByModel and calcPerformance.
+#' 
+#' @section bmarketing functions: \cr
+#' - clean: A function to clean data (clean NA values, basic checks)\cr
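+#' - transform: A function to log transform and standardize numeric values\cr
+#' - calcPerformance: A function to report the confusion matrix, sensitivity, specificity, precision and accuracy of predictions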
+#'
+#' @docType package
+#' @name bmarketing
+#' 
+NULL
diff --git a/R/calcPerformance.R b/R/calcPerformance.R
@@ -0,0 +1,68 @@
+#' Report classification accuracy measures for a binary classifier
+#'
+#' Builds the confusion matrix of observed versus predicted classes, prints
+#' it together with sensitivity, specificity, precision and accuracy, and
+#' returns both. The second class (second factor level, or the second value
+#' in sorted order for non-factors) is treated as the positive class.
+#'
+#' @param y Observed values of the target variable (factor, character or
+#'   numeric).
+#' @param y_pred Predicted values of the target variable, same length as
+#'   \code{y}.
+#' @return A list with elements \code{cm} (the 2 x 2 confusion matrix) and
+#'   \code{res} (a data frame of the measures, in percent).
+#' @examples
+#' y_example      <- c(0, 1, 1, 0)
+#' y_pred_example <- c(1, 1, 1, 0)
+#' results <- calcPerformance(y = y_example, y_pred = y_pred_example)
+#' @export
+calcPerformance <- function(y, y_pred) {
+
+  if (length(y) != length(y_pred)) {
+    stop("y and y_pred do not have the same number of observations")
+  }
+
+  if (any(is.na(y))) {
+    stop("y contains value NA(s)")
+  }
+
+  if (any(is.na(y_pred))) {
+    stop("y_pred contains value NA(s)")
+  }
+
+  # Tabulate both vectors over one shared set of classes so the confusion
+  # matrix stays 2 x 2 even when a class is never predicted (or observed).
+  if (is.factor(y) || is.factor(y_pred)) {
+    classes <- union(levels(y), levels(y_pred))
+    observed <- sort(unique(c(as.character(y), as.character(y_pred))))
+    classes <- union(classes, observed)
+  } else {
+    classes <- as.character(sort(unique(c(y, y_pred))))
+  }
+  if (length(classes) != 2) {
+    stop("y and y_pred must contain exactly two classes in total")
+  }
+  cm <- table(y = factor(y, levels = classes),
+              y_pred = factor(y_pred, levels = classes))
+
+  # Compare as character: `==` fails on factors with different level sets.
+  accuracy <- mean(as.character(y) == as.character(y_pred))
+
+  res <- data.frame(
+    test = c("TPR", "TNR", "Precision", "Accuracy"),
+    value = c(round(100 * cm[2, 2] / (cm[2, 2] + cm[2, 1]), 3),
+              round(100 * cm[1, 1] / (cm[1, 1] + cm[1, 2]), 3),
+              round(100 * cm[2, 2] / (cm[2, 2] + cm[1, 2]), 3),
+              round(100 * accuracy, 3))
+  )
+
+  print("Confusion matrix")
+  print(cm)
+  print("")
+  print(paste("True Positive Rate (Sensitivity):", res[1, 2], "%"))
+  print(paste("True Negative Rate (Specificity):", res[2, 2], "%"))
+  print(paste("Precision:", res[3, 2], "%"))
+  print(paste("Accuracy:", res[4, 2], "%"))
+
+  list(cm = cm, res = res)
+}
diff --git a/R/clean.R b/R/clean.R
@@ -0,0 +1,66 @@
+#' Clean a dataset before modelling
+#'
+#' Performs the following checks and clean-up steps: \cr
+#' - returns an error if the target variable contains any missing values
+#'   (NAs); \cr
+#' - warns if any other variable contains NAs, naming the affected columns; \cr
+#' - removes all columns in which more than 50\% of the values are NA and
+#'   reports them in a warning; \cr
+#' - optionally replaces the remaining NAs of numeric columns by the column
+#'   mean and reports the imputed columns in a warning.
+#'
+#' @param x A dataframe
+#' @param t The name of the target variable column of dataframe x
+#' @param meanimpute Logical; if \code{TRUE}, NAs in numeric columns that are
+#'   kept are replaced by the column mean. Defaults to \code{FALSE}.
+#' @return The cleaned dataframe.
+#' @examples
+#' raw <- data.frame(y = c(0, 1, 0, 1), a = c(1, NA, 3, 4),
+#'                   b = c(NA, NA, NA, 1))
+#' cleaned_data <- clean(raw, "y", meanimpute = TRUE)
+#' @export
+clean <- function(x, t, meanimpute = FALSE) {
+  if (!is.data.frame(x)) {
+    stop("Parameter x must be a dataframe")
+  }
+  if (!is.character(t) || length(t) != 1 || !(t %in% names(x))) {
+    stop("Parameter t must be the name (string) of a column in the dataframe")
+  }
+  if (any(is.na(x[[t]]))) {
+    stop(paste("The target variable", t, "contains NA values"))
+  }
+
+  # Share of NAs per column; zero-row inputs give NaN, which is treated as
+  # "no NAs" below.
+  na_share <- vapply(x, function(column) mean(is.na(column)), numeric(1))
+  has_na <- !is.na(na_share) & na_share > 0
+  too_sparse <- !is.na(na_share) & na_share > 0.5
+
+  # The target is known to be NA-free here, so every flagged column is an
+  # explanatory variable.
+  if (any(has_na)) {
+    warning("Explanatory variables contain NA values: ",
+            paste(names(x)[has_na], collapse = ", "))
+  }
+
+  if (meanimpute) {
+    # Only impute columns that survive the removal step.
+    impute <- which(has_na & !too_sparse & vapply(x, is.numeric, logical(1)))
+    for (i in impute) {
+      x[[i]][is.na(x[[i]])] <- mean(x[[i]], na.rm = TRUE)
+    }
+    if (length(impute) > 0) {
+      warning(paste("The following columns were meanimputed: ",
+                    paste(names(x)[impute], collapse = ", ")))
+    }
+  }
+
+  if (!any(too_sparse)) {
+    return(x)
+  }
+  warning(paste("The following columns are removed: ",
+                paste(names(x)[too_sparse], collapse = ", ")))
+  # Select by logical mask: comparing names with `==` against several column
+  # names recycles and drops the wrong columns.
+  x[, !too_sparse, drop = FALSE]
+}
diff --git a/R/fitModel.R b/R/fitModel.R
@@ -0,0 +1,36 @@
+#' Fit a logistic regression or a decision tree classifier
+#'
+#' Builds the model formula from \code{y} and \code{explVars} and fits either
+#' a binomial \code{glm} or a classification tree (\code{rpart::rpart}).
+#'
+#' @param data data.frame used for model fitting
+#' @param y name of the target variable (quoted character)
+#' @param modelType string, name of the requested model type: either 'Logistic' or 'DecisionTree'
+#' @param explVars either NULL (use all other columns of \code{data}) or a
+#'   character vector containing the names of the explanatory variables
+#' @return The fitted model object (class \code{glm} or \code{rpart})
+#' @examples
+#' df <- data.frame(y = c(0, 1, 1, 0, 1, 0), b = c(12, 121, 11, 12, 90, 40))
+#' results <- fitModel(data = df, y = 'y', modelType = 'Logistic', explVars = 'b')
+#' @export
+fitModel <- function(data, y, modelType, explVars = NULL) {
+
+  if (!(modelType %in% c("Logistic", "DecisionTree"))) {
+    stop("Unknown model type")
+  }
+
+  # Use every other column as predictor unless a variable list is given.
+  rhs <- if (is.null(explVars)) "." else paste(explVars, collapse = "+")
+  modelFormula <- as.formula(paste(y, "~", rhs))
+
+  if (modelType == "DecisionTree") {
+    # method = "class" forces a classification tree even for a 0/1 numeric
+    # target; predictByModel() requests class predictions from the tree.
+    fit <- rpart::rpart(modelFormula, data = data, method = "class")
+  } else {
+    fit <- glm(modelFormula, data = data, family = "binomial")
+  }
+
+  # Return visibly; ending on the assignment returned the model invisibly.
+  fit
+}
diff --git a/R/predictByModel.R b/R/predictByModel.R
@@ -0,0 +1,33 @@
+#' Predict classes from a DecisionTree or Logistic model
+#'
+#' @param data dataset to be used for prediction
+#' @param model2Predict model which the predictions are based on, as returned
+#'   by \code{fitModel}
+#' @param modelType type of the model: either 'Logistic' or 'DecisionTree'
+#' @return A factor with one predicted class per row of \code{data}. For
+#'   'Logistic' models the levels are "no" and "yes" ("yes" when the predicted
+#'   probability exceeds 0.5).
+#' @examples
+#' df <- data.frame(y = c(0, 1, 1, 0, 1, 0), b = c(12, 121, 11, 12, 90, 40))
+#' model <- fitModel(data = df, y = 'y', modelType = 'Logistic', explVars = 'b')
+#' predictByModel(data = df, model2Predict = model, modelType = 'Logistic')
+#' @export
+predictByModel <- function(data, model2Predict, modelType) {
+
+  if (!(modelType %in% c("Logistic", "DecisionTree"))) {
+    stop("Unknown model type")
+  }
+
+  # predict() takes the new observations as `newdata`; an argument named
+  # `data` is silently ignored and the training data would be predicted.
+  if (modelType == "Logistic") {
+    prob <- predict(object = model2Predict, newdata = data, type = "response")
+    # Fix both levels up front so the result is valid even when only one
+    # class is predicted.
+    pred <- factor((prob > 0.5) * 1, levels = c(0, 1), labels = c("no", "yes"))
+  } else {
+    pred <- predict(object = model2Predict, newdata = data, type = "class")
+  }
+
+  pred
+}
diff --git a/README.Rmd b/README.Rmd
@@ -4,20 +4,66 @@ output: github_document
 
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 
-```{r, echo = FALSE}
+```{r, include = FALSE}
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
-  fig.path = "man/figures/README-"
+  fig.path = "man/figures/README-",
+  out.width = "100%"
 )
+library(bmarketing)
+library(rpart)
+library(rpart.plot)
 ```
+# bmarketing
 
-[![Travis Build Status](https://travis-ci.org/Quantargo/bmarketing.svg?branch=master)](https://travis-ci.org/Quantargo/bmarketing)
-[![Coverage Status](https://img.shields.io/codecov/c/github/Quantargo/bmarketing/master.svg)](https://codecov.io/github/Quantargo/bmarketing?branch=master)
+<!-- badges: start -->
+<!-- badges: end -->
 
-## Overview
+The goal of bmarketing is to provide functions useful for data cleansing, modelling and reporting tasks.
 
-The bmarketing dataset
+## Installation
 
-<!-- TODO: Change README to make it more descriptive, add examples, etc. -->
+You can install the released version of bmarketing from [GitHub](https://github.com/BalintKomjati/bmarketing) with:
+
+```{r, eval = FALSE}
+devtools::install_github("BalintKomjati/bmarketing")
+library(bmarketing)
+```
+
+## Example
+
+A basic workflow for using the package is the following:
+
+1) Import the package
+
+```{r}
+library(bmarketing)
+```
+
+2) Import the data you want to analyse, for example:
+
+```{r}
+bmarketing <- read.csv2("inst/bmarketing.csv",dec = ".")
+```
+
+3) Do some data cleansing with function clean()
+
+```{r}
+bmarketing <- clean(x = bmarketing, t = "y")
+```
+
+4) Fit a Decision Tree, plot the results, give predictions:
+
+```{r}
+dt_model <- fitModel(data = bmarketing, y = 'y', modelType = 'DecisionTree')
+rpart.plot(dt_model)
+
+predictions <- predictByModel(data = bmarketing, model2Predict = dt_model, modelType = "DecisionTree")
+```
+
+5) Finally, you can create a report for model performance:
+```{r}
+results <- calcPerformance(y = bmarketing$y, y_pred = predictions)
+```