WebSciences · ghost · Apr 29, 2014 · Apr 29, 2014
diff --git a/2014-group28-PrzybylakHowellsDeng/.DS_Store b/2014-group28-PrzybylakHowellsDeng/.DS_Store
diff --git a/2014-group28-PrzybylakHowellsDeng/code/Models.R b/2014-group28-PrzybylakHowellsDeng/code/Models.R
@@ -0,0 +1,89 @@
+#____________________ Decision Trees
+
+
+controls <- rpart.control(cp = 0.001, minbucket = 5, minsplit = 1) # growth limits handed to rpart()
+parms <- list(split = "gini") # impurity measure used when choosing splits
+
+
+# Fit an rpart classification tree of the dependent variable on every column of df_learnAttributes.
+# ch_discrDependentVariable names the response column; controls and parms are forwarded to rpart().
+CreateClassificationTree <- function(ch_discrDependentVariable, df_learnAttributes, df_learnDependentVariable, controls, parms)
+{
+  df_data <- data.frame(cbind(df_learnDependentVariable, df_learnAttributes))
+  colnames(df_data) <- make.names(c(ch_discrDependentVariable, colnames(df_learnAttributes)), unique = TRUE, allow_ = TRUE)
+
+  # Use the sanitised column names on BOTH sides of the formula (the raw response name may not
+  # survive make.names) and index the name vector so a single attribute column still works.
+  model_formula <- reformulate(colnames(df_data)[-1], response = colnames(df_data)[1])
+  tree <- rpart(model_formula, data = df_data,
+                parms = parms, control = controls, method = "class")
+
+  return(tree)
+}
+
+
+# Prune `tree` by complexity parameter (marked "not used" by the author): pick the first cptable
+# row whose cross-validated error is within one xstd of the minimum xerror and prune at its CP.
+CPoptimal <- function(tree)
+{
+  cp_table <- tree$cptable
+  best_row <- which.min(cp_table[, "xerror"])  # first row attaining the minimum xerror
+  threshold <- cp_table[best_row, "xerror"] + cp_table[best_row, "xstd"]
+  # `<=` rather than `<`: the minimum row itself must always qualify, even when xstd is 0;
+  # with a strict comparison no row matches in that case and the row index is undefined.
+  optimalCP <- cp_table[which(cp_table[, "xerror"] <= threshold)[1], "CP"]
+
+  return(prune(tree, optimalCP))
+}
+
+
+#____________________ RFs
+
+
+# Fit a random forest of the dependent variable on every column of df_learnAttributes.
+# ntrees, mtrys and nodesizes are passed to randomForest() as ntree, mtry and nodesize.
+CreateRF <- function(ch_discrDependentVariable, df_learnAttributes, df_learnDependentVariable, ntrees,
+                     mtrys, nodesizes)
+{
+  df_data <- data.frame(cbind(df_learnDependentVariable, df_learnAttributes))
+  colnames(df_data) <- make.names(c(ch_discrDependentVariable, colnames(df_learnAttributes)), unique = TRUE, allow_ = TRUE)
+
+  # Build the formula from the sanitised column names (the raw response name may not survive
+  # make.names); indexing the name vector keeps a single attribute column from collapsing.
+  model_formula <- reformulate(colnames(df_data)[-1], response = colnames(df_data)[1])
+  rf <- randomForest(model_formula, na.action = na.omit, importance = TRUE, data = df_data,
+                     ntree = ntrees, mtry = mtrys, nodesize = nodesizes)
+
+  return(rf)
+}
+
+
+#____________________ Logistic Regression
+
+
+# Fit a binomial (logit) GLM of the dependent variable on every column of df_learnAttributes.
+# ch_discrDependentVariable names the response column.
+# Returns the fitted glm object.
+CreateLogistic <- function(ch_discrDependentVariable, df_learnAttributes, df_learnDependentVariable)
+{
+
+  df_data <- data.frame(cbind(df_learnDependentVariable, df_learnAttributes))
+  colnames(df_data) <- make.names(c(ch_discrDependentVariable, colnames(df_learnAttributes)), unique = TRUE, allow_ = TRUE)
+
+  # Build the formula from the sanitised column names (the raw response name may not survive
+  # make.names); indexing the name vector keeps a single attribute column from collapsing.
+  model_formula <- reformulate(colnames(df_data)[-1], response = colnames(df_data)[1])
+  logistic <- glm(model_formula, data = df_data, family = binomial(logit))
+
+  return(logistic)
+}
+
+
+#logistic <- CreateLogistic(colnames(labels), training_features, training_labels) # training labels have to be {0,1} for logistic regression!
+#tree <- CreateClassificationTree(colnames(MMM_label)[2], MMM_daily_features[1:20,c(2,3,4)], MMM_label[1:20,2], controls, parms)
+#plot(tree); text(tree,use.n=T,cex=.5)
+
+
+
+
+#table(prediction = class_predict , correct = MMM_label[21:32,2]) # Hitratio Matrix
diff --git a/2014-group28-PrzybylakHowellsDeng/code/ROC_functions.R b/2014-group28-PrzybylakHowellsDeng/code/ROC_functions.R
@@ -0,0 +1,102 @@
+# CODE adapted from : http://web.expasy.org/pROC/screenshots.html
+if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC") # install only when missing, not on every source()
+library(pROC)
+
+#data(aSAH)
+
+
+
+
+#_____________________________
+
+
+# Draw the ROC curve of `pred` against `test_DV` and shade two corrected partial AUCs over the
+# range `c_partial`: the first pass uses the default focus (blue), the second focuses on "se" (green).
+# test_DV, pred: passed to plot.roc() as (response, predictor).
+# c_partial: two-element range in percent, e.g. c(100, 80) as in the commented example below.
+pROC_AUC <- function(test_DV, pred, c_partial){
+
+  # printf-style label templates for the two printed pAUC values.
+  sp_pattern <- paste("Corrected pAUC (", c_partial[1],"-",c_partial[2]," %% SP):\n%.1f%%")
+  se_pattern <- paste("Corrected pAUC (", c_partial[1],"-",c_partial[2]," %% SE):\n%.1f%%")
+
+  # First pass: draw the curve itself together with the first pAUC polygon.
+  plot.roc(test_DV, pred,
+           main="Partial AUC (pAUC)",
+           percent=TRUE, # show all values in percent
+           partial.auc=c_partial, partial.auc.correct=TRUE,
+           print.auc=TRUE,
+           print.auc.pattern=sp_pattern,
+           print.auc.col="#1c61b6",
+           auc.polygon=TRUE, auc.polygon.col="#1c61b6", # the pAUC region
+           max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622") # the 100% region
+
+  # Second pass: overlay the sensitivity-focused pAUC on the existing plot.
+  plot.roc(test_DV, pred,
+           add=TRUE, type="n", # add to the plot without redrawing the ROC itself
+           percent=TRUE,
+           partial.auc=c_partial, partial.auc.correct=TRUE,
+           partial.auc.focus="se",
+           print.auc=TRUE,
+           print.auc.pattern=se_pattern,
+           print.auc.col="#008600",
+           print.auc.y=40, # keep this label from printing over the first one
+           auc.polygon=TRUE, auc.polygon.col="#008600",
+           max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
+
+}
+#pROC_AUC(aSAH$outcome, aSAH$s100b,c(100,80)) # c(response,predictor)
+
+#____________________________
+
+# Plot the ROC curve of `pred` against `test_DV`, titled `method`, with a confidence interval
+# computed at threshold `th`, which is also highlighted on the plot.
+# test_DV, pred: passed to plot.roc() as (response, predictor).
+# th: threshold selector, e.g. "best" as in the commented example below.
+# method: text used as the plot title.
+pROC_singlek <- function(test_DV, pred, th, method) {
+
+  plot.roc(test_DV, pred,
+           percent = TRUE,
+           main = method, # replaces the fixed title "Confidence interval of a threshold"
+           ci = TRUE,
+           of = "thresholds", # the CI is computed for thresholds
+           thresholds = th, # select the (best) threshold
+           print.thres = th, # mark the same threshold on the plot
+           grid = TRUE,
+           auc.polygon = TRUE)
+
+
+}
+
+
+#pROC_singlek(aSAH$outcome, aSAH$s100b,"best")
+
+#____________________________
+
+# ci.sp and ci.se over the range (0:100) return stochastically the same plot. They are uniquely determined.
+# The shaded area diverges a little from the CI for "best", because the ROC is sometimes constant! At this spot it happens to be. Is that the only reason?
+
+
+# Plot the ROC curve with the AUC and its confidence interval, overlay the sensitivity CI over
+# specificities 0 to 100 in steps of 5, then plot the CI at the "best" threshold.
+# test_DV, pred: passed to plot.roc() as (response, predictor).
+pROC_CI <- function(test_DV, pred) {
+
+  roc_curve <- plot.roc(test_DV, pred,
+                        main="Confidence intervals", percent=TRUE,
+                        ci=TRUE, # CI of the AUC, the default target
+                        print.auc=TRUE)
+
+  # Sensitivity CI on a fixed grid of specificities, drawn as a translucent blue shape.
+  spec_grid <- seq(0, 100, 5)
+  se_band <- ci.se(roc_curve, specificities=spec_grid)
+  plot(se_band, type="shape", col="#1c61b6AA")
+
+  # CI at the single "best" threshold, added last.
+  best_ci <- ci(roc_curve, of="thresholds", thresholds="best")
+  plot(best_ci)
+
+}
+
+#pROC_CI(aSAH$outcome, aSAH$s100b)
diff --git a/2014-group28-PrzybylakHowellsDeng/code/chart.py b/2014-group28-PrzybylakHowellsDeng/code/chart.py
@@ -0,0 +1,99 @@
+import statsmodels
+import os
+import numpy as np
+import pandas as pd
+
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+import datetime as dt
+
+from sklearn.preprocessing import scale
+from sklearn.preprocessing import Imputer
+from sklearn.preprocessing import normalize
+
+def nan_helper(y):
+    """Helper to handle indices and logical indices of NaNs.
+
+    Input:
+        - y, 1d numpy array with possible NaNs
+    Output:
+        - nans, logical indices of NaNs
+        - index, a function, with signature indices= index(logical_indices),
+          to convert logical indices of NaNs to 'equivalent' indices
+    Example:
+        >>> # linear interpolation of NaNs
+        >>> nans, x= nan_helper(y)
+        >>> y[nans]= np.interp(x(nans), x(~nans), y[~nans])
+    """
+
+    return np.isnan(y), lambda z: z.nonzero()[0]
+
+os.chdir('/Users/jonathanhowells/Dropbox/IRDM-Twitter/alg1/')
+
+companies = ['boeing', '3m', 'caterpillar']
+
+for company in companies:
+    folder = 'data/' + company + '/'
+    data = pd.read_csv(folder + company + 'share.csv')
+    y=data['Adj Close'].values
+    date = data['Date'].values
+
+
+    negative = data['Negative'].values
+    neutral = data['Neutral'].values
+    positive = data['Positive'].values
+
+    nans, x= nan_helper(negative)
+    negative[nans]= np.interp(x(nans), x(~nans), negative[~nans])
+
+    nans, x= nan_helper(neutral)
+    neutral[nans]= np.interp(x(nans), x(~nans), neutral[~nans])
+
+    nans, x= nan_helper(positive)
+    positive[nans]= np.interp(x(nans), x(~nans), positive[~nans])
+
+    nans, x= nan_helper(y)
+    y[nans]= np.interp(x(nans), x(~nans), y[~nans])
+
+
+    sentiment = np.column_stack((negative,neutral,positive))
+
+    sentiment
+
+    sentiment_score = []
+
+    for i in range(sentiment.shape[0]):
+        score = (sentiment[i,2] - sentiment[i,0])/(sum(sentiment[i,:]))
+        sentiment_score.append(score)
+
+    sentiment_score
+
+
+    x = [dt.datetime.strptime(d,'%d/%m/%Y').date() for d in date]
+
+    fig, ax1 = plt.subplots()
+    y_norm = scale(y)
+
+    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%d/%m/%Y'))
+    plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
+
+    ax1.plot(x,sentiment_score,'r-')
+    ax1.set_xlabel('Date')
+    ax1.set_ylabel('Sentiment', color='b')
+
+    for tl in ax1.get_yticklabels():
+        tl.set_color('b')
+
+    ax2 = ax1.twinx()
+    ax2.plot(x, y, 'b-')
+    ax2.set_ylabel('Share Price', color='r')
+    for tl in ax2.get_yticklabels():
+        tl.set_color('r')
+
+    title = company + ' Share Price and Sentiment'
+    plt.title(title)
+    plt.legend()
+    plt.grid()
+    plt.gcf().autofmt_xdate()
+    plt.savefig(company + "graph")
+    plt.show()