Commit 64c7d2c
finished conversion to transfusion project
jkarlenmm committed Aug 1, 2018
1 parent c63493c commit 64c7d2c
Showing 3 changed files with 43 additions and 53 deletions.
63 changes: 21 additions & 42 deletions {{ cookiecutter.repo_name }}/src/data/make_dataset.py
@@ -1,68 +1,47 @@
 import os
+import io
+import requests
 import pandas as pd
 from pathlib import Path
-from sklearn.preprocessing import StandardScaler

-DATA_LINK = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
-TITLES = ['Mlle', 'Mrs', 'Mr', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Col', 'Capt', 'Countess']
+DATA_LINK = 'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data'
 ROOT = Path(__file__).resolve().parents[2]


-def extract_title(name):
-    title = 'missing'
-    for item in TITLES:
-        if item in name:
-            title = item
-            break
-    if title == 'missing':
-        title = 'Mr'
-    return title
-
-
 def massage_data(raw_data):
-    """ Preprocess the data for predictions
+    """
+    preprocess the data for predictions
     """
-    # Feature engineering ---
-    raw_data["title"] = raw_data.apply(lambda row: extract_title(row["name"]), axis=1)
-
-    # Age: replace NaN with median
-    raw_data["age"].fillna(raw_data.age.median(), inplace=True)
-
-    # Embarked: replace NaN with the mode value
-    raw_data["embarked"].fillna(raw_data.embarked.mode()[0], inplace=True)
+    raw_data.rename(index=str, columns={"whether he/she donated blood in March 2007": "label"}, inplace=True)
+
+    # generate features for year for time columns
+    for x, y in zip(['time_years', 'recency_years'], ['Time (months)', 'Recency (months)']):
+        raw_data[x] = (raw_data[y] / 12).astype('int')
+
+    # generate features for quarter for time columns (3 month periods)
+    for x, y in zip(['time_quarters', 'recency_quarters'], ['Time (months)', 'Recency (months)']):
+        raw_data[x] = (raw_data[y] / 3).astype('int')
-
-    # Fare: replace NaN with median
-    raw_data["fare"].fillna(raw_data.fare.median(), inplace=True)
-
-    # Encode Categorical features ---
-    raw_data["cabin"] = raw_data.apply(lambda obs: "No" if pd.isnull(obs['cabin']) else "Yes", axis=1)  # binarize "cabin" feature
-    raw_data = pd.get_dummies(raw_data, columns=['sex', 'title', 'cabin', 'embarked'])
-
-    # Scaling numerical features ---
-    scale = StandardScaler().fit(raw_data[['age', 'fare']])
-    raw_data[['age', 'fare']] = scale.transform(raw_data[['age', 'fare']])
     return raw_data


 def dump_data(data, out_loc):
-    """
-    given a path to a datafile, either a local file path
+    """ Given a path to a datafile, either a local file path
     or a url, fetch the data and dump it to a csv
     """
     out_dir = os.path.join(ROOT, out_loc)
     data.to_csv(out_dir, index=False)


 def main():
-    """ Runs data processing scripts to turn raw data from (../raw) into
+    """ Retrieves data and runs processing scripts to turn raw data from (../raw) into
         cleaned data ready to be analyzed (saved in ../processed).
     """
-    raw_data = pd.read_csv(DATA_LINK)
-    dump_data(raw_data, 'data/raw/titanic.csv')
+    s = requests.get(DATA_LINK).content
+    raw_data = pd.read_csv(io.StringIO(s.decode('utf-8')))
+
+    dump_data(raw_data, 'data/raw/transfusion_data_raw.csv')
     processed_data = massage_data(raw_data)
-    dump_data(processed_data, 'data/processed/titanic.csv')
+    dump_data(processed_data, 'data/processed/transfusion_data.csv')


 if __name__ == '__main__':
-    main()
+    main()
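Note: the new massage_data() keeps the raw columns and only derives coarser time features via integer truncation. A minimal sketch of what the added lines compute, using two made-up rows under the dataset's published UCI column names (values are illustrative, not taken from the real data):

import pandas as pd

sample = pd.DataFrame({
    'Recency (months)': [2, 23],
    'Frequency (times)': [50, 3],
    'Monetary (c.c. blood)': [12500, 750],
    'Time (months)': [98, 38],
    'whether he/she donated blood in March 2007': [1, 0],
})
sample.rename(index=str, columns={'whether he/she donated blood in March 2007': 'label'}, inplace=True)

# Same derivation as the diff: months divided by 12 (years) or 3 (quarters),
# then truncated to int, so 98 months -> 8 years and 32 quarters.
for x, y in zip(['time_years', 'recency_years'], ['Time (months)', 'Recency (months)']):
    sample[x] = (sample[y] / 12).astype('int')
for x, y in zip(['time_quarters', 'recency_quarters'], ['Time (months)', 'Recency (months)']):
    sample[x] = (sample[y] / 3).astype('int')

print(sample[['time_years', 'recency_years', 'time_quarters', 'recency_quarters']])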
10 changes: 7 additions & 3 deletions {{ cookiecutter.repo_name }}/src/models/predict_model.py
@@ -6,19 +6,23 @@


 def retrieve_model():
-    pickled_model = os.path.join(ROOT, 'models/titanic.model')
+    """retrieve the pickled model object
+    """
+    pickled_model = os.path.join(ROOT, 'models/transfusion.model')
     with open(pickled_model, 'rb') as fin:
         return(pickle.load(fin))


 def main():
+    """ retrieve the model and predict labels. Show prediction and performance
+    """
     deserialized_model = retrieve_model()
     X_test = pd.read_csv(os.path.join(ROOT,
-                         'data/processed/titanic_x_test.csv'))
+                         'data/processed/transfusion_x_test.csv'))
     y_pred = deserialized_model.predict(X_test)

     y_test = pd.read_csv(os.path.join(ROOT,
-                         'data/processed/titanic_y_test.csv'), header=None)
+                         'data/processed/transfusion_y_test.csv'), header=None)
     print(f'The model returned these predictions:\n{y_pred}')

     auc = roc_auc_score(y_test.astype(int), deserialized_model.predict_proba(X_test)[:, 1])
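Note: the AUC line above feeds roc_auc_score the positive-class column of predict_proba. A self-contained sketch of that metric call, with toy numbers rather than the project's data:

from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; column [:, 1] is the model's
# probability of the positive class, which ROC AUC uses to rank examples.
y_true = [0, 0, 1, 1]
positive_scores = [0.1, 0.4, 0.35, 0.8]
print(roc_auc_score(y_true, positive_scores))  # 0.75: 3 of 4 positive/negative pairs ranked correctly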
23 changes: 15 additions & 8 deletions {{ cookiecutter.repo_name }}/src/models/train_model.py
@@ -7,39 +7,46 @@

 ROOT = Path(__file__).resolve().parents[2]

-if '{{ cookiecutter.python_interpreter }}' == 'python3':
+if 'python3' == 'python3':
     PROTOCOL = pickle.DEFAULT_PROTOCOL
 else:
     PROTOCOL = 2


 def fetch_processed(data_path):
+    """
+    fetch the data that was processed in make data
+    """
     data = pd.read_csv(os.path.join(ROOT, data_path))
-    data_y = data.survived
-    data_x = data.drop(['survived', 'name', 'ticket', 'boat',
-                        'body', 'home.dest'], axis=1)
+    data_y = data.label
+    data_x = data.drop(['label'], axis=1)
     # Create training and test sets
     X_train, X_test, y_train, y_test = train_test_split(data_x, data_y,
                                                         test_size=0.2, random_state=0)
     return X_train, X_test, y_train, y_test


 def fit_model(X_train, y_train):
+    """
+    fit a model to the training data
+    """
     model = RandomForestClassifier(n_estimators=100)
     # Fit to the training data
     model.fit(X_train, y_train)
     return model


 def main():
-    x_train, x_test, y_train, y_test = fetch_processed('data/processed/titanic.csv')
+    """ Trains the model on the retrieved data and writes it back to file
+    """
+    x_train, x_test, y_train, y_test = fetch_processed('data/processed/transfusion_data.csv')
     # Train the model
     model = fit_model(x_train, y_train)

     # Paths for storage
-    model_out_dir = os.path.join(ROOT, 'models/titanic.model')
-    x_test_path = os.path.join(ROOT, 'data/processed/titanic_x_test.csv')
-    y_test_path = os.path.join(ROOT, 'data/processed/titanic_y_test.csv')
+    model_out_dir = os.path.join(ROOT, 'models/transfusion.model')
+    x_test_path = os.path.join(ROOT, 'data/processed/transfusion_x_test.csv')
+    y_test_path = os.path.join(ROOT, 'data/processed/transfusion_y_test.csv')

     # Store model and test set for prediction
     with open(model_out_dir, 'wb') as fout:
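Note: the diff is cut off just after the `with open(model_out_dir, 'wb') as fout:` line, so the exact dump call is not shown. A hypothetical round trip following the same pattern; the pickle.dump arguments are an assumption based on the PROTOCOL constant defined above, not the commit's code:

import pickle
from sklearn.ensemble import RandomForestClassifier

PROTOCOL = pickle.DEFAULT_PROTOCOL  # the template falls back to protocol 2 under Python 2

model = RandomForestClassifier(n_estimators=100)
model.fit([[0, 1], [1, 0]], [0, 1])  # toy fit so the estimator is in a usable, picklable state

with open('transfusion.model', 'wb') as fout:  # hypothetical local path
    pickle.dump(model, fout, protocol=PROTOCOL)

with open('transfusion.model', 'rb') as fin:
    restored = pickle.load(fin)
print(restored.predict([[0, 1]]))  # the restored model predicts like the original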
