Commit 64c7d2c
finished conversion to transfusion project
jkarlenmm committed Aug 1, 2018
1 parent c63493c commit 64c7d2c
Showing 3 changed files with 43 additions and 53 deletions.
63 changes: 21 additions & 42 deletions {{ cookiecutter.repo_name }}/src/data/make_dataset.py
@@ -1,68 +1,47 @@
 import os
+import io
+import requests
 import pandas as pd
 from pathlib import Path
-from sklearn.preprocessing import StandardScaler

-DATA_LINK = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
-TITLES = ['Mlle', 'Mrs', 'Mr', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Col', 'Capt', 'Countess']
+DATA_LINK = 'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data'
 ROOT = Path(__file__).resolve().parents[2]


-def extract_title(name):
-    title = 'missing'
-    for item in TITLES:
-        if item in name:
-            title = item
-            break
-    if title == 'missing':
-        title = 'Mr'
-    return title
-
-
 def massage_data(raw_data):
-    """ Preprocess the data for predictions
+    """
+    preprocess the data for predictions
     """
-    # Feature engineering ---
-    raw_data["title"] = raw_data.apply(lambda row: extract_title(row["name"]), axis=1)
-
-    # Age: replace NaN with median
-    raw_data["age"].fillna(raw_data.age.median(), inplace=True)
-
-    # Embarked: replace NaN with the mode value
-    raw_data["embarked"].fillna(raw_data.embarked.mode()[0], inplace=True)
+    raw_data.rename(index=str, columns={"whether he/she donated blood in March 2007": "label"}, inplace=True)
+
+    # generate features for year for time columns
+    for x, y in zip(['time_years', 'recency_years'], ['Time (months)', 'Recency (months)']):
+        raw_data[x] = (raw_data[y] / 12).astype('int')
+
+    # generate features for quarter for time columns (3 month periods)
+    for x, y in zip(['time_quarters', 'recency_quarters'], ['Time (months)', 'Recency (months)']):
+        raw_data[x] = (raw_data[y] / 3).astype('int')
-
-    # Fare: replace NaN with median
-    raw_data["fare"].fillna(raw_data.fare.median(), inplace=True)
-
-    # Encode Categorical features ---
-    raw_data["cabin"] = raw_data.apply(lambda obs: "No" if pd.isnull(obs['cabin']) else "Yes", axis=1)  # binarize "cabin" feature
-    raw_data = pd.get_dummies(raw_data, columns=['sex', 'title', 'cabin', 'embarked'])
-
-    # Scaling numerical features ---
-    scale = StandardScaler().fit(raw_data[['age', 'fare']])
-    raw_data[['age', 'fare']] = scale.transform(raw_data[['age', 'fare']])
     return raw_data


 def dump_data(data, out_loc):
-    """
-    given a path to a datafile, either a local file path
+    """ Given a path to a datafile, either a local file path
     or a url, fetch the data and dump it to a csv
     """
     out_dir = os.path.join(ROOT, out_loc)
     data.to_csv(out_dir, index=False)


 def main():
-    """ Runs data processing scripts to turn raw data from (../raw) into
+    """ Retrieves data and runs processing scripts to turn raw data from (../raw) into
         cleaned data ready to be analyzed (saved in ../processed).
     """
-    raw_data = pd.read_csv(DATA_LINK)
-    dump_data(raw_data, 'data/raw/titanic.csv')
+    s = requests.get(DATA_LINK).content
+    raw_data = pd.read_csv(io.StringIO(s.decode('utf-8')))
+
+    dump_data(raw_data, 'data/raw/transfusion_data_raw.csv')
     processed_data = massage_data(raw_data)
-    dump_data(processed_data, 'data/processed/titanic.csv')
+    dump_data(processed_data, 'data/processed/transfusion_data.csv')


 if __name__ == '__main__':
-    main()
+    main()
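Note: the new massage_data() keeps the raw columns and only derives coarser time features via integer truncation. A minimal sketch of what the added lines compute, using two made-up rows under the dataset's published UCI column names (values are illustrative, not taken from the real data):

import pandas as pd

sample = pd.DataFrame({
    'Recency (months)': [2, 23],
    'Frequency (times)': [50, 3],
    'Monetary (c.c. blood)': [12500, 750],
    'Time (months)': [98, 38],
    'whether he/she donated blood in March 2007': [1, 0],
})
sample.rename(index=str, columns={'whether he/she donated blood in March 2007': 'label'}, inplace=True)

# Same derivation as the diff: months divided by 12 (years) or 3 (quarters),
# then truncated to int, so 98 months -> 8 years and 32 quarters.
for x, y in zip(['time_years', 'recency_years'], ['Time (months)', 'Recency (months)']):
    sample[x] = (sample[y] / 12).astype('int')
for x, y in zip(['time_quarters', 'recency_quarters'], ['Time (months)', 'Recency (months)']):
    sample[x] = (sample[y] / 3).astype('int')

print(sample[['time_years', 'recency_years', 'time_quarters', 'recency_quarters']])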
10 changes: 7 additions & 3 deletions {{ cookiecutter.repo_name }}/src/models/predict_model.py
@@ -6,19 +6,23 @@


 def retrieve_model():
-    pickled_model = os.path.join(ROOT, 'models/titanic.model')
+    """retrieve the pickled model object
+    """
+    pickled_model = os.path.join(ROOT, 'models/transfusion.model')
     with open(pickled_model, 'rb') as fin:
         return(pickle.load(fin))


 def main():
+    """ retrieve the model and predict labels. Show prediction and performance
+    """
     deserialized_model = retrieve_model()
     X_test = pd.read_csv(os.path.join(ROOT,
-                         'data/processed/titanic_x_test.csv'))
+                         'data/processed/transfusion_x_test.csv'))
     y_pred = deserialized_model.predict(X_test)

     y_test = pd.read_csv(os.path.join(ROOT,
-                         'data/processed/titanic_y_test.csv'), header=None)
+                         'data/processed/transfusion_y_test.csv'), header=None)
     print(f'The model returned these predictions:\n{y_pred}')

     auc = roc_auc_score(y_test.astype(int), deserialized_model.predict_proba(X_test)[:, 1])
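Note: the AUC line above feeds roc_auc_score the positive-class column of predict_proba. A self-contained sketch of that metric call, with toy numbers rather than the project's data:

from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; column [:, 1] is the model's
# probability of the positive class, which ROC AUC uses to rank examples.
y_true = [0, 0, 1, 1]
positive_scores = [0.1, 0.4, 0.35, 0.8]
print(roc_auc_score(y_true, positive_scores))  # 0.75: 3 of 4 positive/negative pairs ranked correctly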
23 changes: 15 additions & 8 deletions {{ cookiecutter.repo_name }}/src/models/train_model.py
@@ -7,39 +7,46 @@

 ROOT = Path(__file__).resolve().parents[2]

-if '{{ cookiecutter.python_interpreter }}' == 'python3':
+if 'python3' == 'python3':
     PROTOCOL = pickle.DEFAULT_PROTOCOL
 else:
     PROTOCOL = 2


 def fetch_processed(data_path):
+    """
+    fetch the data that was processed in make data
+    """
     data = pd.read_csv(os.path.join(ROOT, data_path))
-    data_y = data.survived
-    data_x = data.drop(['survived', 'name', 'ticket', 'boat',
-                        'body', 'home.dest'], axis=1)
+    data_y = data.label
+    data_x = data.drop(['label'], axis=1)
     # Create training and test sets
     X_train, X_test, y_train, y_test = train_test_split(data_x, data_y,
                                                         test_size=0.2, random_state=0)
     return X_train, X_test, y_train, y_test


 def fit_model(X_train, y_train):
+    """
+    fit a model to the training data
+    """
     model = RandomForestClassifier(n_estimators=100)
     # Fit to the training data
     model.fit(X_train, y_train)
     return model


 def main():
-    x_train, x_test, y_train, y_test = fetch_processed('data/processed/titanic.csv')
+    """ Trains the model on the retrieved data and writes it back to file
+    """
+    x_train, x_test, y_train, y_test = fetch_processed('data/processed/transfusion_data.csv')
     # Train the model
     model = fit_model(x_train, y_train)

     # Paths for storage
-    model_out_dir = os.path.join(ROOT, 'models/titanic.model')
-    x_test_path = os.path.join(ROOT, 'data/processed/titanic_x_test.csv')
-    y_test_path = os.path.join(ROOT, 'data/processed/titanic_y_test.csv')
+    model_out_dir = os.path.join(ROOT, 'models/transfusion.model')
+    x_test_path = os.path.join(ROOT, 'data/processed/transfusion_x_test.csv')
+    y_test_path = os.path.join(ROOT, 'data/processed/transfusion_y_test.csv')

     # Store model and test set for prediction
     with open(model_out_dir, 'wb') as fout:
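Note: the diff is cut off just after the `with open(model_out_dir, 'wb') as fout:` line, so the exact dump call is not shown. A hypothetical round trip following the same pattern; the pickle.dump arguments are an assumption based on the PROTOCOL constant defined above, not the commit's code:

import pickle
from sklearn.ensemble import RandomForestClassifier

PROTOCOL = pickle.DEFAULT_PROTOCOL  # the template falls back to protocol 2 under Python 2

model = RandomForestClassifier(n_estimators=100)
model.fit([[0, 1], [1, 0]], [0, 1])  # toy fit so the estimator is in a usable, picklable state

with open('transfusion.model', 'wb') as fout:  # hypothetical local path
    pickle.dump(model, fout, protocol=PROTOCOL)

with open('transfusion.model', 'rb') as fin:
    restored = pickle.load(fin)
print(restored.predict([[0, 1]]))  # the restored model predicts like the original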
