Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clase 3 #4

Open
wants to merge 10 commits into
base: clase-2
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions lib/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,28 @@ def load_data(path: Path):
movies = movies[~movies.averageRating.isna()].copy()

return movies


def load_rating_train_dev_test(
        movies: pd.DataFrame, train_max_year=2015, dev_max_year=2017,
        sample_count: int = None, random_state=None):
    """
    Split the movies by ``startYear`` into train / dev / test sets.

    Train gets ``startYear <= train_max_year``, dev gets
    ``train_max_year < startYear <= dev_max_year`` and test gets
    ``startYear > dev_max_year``. Rows whose ``startYear`` is NaN satisfy
    none of these comparisons and therefore end up in no split.

    :param movies: Movies dataframe; must have ``startYear`` and ``averageRating`` columns
    :param train_max_year: cut year for training (inclusive)
    :param dev_max_year: cut year for dev (inclusive); test starts right after it
    :param sample_count: number of rows to sample before splitting (useful for
        testing the code). Ignored when it is None (or 0)
    :param random_state: forwarded to ``DataFrame.sample`` so the sample can be
        reproduced. Ignored when ``sample_count`` is not used
    :return: dict with keys ``X_train``, ``y_train``, ``X_dev``, ``y_dev``,
        ``X_test``, ``y_test``. Each ``X_*`` is a list of row dicts (all
        columns, ``averageRating`` included) and each ``y_*`` is the array of
        ``averageRating`` values for the same rows
    :raises ValueError: if ``train_max_year`` is greater than ``dev_max_year``
    """
    if train_max_year > dev_max_year:
        # Otherwise the years in (dev_max_year, train_max_year] would land in
        # both the train and the test split.
        raise ValueError(
            f'train_max_year ({train_max_year}) must be <= dev_max_year ({dev_max_year})'
        )

    if sample_count:
        movies = movies.sample(sample_count, random_state=random_state)

    year = movies.startYear
    splits = {
        'train': movies[year <= train_max_year],
        'dev': movies[(year > train_max_year) & (year <= dev_max_year)],
        'test': movies[year > dev_max_year],
    }

    res = {}
    for split_name, split_df in splits.items():
        res[f'X_{split_name}'] = split_df.to_dict(orient='records')
        res[f'y_{split_name}'] = split_df.averageRating.values
    return res
28 changes: 28 additions & 0 deletions lib/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from . import transformers
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer


def get_features_pipe(
        use_years: bool, use_genre: bool,
        use_director: bool, director_kws: dict = None, post_processing=None):
    """
    Build the feature-extraction pipeline.

    Every enabled feature group becomes its own sub-pipeline (a transformer
    from ``transformers`` followed by a dense ``DictVectorizer``) and the
    groups are combined with ``make_union``.

    :param use_years: include the ``transformers.YearsAgo`` group
    :param use_genre: include the ``transformers.GenreDummies`` group
    :param use_director: include the ``transformers.DirectorFeatures`` group
    :param director_kws: keyword arguments for ``transformers.DirectorFeatures``.
        Ignored unless ``use_director`` is set; None means no extra arguments
    :param post_processing: optional final step appended after the union
    :return: the union of the enabled groups, wrapped in a pipeline with
        ``post_processing`` when one is given
    :raises ValueError: if every ``use_*`` flag is false
    """
    steps = []
    if use_years:
        steps.append(make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)))

    if use_genre:
        steps.append(make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)))

    if use_director:
        director_kws = director_kws or {}
        # **director_kws unpacks the dict into keyword arguments
        steps.append(make_pipeline(transformers.DirectorFeatures(**director_kws), DictVectorizer(sparse=False)))

    if not steps:
        # Fail early with a clear message instead of handing make_union an
        # empty list of transformers.
        raise ValueError('At least one of use_years, use_genre or use_director must be enabled')

    res = make_union(*steps)
    if post_processing:
        res = make_pipeline(res, post_processing)
    return res


def get_model_pipe(features_pipe, model):
    """
    Chain the feature pipeline and the model into a single pipeline.

    :param features_pipe: first stage of the pipeline
    :param model: final stage of the pipeline
    :return: the result of ``make_pipeline(features_pipe, model)``
    """
    stages = (features_pipe, model)
    return make_pipeline(*stages)
2 changes: 1 addition & 1 deletion lib/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .director_features import DirectorFeatures
from .director_features import CrewFeatures, DirectorFeatures
from .genre_dummies import GenreDummies
from .years_ago import YearsAgo
44 changes: 24 additions & 20 deletions lib/transformers/director_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,55 @@
import pandas as pd


class CrewFeatures(BaseEstimator, TransformerMixin):
    """
    Per-crew-member statistics used as features.

    ``fit`` groups the training records by ``field`` and computes, for every
    group, the number of films plus the min / mean / max of ``averageRating``
    and ``numVotes``. ``transform`` replaces each record with the statistics of
    its ``field`` value, or with a default row when that value has fewer than
    ``min_cnt_movies`` films in the training data (or was never seen).

    :param field: key of the records to group by (e.g. ``'director'``)
    :param min_cnt_movies: minimum number of films a group needs in order to
        get its own statistics instead of the default ones
    """

    def __init__(self, field, min_cnt_movies=2):
        self.field = field
        self.min_cnt_movies = min_cnt_movies

    def fit(self, X, y=None):
        """
        Compute the per-group statistics from ``X``.

        :param X: list of record dicts with the keys ``field``, ``tconst``,
            ``averageRating`` and ``numVotes``
        :param y: unused; optional so ``fit_transform(X)`` works without a target
        :return: self
        """
        # Not the most elegant way, but it is convenient: go back to a
        # DataFrame and compute the features per group.
        stats = (
            pd.DataFrame(X)
            .groupby(self.field)
            .agg(
                n_films=('tconst', 'count'),
                min_rating=('averageRating', 'min'),
                avg_rating=('averageRating', 'mean'),
                max_rating=('averageRating', 'max'),
                min_votes=('numVotes', 'min'),
                avg_votes=('numVotes', 'mean'),
                max_votes=('numVotes', 'max'),
            )
        )

        # Keep the full statistics table
        self.directors_stats_ = stats

        # Lookup dict with the statistics of the frequent groups
        self.directors_stats_lk_ = (
            stats[stats.n_films >= self.min_cnt_movies].to_dict(orient='index')
        )

        # Default value for the groups we consider to have too little data:
        # the mean over the rare groups. When there are no rare groups (or the
        # threshold does not exclude anyone) use the overall mean, because the
        # mean of an empty frame would be all NaN.
        rare = stats[stats.n_films < self.min_cnt_movies]
        if self.min_cnt_movies > 1 and not rare.empty:
            self.default_ = rare.mean(axis=0).to_dict()
        else:
            self.default_ = stats.mean(axis=0).to_dict()
        return self

    def transform(self, X):
        """
        Map every record to the statistics dict of its ``field`` value.

        :param X: iterable of record dicts containing the key ``field``
        :return: list with one statistics dict per record; records whose value
            is not in the lookup get ``default_``
        """
        lookup = self.directors_stats_lk_
        default = self.default_
        return [lookup.get(e[self.field], default) for e in X]


# Kept for backward compatibility with the course material
class DirectorFeatures(CrewFeatures):
    """``CrewFeatures`` with ``field`` fixed to ``'director'``."""

    def __init__(self, min_cnt_movies=2):
        super().__init__('director', min_cnt_movies=min_cnt_movies)
34 changes: 16 additions & 18 deletions notebooks/clase-1/01_get_the_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -87,38 +87,36 @@
"outputs": [],
"source": [
"# descargamos los datos\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/aclImdb_v1.tar.gz -O $DATA_HOME/aclImdb_v1.tar.gz\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/movie_gross.csv -O $DATA_HOME/movie_gross.csv\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/name.basics.tsv.gz -O $DATA_HOME/name.basics.tsv.gz\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/title.akas.tsv.gz -O $DATA_HOME/title.akas.tsv.gz\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/title.basics.tsv.gz -O $DATA_HOME/title.basics.tsv.gz\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/title.crew.tsv.gz -O $DATA_HOME/title.crew.tsv.gz\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/title.principals.tsv.gz -O $DATA_HOME/title.principals.tsv.gz\n",
"!wget https://machine-learning-practico.s3.amazonaws.com/title.ratings.tsv.gz -O $DATA_HOME/title.ratings.tsv.gz"
"!wget https://datasets.imdbws.com/name.basics.tsv.gz -O $DATA_HOME/name.basics.tsv.gz\n",
"!wget https://datasets.imdbws.com/title.akas.tsv.gz -O $DATA_HOME/title.akas.tsv.gz\n",
"!wget https://datasets.imdbws.com/title.basics.tsv.gz -O $DATA_HOME/title.basics.tsv.gz\n",
"!wget https://datasets.imdbws.com/title.crew.tsv.gz -O $DATA_HOME/title.crew.tsv.gz\n",
"!wget https://datasets.imdbws.com/title.principals.tsv.gz -O $DATA_HOME/title.principals.tsv.gz\n",
"!wget https://datasets.imdbws.com/title.ratings.tsv.gz -O $DATA_HOME/title.ratings.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Wl0qAm-hboiQ"
},
"metadata": {},
"outputs": [],
"source": [
"# descomprimimos \n",
"!ls $DATA_HOME/*.gz | grep -v aclImdb_v1.tar.gz | xargs -I% gunzip \"%\""
"# PARA DESCARGAR movie_gross.csv bajalo a mano de acá\n",
"# https://drive.google.com/file/d/1Aav7imwH7s1U2W3Olwgyd1tzUcYGUtcu/view?usp=sharing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Wl0qAm-hboiQ"
},
"outputs": [],
"source": [
"# Opcional, no lo usamos en la materia, toma mucho tiempo en descomprimir\n",
"!tar -C $DATA_HOME -vxf $DATA_HOME/aclImdb_v1.tar.gz"
"# descomprimimos \n",
"!ls $DATA_HOME/*.gz | xargs -I% gunzip \"%\""
]
}
],
Expand Down
Loading