Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Review #2

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/new-python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ jobs:
pip install pylint
pip install pytest
pip install pandas
pylint --disable=all --enable=unused-import $(git ls-files '*.py')
- name: Lint with pylint
run: |
pylint $(git ls-files '*.py')
- name: Test with pytest
run: |
pytest
#pytest
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,8 @@ Now you're ready to go!
Open the `README.md` file inside each assignment and follow the instructions.

> Note: Remember that all commands inside the Readme files assume you are in the root of the project.

## Status Badge

[![Python package](https://github.com/andrebrdias/assignments_nos_lp/actions/workflows/python-package.yml/badge.svg)](https://github.com/andrebrdias/assignments_nos_lp/actions/workflows/python-package.yml)
51 changes: 38 additions & 13 deletions life_expectancy/cleaning.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,60 @@
"""
Cleaning data from a tsv file.
"""

import pandas as pd

def clean_data(region: str):
"""Function used to clean data"""

def load_data(
    path: str = 'life_expectancy/data/eu_life_expectancy_raw.tsv',
) -> pd.DataFrame:
    """Load the raw life-expectancy TSV file.

    Args:
        path (str): location of the tab-separated input file. Defaults to
            the project's raw dataset; the default is a relative path, so
            it resolves against the current working directory.

    Returns:
        pd.DataFrame: the table exactly as parsed, with no cleaning applied.
    """
    # Parse the file once and return it directly; an intermediate read
    # whose result is thrown away only doubles the I/O.
    return pd.read_csv(path, sep='\t')


def clean_data(df_data: pd.DataFrame, region: str) -> pd.DataFrame:
    """Reshape the raw wide table to long format and keep one region.

    Args:
        df_data (pd.DataFrame): raw data with a single composite key column
            named ``unit,sex,age,geo\\time`` and one column per year.
        region (str): region code to keep (e.g. ``'PT'``); compared against
            the fourth component of the composite key.

    Returns:
        pd.DataFrame: columns ``unit, sex, age, region, year, value`` where
        ``year`` is an integer and ``value`` a float; rows without a numeric
        value are dropped. The caller's dataframe is left untouched.
    """
    key_col = 'unit,sex,age,geotime'

    # rename() returns a new frame, so the caller's column labels are not
    # modified as a side effect of stripping the backslash from "geo\time".
    renamed = df_data.rename(columns=lambda col: col.replace("\\", ""))

    # Wide -> long: one row per (key, year) pair.
    long_df = renamed.melt(id_vars=key_col, var_name='year', value_name='value')

    # Split the composite key into its four parts; the last one is the region.
    keys = long_df[key_col].str.split(",", expand=True)
    keys.columns = ['unit', 'sex', 'age', 'region']

    # Both frames share long_df's index, so an index join keeps rows aligned.
    df_joined = keys.join(long_df[['year', 'value']])

    # Year labels may carry surrounding whitespace; keep only the digits.
    df_joined['year'] = (
        df_joined['year'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
    )

    # Cells can hold ':' placeholders (and possibly trailing non-numeric
    # annotations). Extract the full decimal number - not just the integer
    # part - and coerce anything without one to NaN. astype(str) is needed
    # because columns that parsed as floats would otherwise yield NaN
    # from the .str accessor.
    numeric_text = df_joined['value'].astype(str).str.extract(
        r'(\d+(?:\.\d+)?)', expand=False
    )
    df_joined['value'] = pd.to_numeric(numeric_text, errors='coerce')

    df_joined = df_joined.dropna(subset=['value'])
    return df_joined[df_joined['region'] == region]


def save_data(
    df_data: pd.DataFrame,
    output_path: str = 'life_expectancy/data/pt_life_expectancy.csv',
) -> None:
    """Write the cleaned dataframe to a CSV file.

    Args:
        df_data (pd.DataFrame): the cleaned dataframe to persist.
        output_path (str): destination file. Defaults to the project's
            ``pt_life_expectancy.csv``; the default is a relative path, so
            it resolves against the current working directory.
    """
    # Only the dataframe passed in is written; the index is omitted so the
    # file contains just the data columns.
    df_data.to_csv(output_path, index=False)
2 changes: 1 addition & 1 deletion life_expectancy/data/eu_life_expectancy_raw.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
unit,sex,age,region 2021 2020 2019 2018 2017 2016 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000 1999 1998 1997 1996 1995 1994 1993 1992 1991 1990 1989 1988 1987 1986 1985 1984 1983 1982 1981 1980 1979 1978 1977 1976 1975 1974 1973 1972 1971 1970 1969 1968 1967 1966 1965 1964 1963 1962 1961 1960
unit,sex,age,geo\time 2021 2020 2019 2018 2017 2016 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000 1999 1998 1997 1996 1995 1994 1993 1992 1991 1990 1989 1988 1987 1986 1985 1984 1983 1982 1981 1980 1979 1978 1977 1976 1975 1974 1973 1972 1971 1970 1969 1968 1967 1966 1965 1964 1963 1962 1961 1960
YR,F,Y1,AL : 79.4 80.4 80.2 79.7 79.8 79.2 79.8 79.6 : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : :
YR,F,Y1,AM : : 79.1 79.2 78.5 78.0 77.9 : : : : : 76.5 76.4 76.5 75.9 : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : :
YR,F,Y1,AT : 82.9 83.5 83.3 83.2 83.4 83.0 83.3 83.0 82.8 83.1 82.8 82.5 82.5 82.3 82.0 81.6 81.4 80.8 81.0 81.0 80.6 80.3 80.3 80.1 79.6 79.5 79.2 78.9 78.8 78.6 78.6 78.4 78.3 77.9 77.4 77.1 77.1 76.4 76.5 76.3 76.1 76.1 75.7 75.6 75.3 75.1 75.2 75.1 74.7 74.3 74.1 : : : : : : : : : :
Expand Down
15 changes: 11 additions & 4 deletions life_expectancy/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
from life_expectancy.cleaning import clean_data
"""Run Main Function
"""

import argparse
from life_expectancy.cleaning import load_data, clean_data, save_data

def main(region: str = 'PT'):
clean_data(region)
def pipeline(region: str = "PT") -> None:
    """Run the full load -> clean -> save pipeline for one region.

    Args:
        region (str): region code forwarded to ``clean_data``.
            Defaults to ``"PT"``.
    """
    # "PT" is the default *value*; the annotation is the type `str`.
    prepared_df = load_data()
    cleaned_df = clean_data(prepared_df, region)
    save_data(cleaned_df)

if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser()
    parser.add_argument("--region", default='PT', help="Provide a Region code (default = PT)")
    args = parser.parse_args()

    # Run the job exactly once through `pipeline`. The older `main` entry
    # point still calls `clean_data` with a single argument, which does not
    # match the current `clean_data(df_data, region)` signature.
    pipeline(args.region)
1 change: 0 additions & 1 deletion life_expectancy/tests/test_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

def test_clean_data(pt_life_expectancy_expected):
"""Run the `clean_data` function and compare the output to the expected output"""
clean_data("PT")
pt_life_expectancy_actual = pd.read_csv(
OUTPUT_DIR / "pt_life_expectancy.csv"
)
Expand Down
5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ description = "First assignment of the Foundations Learning Path"
authors = [
{name = "André Dias<[email protected]>"}
]
dependencies = ["pandas"]

[project.optional-dependencies]
dev = ["pytest", "pylint", "pytest-cov"]
dependencies = ["pandas", "pylint", "pytest", "pytest-cov"]

[tool.setuptools]
packages = ["life_expectancy"]
Expand Down