Skip to content

Commit

Permalink
Assignment 2 (#2)
Browse files: browse the repository at this point in the history
  • Loading branch information
andrebastosdias authored Dec 10, 2024
1 parent ccb1ad6 commit 7df162a
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 9 deletions.
26 changes: 19 additions & 7 deletions assignments/life_expectancy/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,19 @@
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')


def clean_data(country: str):
'''Clean the raw data and save it to a CSV file.'''
def load_data() -> pd.DataFrame:
    '''Read the raw life-expectancy TSV from the data directory.

    The file ``eu_life_expectancy_raw.tsv`` is parsed as tab-separated
    values and returned whole; no country filtering is done here.
    '''
    raw_path = os.path.join(DATA_DIR, 'eu_life_expectancy_raw.tsv')
    return pd.read_csv(raw_path, sep='\t')


def save_data(df: pd.DataFrame) -> None:
    '''Write the frame to ``pt_life_expectancy.csv`` in the data directory.

    The row index is not written. The output file name is fixed, whatever
    region the frame contains.
    '''
    output_path = os.path.join(DATA_DIR, 'pt_life_expectancy.csv')
    df.to_csv(output_path, index=False)


def clean_data(df: pd.DataFrame, country: str) -> pd.DataFrame:
'''Clean the raw data.'''
df_split = df['unit,sex,age,geo\\time'].str.split(',', expand=True)
df_split.columns = ['unit', 'sex', 'age', 'region']
df = pd.concat([df_split, df.drop(columns=['unit,sex,age,geo\\time'])], axis=1)
Expand All @@ -22,16 +31,19 @@ def clean_data(country: str):

df = df[df['region'] == country]

df.to_csv(os.path.join(DATA_DIR, 'pt_life_expectancy.csv'), index=False)
return df


print(df.info())
print(df.describe(include='all'))
print(df.head())
def main(country: str) -> None:
    '''Run the full pipeline: load the raw data, clean it for `country`, save it.'''
    raw = load_data()
    cleaned = clean_data(raw, country)
    save_data(cleaned)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--country', type=str, default='PT', help='Country code to filter the data.')
args = parser.parse_args()

clean_data(args.country)
main(args.country)
4 changes: 2 additions & 2 deletions assignments/life_expectancy/tests/test_cleaning.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""Tests for the cleaning module"""
import pandas as pd

from life_expectancy.cleaning import clean_data
from life_expectancy.cleaning import main
from . import OUTPUT_DIR


def test_clean_data(pt_life_expectancy_expected):
"""Run the `clean_data` function and compare the output to the expected output"""
clean_data("PT")
main("PT")
pt_life_expectancy_actual = pd.read_csv(
OUTPUT_DIR / "pt_life_expectancy.csv"
)
Expand Down

0 comments on commit 7df162a

Please sign in to comment.