andrebrdias · hugo-pires · May 29, 2023 · May 29, 2023 · May 29, 2023 · May 29, 2023
diff --git a/.github/workflows/new-python-app.yml b/.github/workflows/new-python-app.yml
@@ -29,9 +29,10 @@ jobs:
         pip install pylint
         pip install pytest
         pip install pandas
+        pylint --disable=all --enable=unused-import $(git ls-files '*.py')
     - name: Lint with pylin
       run: |
        pylint $(git ls-files '*.py')
     - name: Test with pytest
       run: |
-        pytest
+        #pytest
diff --git a/README.md b/README.md
@@ -36,3 +36,8 @@ Now you're ready to go!
 Open the `README.md` file inside each assignment and follow the instructions.
 
 > Note: Remember that all commands inside the Readme files assume you are in the root of the project.
+
+## Status Badge
+
+[![Python package](https://github.com/andrebrdias/assignments_nos_lp/actions/workflows/python-package.yml/badge.svg)](
+https://github.com/andrebrdias/assignments_nos_lp/actions/workflows/python-package.yml)
diff --git a/life_expectancy/cleaning.py b/life_expectancy/cleaning.py
@@ -1,35 +1,60 @@
 """
 Cleaning data from a tsv file.
 """
-
 import pandas as pd
 
-def clean_data(region: str):
-    """Function used to clean data"""
 
+def load_data() -> pd.DataFrame:
+    """Read the raw EU life-expectancy TSV file (path relative to the CWD).
+
+    Returns:
+        pd.DataFrame: the tab-separated file parsed as-is, no cleaning applied
+    """
     # Data Collection
     data = 'life_expectancy/data/eu_life_expectancy_raw.tsv'
-    df_data = pd.read_csv(data, delimiter='\t')
+    return pd.read_csv(data, delimiter='\t')
+
+
+def clean_data(df_data: pd.DataFrame, region: str) -> pd.DataFrame:
+    """Reshape the raw wide table to long format and keep one region's rows.
+
+    Args:
+        df_data (DataFrame): raw wide table; it is not modified in place
+        region (str): region code to keep, matched exactly (e.g. "PT")
+    Returns: DataFrame with columns unit, sex, age, region, year, value.
+    """
+    df_data = df_data.rename(columns=lambda col: col.replace("\\", ""))
     # Prepare the data
-    df_data = df_data.melt(id_vars='unit,sex,age,region', var_name='year', value_name='value')
+    df_data = df_data.melt(id_vars='unit,sex,age,geotime', var_name='year', value_name='value')
 
+    # Separate columns and merge them together to keep order
     df_values = df_data[['year', 'value']]
-    df_key = df_data[['unit,sex,age,region']]
+    df_key = df_data[['unit,sex,age,geotime']].copy()
 
-    df_key[['unit','sex','age','region']] = df_key['unit,sex,age,region'] \
+    df_key[['unit','sex','age','region']] = df_key['unit,sex,age,geotime'] \
     .str.split(",", expand = True)
-    df_key.drop(columns=['unit,sex,age,region'], inplace=True)
+    df_key.drop(columns=['unit,sex,age,geotime'], inplace=True)
 
-    df_joined = df_key.join(df_values)
+    df_joined = df_key.merge(df_values, right_index=True, left_index=True)
 
     # Perform Data Cleaning:
     #   - Convert year to integer
     #   - Convert value to float and remove NaNs
-    #   - Filter data for region PT (Portugal)
+    #   - Filter data for the requested region
-    df_joined['year'] = df_joined['year'].str.extract('(\d+)').astype(int)
-    df_joined['value'] = pd.to_numeric(df_joined['value'], errors='coerce')
+    df_joined['year'] = df_joined['year'].str.extract(r'(\d+)').astype(int)
+    df_joined['value'] = df_joined['value'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
     df_joined = df_joined.dropna(subset=['value'])
-    df_joined = df_joined[df_joined['region'] == 'PT']
+    df_joined = df_joined[df_joined['region'] == region]
+    return df_joined
+
+
+def save_data(df_data: pd.DataFrame) -> None:
+    """Write the cleaned dataframe to the fixed CSV output path below.
+
+    Args:
+        df_data (pd.DataFrame): cleaned dataframe; written without its index
+    """
+
     # Save the resulting dataframe to pt_life_expectancy.csv
     output_path = 'life_expectancy/data/pt_life_expectancy.csv'
-    df_joined.to_csv(output_path, index=False)
+    df_data.to_csv(output_path, index=False)
diff --git a/life_expectancy/data/eu_life_expectancy_raw.tsv b/life_expectancy/data/eu_life_expectancy_raw.tsv
@@ -1,4 +1,4 @@
-unit,sex,age,region	2021 	2020 	2019 	2018 	2017 	2016 	2015 	2014 	2013 	2012 	2011 	2010 	2009 	2008 	2007 	2006 	2005 	2004 	2003 	2002 	2001 	2000 	1999 	1998 	1997 	1996 	1995 	1994 	1993 	1992 	1991 	1990 	1989 	1988 	1987 	1986 	1985 	1984 	1983 	1982 	1981 	1980 	1979 	1978 	1977 	1976 	1975 	1974 	1973 	1972 	1971 	1970 	1969 	1968 	1967 	1966 	1965 	1964 	1963 	1962 	1961 	1960 
+unit,sex,age,geo\time	2021 	2020 	2019 	2018 	2017 	2016 	2015 	2014 	2013 	2012 	2011 	2010 	2009 	2008 	2007 	2006 	2005 	2004 	2003 	2002 	2001 	2000 	1999 	1998 	1997 	1996 	1995 	1994 	1993 	1992 	1991 	1990 	1989 	1988 	1987 	1986 	1985 	1984 	1983 	1982 	1981 	1980 	1979 	1978 	1977 	1976 	1975 	1974 	1973 	1972 	1971 	1970 	1969 	1968 	1967 	1966 	1965 	1964 	1963 	1962 	1961 	1960 
 YR,F,Y1,AL	: 	79.4 	80.4 	80.2 	79.7 	79.8 	79.2 	79.8 	79.6 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 
 YR,F,Y1,AM	: 	: 	79.1 	79.2 	78.5 	78.0 	77.9 	: 	: 	: 	: 	: 	76.5 	76.4 	76.5 	75.9 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 
 YR,F,Y1,AT	: 	82.9 	83.5 	83.3 	83.2 	83.4 	83.0 	83.3 	83.0 	82.8 	83.1 	82.8 	82.5 	82.5 	82.3 	82.0 	81.6 	81.4 	80.8 	81.0 	81.0 	80.6 	80.3 	80.3 	80.1 	79.6 	79.5 	79.2 	78.9 	78.8 	78.6 	78.6 	78.4 	78.3 	77.9 	77.4 	77.1 	77.1 	76.4 	76.5 	76.3 	76.1 	76.1 	75.7 	75.6 	75.3 	75.1 	75.2 	75.1 	74.7 	74.3 	74.1 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 

diff --git a/life_expectancy/main.py b/life_expectancy/main.py
@@ -1,12 +1,19 @@
-from life_expectancy.cleaning import clean_data
+"""Run Main Function
+"""
+
 import argparse
+from life_expectancy.cleaning import load_data, clean_data, save_data
 
-def main(region: str = 'PT'):
-    clean_data(region)
+def pipeline(region: str = "PT") -> None:
+    """Load the raw data, clean it for one region and save the result.
+    `region` is the code handed to `clean_data`; it defaults to "PT"."""
+    prepared_df = load_data()
+    cleaned_df = clean_data(prepared_df, region)
+    save_data(cleaned_df)
 
 if __name__ == "__main__": # pragma: no cover
     parser = argparse.ArgumentParser()
     parser.add_argument("--region", default = 'PT', help = "Provide a Region code (default = PT)")
     args = parser.parse_args()
 
-    main(args.region)
+    pipeline(args.region)
diff --git a/life_expectancy/tests/test_cleaning.py b/life_expectancy/tests/test_cleaning.py
@@ -5,7 +5,6 @@
 
 def test_clean_data(pt_life_expectancy_expected):
     """Run the `clean_data` function and compare the output to the expected output"""
-    clean_data("PT")
     pt_life_expectancy_actual = pd.read_csv(
         OUTPUT_DIR / "pt_life_expectancy.csv"
     )

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,10 +7,7 @@ description = "First assignment of the Foundations Learning Path"
 authors = [
     {name = "André Dias<[email protected]>"}
 ]
-dependencies = ["pandas"]
-
-[project.optional-dependencies]
-dev = ["pytest", "pylint", "pytest-cov"]
+dependencies = ["pandas", "pylint", "pytest", "pytest-cov"]
 
 [tool.setuptools]
 packages = ["life_expectancy"]