Merge pull request #6 from ONEcampaign/5-parser-bug

5 parser bug
ONEcampaign · Oct 11, 2024 · 773678c
2 parents 27d3606 + 4613b3b
commit 773678c
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 35 deletions.
diff --git a/src/imf_reader/weo/parser.py b/src/imf_reader/weo/parser.py
@@ -17,11 +17,11 @@
 
 # numeric columns and the type to convert them to
 SDMX_NUMERIC_COLUMNS = {
-    "REF_AREA_CODE": "Int16",
     "OBS_VALUE": "Float64",
-    "SCALE_CODE": "Int16",
-    "LASTACTUALDATE": "Int16",
-    "TIME_PERIOD": "Int16",
+    "REF_AREA_CODE": "Int64",
+    "SCALE_CODE": "Int64",
+    "LASTACTUALDATE": "Int64",
+    "TIME_PERIOD": "Int64",
 }
 
 
@@ -121,16 +121,25 @@ def check_folder(sdmx_folder: ZipFile) -> None:
     @staticmethod
     def clean_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
         """Cleans the numeric columns
+        Replaces non numeric values with null values and converts the columns to numeric and the correct type.
 
-        Replaces "n/a" and "--" with pd.NA and converts the columns to numeric and the correct type.
+        Returns:
+            The DataFrame with the numeric columns cleaned.
 
         """
 
         for column, dtype in SDMX_NUMERIC_COLUMNS.items():
-            df[column] = df[column].replace(["n/a", "--", "NULL", ""], pd.NA)
             df[column] = df[column].str.replace(",", "")  # Remove commas
+            df[column] = pd.to_numeric(
+                df[column], errors="coerce"
+            )  # Convert to numeric
             df[column] = df[column].astype(dtype)
 
+        # set type for the other columns to string
+        for column in df.columns:
+            if column not in SDMX_NUMERIC_COLUMNS.keys():
+                df[column] = df[column].astype("string")
+
         return df
 
     @staticmethod

diff --git a/src/imf_reader/weo/reader.py b/src/imf_reader/weo/reader.py
@@ -83,11 +83,11 @@ def roll_back_version(version: Version) -> Version:
     """
 
     if version[0] == "October":
-        logger.info(f"Rolling back version to April {version[1]}")
+        logger.debug(f"Rolling back version to April {version[1]}")
         return "April", version[1]
 
     elif version[0] == "April":
-        logger.info(f"Rolling back version to October {version[1] - 1}")
+        logger.debug(f"Rolling back version to October {version[1] - 1}")
         return "October", version[1] - 1
 
     else:
@@ -107,9 +107,7 @@ def _fetch(version: Version) -> pd.DataFrame:
 
     folder = SDMXScraper.scrape(*version)  # scrape the data and get the SDMX files
     df = SDMXParser.parse(folder)  # parse the SDMX files into a DataFrame
-    logger.debug(
-        f"Data scraped and parsed successfully for version {version[0]} {version[1]}"
-    )
+    logger.info(f"Data fetched successfully for version: {version[0]} {version[1]}")
     return df
 
 
@@ -143,13 +141,17 @@ def fetch_data(version: Optional[Version] = None) -> pd.DataFrame:
 
     # if version is passed, validate it and fetch the data
     if version is not None:
-        version = validate_version(version)
-        df = _fetch(version)
-        logger.info(f"Data fetched successfully for version {version[0]} {version[1]}")
-        fetch_data.last_version_fetched = (
-            version  # store the version fetched as function attribute
-        )
-        return df
+        try:
+            version = validate_version(version)
+            df = _fetch(version)
+            fetch_data.last_version_fetched = (
+                version  # store the version fetched as function attribute
+            )
+            return df
+        except Exception as e:
+            raise NoDataError(
+                f"Could not fetch data for version: {version[0]} {version[1]}. {str(e)}"
+            )
 
     # if no version is passed, generate the latest version and fetch the data
     latest_version = gen_latest_version()
@@ -158,9 +160,9 @@ def fetch_data(version: Optional[Version] = None) -> pd.DataFrame:
 
     # if no data is found for the expected latest version, roll back once and try again
     except NoDataError:
-        logger.debug(
-            f"No data found for the expected latest version {latest_version[0]} {latest_version[1]}."
-            f" Rolling back version"
+        logger.info(
+            f"No data found for expected latest version: {latest_version[0]} {latest_version[1]}."
+            f" Rolling back version..."
         )
         latest_version = roll_back_version(latest_version)
         return fetch_data(latest_version)
diff --git a/tests/test_weo/test_parser.py b/tests/test_weo/test_parser.py
@@ -222,17 +222,3 @@ def test_clean_numeric_columns(self):
         # Assert that "n/a" and "--" have been replaced with nulls
         for column in result_df.columns:
             assert result_df[column].isnull().any()
-
-        # check error is raised if any other unusual values are present
-        data_df = pd.DataFrame(
-            {
-                "REF_AREA_CODE": ["1", "2", "n/a", "--"],
-                "OBS_VALUE": ["1.1", "2.2", "n/a", "abc"],
-                "SCALE_CODE": ["3", "4", "n/a", "--"],
-                "LASTACTUALDATE": ["2023", "2024", "n/a", "--"],
-                "TIME_PERIOD": ["1980", "1981", "n/a", "--"],
-            }
-        )
-
-        with pytest.raises(ValueError):
-            SDMXParser.clean_numeric_columns(data_df)