Skip to content

Commit

Permalink
Merge pull request #6 from ONEcampaign/5-parser-bug
Browse files Browse the repository at this point in the history
5 parser bug
  • Loading branch information
lpicci96 authored Oct 11, 2024
2 parents 27d3606 + 4613b3b commit 773678c
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 35 deletions.
21 changes: 15 additions & 6 deletions src/imf_reader/weo/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@

# Numeric columns and the pandas dtype each one is converted to.
# The capitalised names ("Int64", "Float64") are pandas' nullable extension
# dtypes, so missing observations can be kept as <NA> after the cast.
# Every key is listed exactly once: a repeated key in a dict literal is not an
# error in Python, the later value silently replaces the earlier one.
SDMX_NUMERIC_COLUMNS = {
    "REF_AREA_CODE": "Int64",
    "OBS_VALUE": "Float64",
    "SCALE_CODE": "Int64",
    "LASTACTUALDATE": "Int64",
    "TIME_PERIOD": "Int64",
}


Expand Down Expand Up @@ -121,16 +121,25 @@ def check_folder(sdmx_folder: ZipFile) -> None:
@staticmethod
def clean_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the numeric columns and cast every column to its final dtype.

    For each column named in ``SDMX_NUMERIC_COLUMNS``:

    1. the placeholders ``"n/a"``, ``"--"``, ``"NULL"`` and ``""`` are
       replaced with ``pd.NA``;
    2. commas are removed from string values (e.g. ``"1,234"`` -> ``"1234"``);
    3. the values are parsed with ``pd.to_numeric(errors="coerce")``, so any
       remaining non-numeric value becomes null instead of raising;
    4. the column is cast to the dtype configured for it.

    Every column that is not in ``SDMX_NUMERIC_COLUMNS`` is cast to the
    pandas ``"string"`` dtype.

    The DataFrame is modified in place and is also returned.

    Args:
        df: The DataFrame to clean. It must contain every column named in
            ``SDMX_NUMERIC_COLUMNS``.

    Returns:
        The DataFrame with the numeric columns cleaned.
    """

    def _strip_commas(value):
        # Only genuine strings can carry a comma separator. Numbers and nulls
        # are passed through untouched. The ``.str`` accessor would instead
        # raise on a non-string column and turn non-string elements of an
        # object column into NaN, silently dropping valid values.
        return value.replace(",", "") if isinstance(value, str) else value

    for column, dtype in SDMX_NUMERIC_COLUMNS.items():
        values = df[column].replace(["n/a", "--", "NULL", ""], pd.NA)
        values = values.map(_strip_commas)  # Remove commas
        values = pd.to_numeric(values, errors="coerce")  # Convert to numeric
        df[column] = values.astype(dtype)

    # set type for the other columns to string
    for column in df.columns:
        if column not in SDMX_NUMERIC_COLUMNS:
            df[column] = df[column].astype("string")

    return df

@staticmethod
Expand Down
32 changes: 17 additions & 15 deletions src/imf_reader/weo/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ def roll_back_version(version: Version) -> Version:
"""

if version[0] == "October":
logger.info(f"Rolling back version to April {version[1]}")
logger.debug(f"Rolling back version to April {version[1]}")
return "April", version[1]

elif version[0] == "April":
logger.info(f"Rolling back version to October {version[1] - 1}")
logger.debug(f"Rolling back version to October {version[1] - 1}")
return "October", version[1] - 1

else:
Expand All @@ -107,9 +107,7 @@ def _fetch(version: Version) -> pd.DataFrame:

folder = SDMXScraper.scrape(*version) # scrape the data and get the SDMX files
df = SDMXParser.parse(folder) # parse the SDMX files into a DataFrame
logger.debug(
f"Data scraped and parsed successfully for version {version[0]} {version[1]}"
)
logger.info(f"Data fetched successfully for version: {version[0]} {version[1]}")
return df


Expand Down Expand Up @@ -143,13 +141,17 @@ def fetch_data(version: Optional[Version] = None) -> pd.DataFrame:

# if version is passed, validate it and fetch the data
if version is not None:
version = validate_version(version)
df = _fetch(version)
logger.info(f"Data fetched successfully for version {version[0]} {version[1]}")
fetch_data.last_version_fetched = (
version # store the version fetched as function attribute
)
return df
try:
version = validate_version(version)
df = _fetch(version)
fetch_data.last_version_fetched = (
version # store the version fetched as function attribute
)
return df
except Exception as e:
raise NoDataError(
f"Could not fetch data for version: {version[0]} {version[1]}. {str(e)}"
)

# if no version is passed, generate the latest version and fetch the data
latest_version = gen_latest_version()
Expand All @@ -158,9 +160,9 @@ def fetch_data(version: Optional[Version] = None) -> pd.DataFrame:

# if no data is found for the expected latest version, roll back once and try again
except NoDataError:
logger.debug(
f"No data found for the expected latest version {latest_version[0]} {latest_version[1]}."
f" Rolling back version"
logger.info(
f"No data found for expected latest version: {latest_version[0]} {latest_version[1]}."
f" Rolling back version..."
)
latest_version = roll_back_version(latest_version)
return fetch_data(latest_version)
14 changes: 0 additions & 14 deletions tests/test_weo/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,17 +222,3 @@ def test_clean_numeric_columns(self):
# Assert that "n/a" and "--" have been replaced with nulls
for column in result_df.columns:
assert result_df[column].isnull().any()

# check error is raised if any other unusual values are present
data_df = pd.DataFrame(
{
"REF_AREA_CODE": ["1", "2", "n/a", "--"],
"OBS_VALUE": ["1.1", "2.2", "n/a", "abc"],
"SCALE_CODE": ["3", "4", "n/a", "--"],
"LASTACTUALDATE": ["2023", "2024", "n/a", "--"],
"TIME_PERIOD": ["1980", "1981", "n/a", "--"],
}
)

with pytest.raises(ValueError):
SDMXParser.clean_numeric_columns(data_df)

0 comments on commit 773678c

Please sign in to comment.