Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

5 parser bug #6

Merged
merged 6 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions src/imf_reader/weo/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@

# Numeric columns mapped to the pandas dtype each one is converted to.
# The capitalised dtype names select pandas' nullable extension dtypes, so
# values that were replaced with nulls survive the final cast.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overridden by its last occurrence, which hides the value actually in effect.
SDMX_NUMERIC_COLUMNS = {
    "REF_AREA_CODE": "Int64",
    "OBS_VALUE": "Float64",
    "SCALE_CODE": "Int64",
    "LASTACTUALDATE": "Int64",
    "TIME_PERIOD": "Int64",
}


Expand Down Expand Up @@ -121,16 +121,25 @@ def check_folder(sdmx_folder: ZipFile) -> None:
@staticmethod
def clean_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the numeric columns and set the dtype of every column.

    For each column named in ``SDMX_NUMERIC_COLUMNS`` the placeholder values
    "n/a", "--", "NULL" and "" are replaced with ``pd.NA``, commas are
    removed, any value that still cannot be parsed as a number is coerced to
    null, and the column is cast to its configured dtype. Every other column
    is cast to the pandas "string" dtype.

    The DataFrame is modified in place and is also returned.

    Args:
        df: The DataFrame to clean. It must contain every column named in
            ``SDMX_NUMERIC_COLUMNS``.

    Returns:
        The DataFrame with the numeric columns cleaned.

    Raises:
        KeyError: If a column named in ``SDMX_NUMERIC_COLUMNS`` is missing.
    """

    for column, dtype in SDMX_NUMERIC_COLUMNS.items():
        values = df[column].replace(["n/a", "--", "NULL", ""], pd.NA)

        # The .str accessor only exists for string-like data; a column that
        # is already numeric has no commas to strip and would raise here.
        if not pd.api.types.is_numeric_dtype(values):
            # regex=False: the comma is a literal, and the default value of
            # `regex` differs between pandas versions.
            values = values.str.replace(",", "", regex=False)

        # errors="coerce" turns anything still unparseable into a null
        # instead of raising.
        values = pd.to_numeric(values, errors="coerce")
        df[column] = values.astype(dtype)

    # set type for the other columns to string
    for column in df.columns:
        if column not in SDMX_NUMERIC_COLUMNS:
            df[column] = df[column].astype("string")

    return df

@staticmethod
Expand Down
32 changes: 17 additions & 15 deletions src/imf_reader/weo/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ def roll_back_version(version: Version) -> Version:
"""

if version[0] == "October":
logger.info(f"Rolling back version to April {version[1]}")
logger.debug(f"Rolling back version to April {version[1]}")
return "April", version[1]

elif version[0] == "April":
logger.info(f"Rolling back version to October {version[1] - 1}")
logger.debug(f"Rolling back version to October {version[1] - 1}")
return "October", version[1] - 1

else:
Expand All @@ -107,9 +107,7 @@ def _fetch(version: Version) -> pd.DataFrame:

folder = SDMXScraper.scrape(*version) # scrape the data and get the SDMX files
df = SDMXParser.parse(folder) # parse the SDMX files into a DataFrame
logger.debug(
f"Data scraped and parsed successfully for version {version[0]} {version[1]}"
)
logger.info(f"Data fetched successfully for version: {version[0]} {version[1]}")
return df


Expand Down Expand Up @@ -143,13 +141,17 @@ def fetch_data(version: Optional[Version] = None) -> pd.DataFrame:

# if version is passed, validate it and fetch the data
if version is not None:
version = validate_version(version)
df = _fetch(version)
logger.info(f"Data fetched successfully for version {version[0]} {version[1]}")
fetch_data.last_version_fetched = (
version # store the version fetched as function attribute
)
return df
try:
version = validate_version(version)
df = _fetch(version)
fetch_data.last_version_fetched = (
version # store the version fetched as function attribute
)
return df
except Exception as e:
raise NoDataError(
f"Could not fetch data for version: {version[0]} {version[1]}. {str(e)}"
)

# if no version is passed, generate the latest version and fetch the data
latest_version = gen_latest_version()
Expand All @@ -158,9 +160,9 @@ def fetch_data(version: Optional[Version] = None) -> pd.DataFrame:

# if no data is found for the expected latest version, roll back once and try again
except NoDataError:
logger.debug(
f"No data found for the expected latest version {latest_version[0]} {latest_version[1]}."
f" Rolling back version"
logger.info(
f"No data found for expected latest version: {latest_version[0]} {latest_version[1]}."
f" Rolling back version..."
)
latest_version = roll_back_version(latest_version)
return fetch_data(latest_version)
14 changes: 0 additions & 14 deletions tests/test_weo/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,17 +222,3 @@ def test_clean_numeric_columns(self):
# Assert that "n/a" and "--" have been replaced with nulls
for column in result_df.columns:
assert result_df[column].isnull().any()

# check error is raised if any other unusual values are present
data_df = pd.DataFrame(
{
"REF_AREA_CODE": ["1", "2", "n/a", "--"],
"OBS_VALUE": ["1.1", "2.2", "n/a", "abc"],
"SCALE_CODE": ["3", "4", "n/a", "--"],
"LASTACTUALDATE": ["2023", "2024", "n/a", "--"],
"TIME_PERIOD": ["1980", "1981", "n/a", "--"],
}
)

with pytest.raises(ValueError):
SDMXParser.clean_numeric_columns(data_df)
Loading