Skip to content

Commit

Permalink
Remove requests dependency (#519)
Browse files: browse the repository at this point in the history
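* Remove the `requests` dependency (download with `urllib` instead)
* Save/read the original zipped SAS transport file in the cache instead of a converted CSV
* Fix numerical accuracy in tests (compare features with `assert_allclose` rather than exact equality)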
  • Loading branch information
hoffmansc authored Feb 23, 2024
1 parent c704cd5 commit a41b0ab
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 19 deletions.
27 changes: 11 additions & 16 deletions aif360/sklearn/datasets/meps_datasets.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from io import BytesIO
import os
from zipfile import ZipFile
import urllib

import pandas as pd
import requests

from aif360.sklearn.datasets.utils import standardize_dataset

Expand Down Expand Up @@ -59,25 +58,21 @@ def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
if panel not in {19, 20, 21}:
raise ValueError("only panels 19, 20, and 21 are currently supported.")

fname = 'h192' if panel == 21 else 'h181'
cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname + '.csv')
fname = 'h192ssp.zip' if panel == 21 else 'h181ssp.zip'
cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname)
if cache and os.path.isfile(cache_path):
df = pd.read_csv(cache_path)
df = pd.read_sas(cache_path, format="xport", encoding="utf-8")
else:
# skip prompt if user chooses
accept = accept_terms or input(PROMPT)
if accept != 'y' and accept != True:
if accept != 'y' and accept is not True:
raise PermissionError("Terms not agreed.")
rawz = requests.get(os.path.join(MEPS_URL, fname + 'ssp.zip')).content
with ZipFile(BytesIO(rawz)) as zf:
with zf.open(fname + '.ssp') as ssp:
df = pd.read_sas(ssp, format='xport')
# TODO: does this cause any differences?
# reduce storage size
df = df.apply(pd.to_numeric, errors='ignore', downcast='integer')
if cache:
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
df.to_csv(cache_path, index=None)
rawz = urllib.request.urlopen(os.path.join(MEPS_URL, fname)).read()
df = pd.read_sas(BytesIO(rawz), format='xport', encoding="utf-8", compression="zip")
if cache:
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
with open(cache_path, "wb") as f:
f.write(rawz)
# restrict to correct panel
df = df[df['PANEL'] == panel]
# change all 15s to 16s if panel == 21
Expand Down
7 changes: 4 additions & 3 deletions tests/sklearn/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from functools import partial

import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_array_equal, assert_allclose
import pandas as pd
from pandas.api.types import is_numeric_dtype
from pandas.testing import assert_frame_equal
Expand Down Expand Up @@ -233,7 +233,7 @@ def test_cache_meps(panel):
meps_raw = fetch_meps(panel, cache=False, accept_terms=True)[0]
fetch_meps(panel, cache=True, accept_terms=True)
meps_cached = fetch_meps(panel, cache=True)[0]
assert_frame_equal(meps_raw, meps_cached, check_dtype=False, check_categorical=False)
assert_frame_equal(meps_raw, meps_cached)
assert_array_equal(meps_raw.to_numpy(), meps_cached.to_numpy())

@pytest.mark.parametrize(
Expand All @@ -254,7 +254,8 @@ def test_meps_matches_old(panel, cls):
assert len(meps) == 3
meps.X.RACE = meps.X.RACE.factorize(sort=True)[0]
MEPS = cls()
assert_array_equal(pd.get_dummies(meps.X.drop(columns=educols)), MEPS.features)
assert_allclose(pd.get_dummies(meps.X.drop(columns=educols)).astype(float),
MEPS.features, atol=1e-16)
assert_array_equal(meps.y.factorize(sort=True)[0], MEPS.labels.ravel())

@pytest.mark.parametrize("panel", [19, 20, 21])
Expand Down

0 comments on commit a41b0ab

Please sign in to comment.