Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upt raw_fullacs to raw_acs, raw_acs to raw_spm_acs #57

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,12 @@ class CustomDataset:
### CPS
- OpenFisca-US-compatible
- Contains OpenFisca-US-compatible input arrays.
### RawACS
### RawSPMACS
- Not OpenFisca-US-compatible
- Contains the tables from the raw [ACS SPM research file](https://www.census.gov/data/datasets/time-series/demo/supplemental-poverty-measure/acs-research-files.html) microdata.
### RawACS
- Not OpenFisca-US-compatible
- Contains the tables from the raw [ACS person and household file](https://www.census.gov/programs-surveys/acs/microdata.html) microdata.
### ACS
- OpenFisca-US-compatible
- Contains OpenFisca-US-compatible input arrays.
- Contains OpenFisca-US-compatible input arrays from the spm research file.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- Contains OpenFisca-US-compatible input arrays from the spm research file.
- Contains OpenFisca-US-compatible input arrays from the SPM research file.

2 changes: 1 addition & 1 deletion openfisca_us_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

REPO = Path(__file__).parent

DATASETS = (RawCPS, CPS, RawACS, ACS)
DATASETS = (RawCPS, CPS, RawACS, ACS, RawSPMACS)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
DATASETS = (RawCPS, CPS, RawACS, ACS, RawSPMACS)
DATASETS = (RawCPS, CPS, RawACS, RawSPMACS, ACS)

1 change: 1 addition & 0 deletions openfisca_us_data/datasets/acs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from openfisca_us_data.datasets.acs.raw_acs import RawACS
from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS
from openfisca_us_data.datasets.acs.acs import ACS
8 changes: 4 additions & 4 deletions openfisca_us_data/datasets/acs/acs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from openfisca_us_data.utils import US, dataset
from openfisca_us_data.datasets.acs.raw_acs import RawACS
from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS
from pandas import DataFrame
import h5py

Expand All @@ -19,10 +19,10 @@ def generate(year: int) -> None:

# Prepare raw ACS tables
year = int(year)
if year not in RawACS.years:
RawACS.generate(year)
if year not in RawSPMACS.years:
RawSPMACS.generate(year)

raw_data = RawACS.load(year)
raw_data = RawSPMACS.load(year)
acs = h5py.File(ACS.file(year), mode="w")

person, spm_unit, household = [
Expand Down
71 changes: 22 additions & 49 deletions openfisca_us_data/datasets/acs/raw_acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,64 +2,37 @@
import requests
from io import BytesIO
import pandas as pd

from zipfile import ZipFile

@dataset
class RawACS:
name = "raw_acs"

def generate(year: int) -> None:
url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta"
url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip"
request = requests.get(url)
file = ZipFile(BytesIO(request.content))
file.extractall(f'{year}_pus')
Copy link
Contributor

@MaxGhenis MaxGhenis Nov 2, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nikhilwoodruff feel free to suggest otherwise given these are large files (larger than others), but to be consistent with other generate functions I think we'll want to avoid writing the source files to disk, and instead load from the zip file directly. This might make most sense as a function, something like this (not sure if it'll work):

def concat_zipped_csvs(url: str, prefix: str) -> pd.DataFrame:
    """Download a zip archive and stack its two CSV parts into one DataFrame.

    The archive is handled entirely in memory; nothing is extracted to disk.

    Args:
        url: Address of the zip file to download.
        prefix: Common stem of the two member files; ``prefix + "a.csv"``
            and ``prefix + "b.csv"`` are read from the archive.

    Returns:
        The rows of both CSVs concatenated (part "a" first), with missing
        values replaced by 0 and column names lower-cased.
    """
    response = requests.get(url)
    # Fail on an HTTP error here rather than letting ZipFile choke on an
    # error page that is not a zip archive.
    response.raise_for_status()
    # BytesIO needs the raw body bytes (``.content``), not the Response
    # object itself.
    with ZipFile(BytesIO(response.content)) as zf:
        with zf.open(prefix + "a.csv") as part_a:
            a = pd.read_csv(part_a)
        with zf.open(prefix + "b.csv") as part_b:
            b = pd.read_csv(part_b)
    res = pd.concat([a, b]).fillna(0)
    res.columns = res.columns.str.lower()
    return res

Then called as:

person_df = concat_zipped_csvs(
    f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip",
    "psam_pus"
)

And similarly for household.

Copy link
Contributor

@nikhilwoodruff nikhilwoodruff Nov 2, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's fine as a general approach for large files - though I did envisage something similar to openfisca-uk-data's download() function (either from gcp or GitHub) being useful here


url2 = f'https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip'
request = requests.get(url2)
file = ZipFile(BytesIO(request.content))
file.extractall(f'{year}_hus')

try:
with pd.HDFStore(RawACS.file(year)) as storage:
person = pd.read_stata(url).fillna(0)
person.columns = person.columns.str.upper()
storage["person"] = person
storage["spm_unit"] = create_SPM_unit_table(person)
storage["household"] = create_household_table(person)
# The extracted PUMS archive holds the person records in two CSV parts
# ("a" and "b"); read both and stack them into one table.
persona = pd.read_csv(f'{year}_pus/psam_pusa.csv')
personb = pd.read_csv(f'{year}_pus/psam_pusb.csv')
# pd.concat expects ONE iterable of frames. Passing the frames as separate
# positional arguments binds the second one to `axis` and raises.
person_df = pd.concat([persona, personb]).fillna(0)
person_df.columns = person_df.columns.str.upper()

# Same two-part layout for the household records. The second part is
# psam_husb.csv; reading psam_husa.csv twice would duplicate part "a" and
# silently drop part "b".
householda = pd.read_csv(f'{year}_hus/psam_husa.csv')
householdb = pd.read_csv(f'{year}_hus/psam_husb.csv')
household_df = pd.concat([householda, householdb]).fillna(0)
household_df.columns = household_df.columns.str.upper()

except Exception as e:
RawACS.remove(year)
raise ValueError(
f"Attempted to extract and save the CSV files, but encountered an error: {e}"
)


def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse the person-level table to one row per SPM unit.

    Only the ``SPM_``-prefixed columns named below are kept. Rows are
    grouped by ``person.SPM_ID`` and the first entry of every column within
    each group is taken, so the result is indexed by SPM unit id.

    Args:
        person: Person-level table containing all of the ``SPM_*`` columns
            listed below (including ``SPM_ID``).

    Returns:
        A DataFrame with one row per distinct ``SPM_ID``.
    """
    suffixes = (
        "CAPHOUSESUB",
        "CAPWKCCXPNS",
        "CHILDCAREXPNS",
        "EITC",
        "ENGVAL",
        "EQUIVSCALE",
        "FEDTAX",
        "FEDTAXBC",
        "FICA",
        "GEOADJ",
        "MEDXPNS",
        "NUMADULTS",
        "NUMKIDS",
        "NUMPER",
        "POOR",
        "POVTHRESHOLD",
        "RESOURCES",
        "SCHLUNCH",
        "SNAPSUB",
        "STTAX",
        "TENMORTSTATUS",
        "TOTVAL",
        "WCOHABIT",
        "WICVAL",
        "WKXPNS",
        "WUI_LT15",
        "ID",
    )
    unit_columns = [f"SPM_{suffix}" for suffix in suffixes]
    per_unit = person[unit_columns].groupby(person.SPM_ID)
    return per_unit.first()


def create_household_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse the person-level table to one row per household.

    Keeps the ``SERIALNO``, ``ST`` and ``PUMA`` columns, groups the rows by
    ``person.SERIALNO`` and takes the first entry of each column per group.

    Args:
        person: Person-level table with ``SERIALNO``, ``ST`` and ``PUMA``.

    Returns:
        A DataFrame indexed by ``SERIALNO`` with one row per household.
    """
    household_columns = ["SERIALNO", "ST", "PUMA"]
    per_household = person[household_columns].groupby(person.SERIALNO)
    return per_household.first()
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a newline

65 changes: 65 additions & 0 deletions openfisca_us_data/datasets/acs/raw_spm_acs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from openfisca_us_data.utils import *
import requests
from io import BytesIO
import pandas as pd


@dataset
class RawSPMACS:
    """Raw tables built from the Census ACS SPM research file.

    ``file`` and ``remove`` are not defined in this class body; they are
    presumably supplied by the ``@dataset`` decorator (verify in
    ``openfisca_us_data.utils``).
    """

    name = "raw_spm_acs"

    def generate(year: int) -> None:
        """Download the SPM research file for ``year`` and store its tables.

        The person-level Stata file is read directly from the Census URL,
        missing values are replaced with 0 and column names are upper-cased.
        Three tables are then written to the HDF store at
        ``RawSPMACS.file(year)``: ``person``, ``spm_unit`` and ``household``.

        Args:
            year: Year of the research file to download.

        Raises:
            ValueError: If downloading, reading or saving fails.
                ``RawSPMACS.remove(year)`` is called before raising, and the
                original exception is attached as the cause.
        """
        url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta"
        try:
            with pd.HDFStore(RawSPMACS.file(year)) as storage:
                person = pd.read_stata(url).fillna(0)
                person.columns = person.columns.str.upper()
                storage["person"] = person
                storage["spm_unit"] = create_SPM_unit_table(person)
                storage["household"] = create_household_table(person)
        except Exception as e:
            # Clean up before reporting so a half-written store is not left
            # behind (presumably what remove() does; verify in utils).
            RawSPMACS.remove(year)
            # The source is a Stata file, not CSV; chain the original error
            # so its traceback is preserved.
            raise ValueError(
                f"Attempted to download and save the ACS SPM research file, but encountered an error: {e}"
            ) from e


def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse the person-level table to one row per SPM unit.

    Only the ``SPM_``-prefixed columns named below are kept. Rows are
    grouped by ``person.SPM_ID`` and the first entry of every column within
    each group is taken, so the result is indexed by SPM unit id.

    Args:
        person: Person-level table containing all of the ``SPM_*`` columns
            listed below (including ``SPM_ID``).

    Returns:
        A DataFrame with one row per distinct ``SPM_ID``.
    """
    suffixes = (
        "CAPHOUSESUB",
        "CAPWKCCXPNS",
        "CHILDCAREXPNS",
        "EITC",
        "ENGVAL",
        "EQUIVSCALE",
        "FEDTAX",
        "FEDTAXBC",
        "FICA",
        "GEOADJ",
        "MEDXPNS",
        "NUMADULTS",
        "NUMKIDS",
        "NUMPER",
        "POOR",
        "POVTHRESHOLD",
        "RESOURCES",
        "SCHLUNCH",
        "SNAPSUB",
        "STTAX",
        "TENMORTSTATUS",
        "TOTVAL",
        "WCOHABIT",
        "WICVAL",
        "WKXPNS",
        "WUI_LT15",
        "ID",
    )
    unit_columns = [f"SPM_{suffix}" for suffix in suffixes]
    per_unit = person[unit_columns].groupby(person.SPM_ID)
    return per_unit.first()


def create_household_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse the person-level table to one row per household.

    Keeps the ``SERIALNO``, ``ST`` and ``PUMA`` columns, groups the rows by
    ``person.SERIALNO`` and takes the first entry of each column per group.

    Args:
        person: Person-level table with ``SERIALNO``, ``ST`` and ``PUMA``.

    Returns:
        A DataFrame indexed by ``SERIALNO`` with one row per household.
    """
    household_columns = ["SERIALNO", "ST", "PUMA"]
    per_household = person[household_columns].groupby(person.SERIALNO)
    return per_household.first()
3 changes: 3 additions & 0 deletions tests/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ def test_CPS_import():
def test_RawACS_import():
    """Check that RawACS can be imported from the top-level package."""
    # The import itself is the test: it fails if the name is not exported.
    from openfisca_us_data import RawACS

def test_RawSPMACS_import():
    """Check that RawSPMACS can be imported from the top-level package."""
    # The import itself is the test: it fails if the name is not exported.
    from openfisca_us_data import RawSPMACS


def test_ACS_import():
from openfisca_us_data import ACS