-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update raw_fullacs to raw_acs, raw_acs to raw_spm_acs #57
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -3,4 +3,4 @@ | |||||
|
||||||
REPO = Path(__file__).parent | ||||||
|
||||||
DATASETS = (RawCPS, CPS, RawACS, ACS) | ||||||
DATASETS = (RawCPS, CPS, RawACS, ACS, RawSPMACS) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from openfisca_us_data.datasets.acs.raw_acs import RawACS | ||
from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS | ||
from openfisca_us_data.datasets.acs.acs import ACS |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,64 +2,37 @@ | |
import requests | ||
from io import BytesIO | ||
import pandas as pd | ||
|
||
from zipfile import ZipFile | ||
|
||
@dataset | ||
class RawACS: | ||
name = "raw_acs" | ||
|
||
def generate(year: int) -> None: | ||
url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta" | ||
url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip" | ||
request = requests.get(url) | ||
file = ZipFile(BytesIO(request.content)) | ||
file.extractall(f'{year}_pus') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

@nikhilwoodruff feel free to suggest otherwise given these are large files (larger than others), but to be consistent with other [… text lost in extraction]:

```python
def concat_zipped_csvs(url: str, prefix: str) -> pd.DataFrame:
    # Creates a DataFrame with the two csvs inside a zip file from a URL.
    zf = ZipFile(BytesIO(requests.get(url)))
    a = pd.read_csv(zf.open(prefix + "a.csv"))
    b = pd.read_csv(zf.open(prefix + "b.csv"))
    res = pd.concat([a, b]).fillna(0)
    res.columns = res.columns.str.lower()
    return res
```

Then called as:

```python
person_df = concat_zipped_csvs(
    f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip",
    "psam_pus"
)
```

And similarly for household.

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

I think that's fine as a general approach for large files - though I did envisage something similar to [… text lost in extraction] |
||
|
||
url2 = f'https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip' | ||
request = requests.get(url2) | ||
file = ZipFile(BytesIO(request.content)) | ||
file.extractall(f'{year}_hus') | ||
|
||
try: | ||
with pd.HDFStore(RawACS.file(year)) as storage: | ||
person = pd.read_stata(url).fillna(0) | ||
person.columns = person.columns.str.upper() | ||
storage["person"] = person | ||
storage["spm_unit"] = create_SPM_unit_table(person) | ||
storage["household"] = create_household_table(person) | ||
persona = pd.read_csv(f'{year}_pus/psam_pusa.csv') | ||
personb = pd.read_csv(f'{year}_pus/psam_pusb.csv') | ||
person_df = pd.concat(persona, personb).fillna(0) | ||
person_df.columns = person_df.columns.str.upper() | ||
|
||
householda = pd.read_csv(f'{year}_hus/psam_husa.csv') | ||
householdb = pd.read_csv(f'{year}_hus/psam_husa.csv') | ||
household_df = pd.concat(householda, householdb).fillna(0) | ||
household_df.columns = household_df.columns.str.upper() | ||
|
||
except Exception as e: | ||
RawACS.remove(year) | ||
raise ValueError( | ||
f"Attempted to extract and save the CSV files, but encountered an error: {e}" | ||
) | ||
|
||
|
||
def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse person-level records into one row per SPM unit.

    Selects the ``SPM_``-prefixed columns named below, groups the rows by
    ``person.SPM_ID`` and keeps the first non-null value of every column
    within each group.

    Args:
        person: Person-level table whose (upper-case) columns include every
            ``SPM_*`` column listed in ``SPM_UNIT_COLUMNS``.

    Returns:
        A DataFrame indexed by ``SPM_ID`` with one row per SPM unit.

    Raises:
        KeyError: If any of the expected ``SPM_*`` columns is missing.
    """
    SPM_UNIT_COLUMNS = [
        "CAPHOUSESUB",
        "CAPWKCCXPNS",
        "CHILDCAREXPNS",
        "EITC",
        "ENGVAL",
        "EQUIVSCALE",
        "FEDTAX",
        "FEDTAXBC",
        "FICA",
        "GEOADJ",
        "MEDXPNS",
        "NUMADULTS",
        "NUMKIDS",
        "NUMPER",
        "POOR",
        "POVTHRESHOLD",
        "RESOURCES",
        "SCHLUNCH",
        "SNAPSUB",
        "STTAX",
        "TENMORTSTATUS",
        "TOTVAL",
        "WCOHABIT",
        "WICVAL",
        "WKXPNS",
        "WUI_LT15",
        "ID",
    ]
    spm_columns = ["SPM_" + column for column in SPM_UNIT_COLUMNS]
    # NOTE(review): presumably every SPM_* value is constant within a unit,
    # so taking the first row per group loses nothing -- confirm against the
    # Census SPM file documentation.
    return person[spm_columns].groupby(person.SPM_ID).first()
|
||
|
||
def create_household_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse person-level records into one row per household.

    Keeps the ``SERIALNO``, ``ST`` and ``PUMA`` columns, groups the rows by
    ``person.SERIALNO`` and takes the first non-null value of each column
    within every group.

    Args:
        person: Person-level table containing ``SERIALNO``, ``ST`` and
            ``PUMA`` columns.

    Returns:
        A DataFrame indexed by ``SERIALNO`` with one row per household.

    Raises:
        KeyError: If any of the three columns is missing.
    """
    household_columns = ["SERIALNO", "ST", "PUMA"]
    return person[household_columns].groupby(person.SERIALNO).first()
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

add a newline |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from openfisca_us_data.utils import * | ||
import requests | ||
from io import BytesIO | ||
import pandas as pd | ||
|
||
|
||
@dataset
class RawSPMACS:
    """Raw ACS-based Supplemental Poverty Measure (SPM) public-use file.

    NOTE(review): ``file`` and ``remove`` are not defined here; presumably
    the ``@dataset`` decorator supplies them (and lets ``generate`` be called
    without an instance) -- confirm in ``openfisca_us_data.utils``.
    """

    name = "raw_spm_acs"

    def generate(year: int) -> None:
        """Download the Census SPM file for ``year`` and store it as HDF5.

        Reads the Stata file straight from the Census URL, replaces missing
        values with 0, upper-cases the column names and writes three tables
        to the store: ``person``, ``spm_unit`` and ``household``.

        Args:
            year: Year of the SPM public-use file to download.

        Raises:
            ValueError: If downloading, parsing or saving fails. The
                partially written dataset is removed first and the original
                exception is chained as the cause.
        """
        url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta"
        try:
            with pd.HDFStore(RawSPMACS.file(year)) as storage:
                person = pd.read_stata(url).fillna(0)
                person.columns = person.columns.str.upper()
                storage["person"] = person
                storage["spm_unit"] = create_SPM_unit_table(person)
                storage["household"] = create_household_table(person)
        except Exception as e:
            # Do not leave a half-written file behind for later runs.
            RawSPMACS.remove(year)
            raise ValueError(
                f"Attempted to download and save the SPM Stata file, but encountered an error: {e}"
            ) from e
|
||
|
||
def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse person-level records into one row per SPM unit.

    Selects the ``SPM_``-prefixed columns named below, groups the rows by
    ``person.SPM_ID`` and keeps the first non-null value of every column
    within each group.

    Args:
        person: Person-level table whose (upper-case) columns include every
            ``SPM_*`` column listed in ``SPM_UNIT_COLUMNS``.

    Returns:
        A DataFrame indexed by ``SPM_ID`` with one row per SPM unit.

    Raises:
        KeyError: If any of the expected ``SPM_*`` columns is missing.
    """
    SPM_UNIT_COLUMNS = [
        "CAPHOUSESUB",
        "CAPWKCCXPNS",
        "CHILDCAREXPNS",
        "EITC",
        "ENGVAL",
        "EQUIVSCALE",
        "FEDTAX",
        "FEDTAXBC",
        "FICA",
        "GEOADJ",
        "MEDXPNS",
        "NUMADULTS",
        "NUMKIDS",
        "NUMPER",
        "POOR",
        "POVTHRESHOLD",
        "RESOURCES",
        "SCHLUNCH",
        "SNAPSUB",
        "STTAX",
        "TENMORTSTATUS",
        "TOTVAL",
        "WCOHABIT",
        "WICVAL",
        "WKXPNS",
        "WUI_LT15",
        "ID",
    ]
    spm_columns = ["SPM_" + column for column in SPM_UNIT_COLUMNS]
    # NOTE(review): presumably every SPM_* value is constant within a unit,
    # so taking the first row per group loses nothing -- confirm against the
    # Census SPM file documentation.
    return person[spm_columns].groupby(person.SPM_ID).first()
|
||
|
||
def create_household_table(person: pd.DataFrame) -> pd.DataFrame:
    """Collapse person-level records into one row per household.

    Keeps the ``SERIALNO``, ``ST`` and ``PUMA`` columns, groups the rows by
    ``person.SERIALNO`` and takes the first non-null value of each column
    within every group.

    Args:
        person: Person-level table containing ``SERIALNO``, ``ST`` and
            ``PUMA`` columns.

    Returns:
        A DataFrame indexed by ``SERIALNO`` with one row per household.

    Raises:
        KeyError: If any of the three columns is missing.
    """
    household_columns = ["SERIALNO", "ST", "PUMA"]
    return person[household_columns].groupby(person.SERIALNO).first()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.