Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support for years >= 2000 #38

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions folktables/load_acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,10 @@ def download_and_extract(url, datadir, remote_fname, file_name, delete_download=
def initialize_and_download(datadir, state, year, horizon, survey, download=False):
"""Download the dataset (if required)."""
assert horizon in ['1-Year', '5-Year']
assert int(year) >= 2014
assert state in state_list
assert survey in ['person', 'household']
assert int(year) >= 2000
assert int(year) >= 2009 or horizon == '1-Year'

state_code = _STATE_CODES[state]
survey_code = 'p' if survey == 'person' else 'h'
Expand All @@ -68,7 +69,9 @@ def initialize_and_download(datadir, state, year, horizon, survey, download=Fals

print(f'Downloading data for {year} {horizon} {survey} survey for {state}...')
# Download and extract file
base_url= f'https://www2.census.gov/programs-surveys/acs/data/pums/{year}/{horizon}'
base_url = f'https://www2.census.gov/programs-surveys/acs/data/pums/{year}'
if year >= 2007:
base_url += f'/{horizon}'
remote_fname = f'csv_{survey_code}{state.lower()}.zip'
url = f'{base_url}/{remote_fname}'
try:
Expand All @@ -91,8 +94,11 @@ def load_acs(root_dir, states=None, year=2018, horizon='1-Year',
and the output is instead filtered with the provided list (only entries with
a serial number in the list are kept).
"""
if int(year) < 2014:
raise ValueError('Year must be >= 2014')
if int(year) < 2000:
raise ValueError('Only years 2000 and later are supported')

if int(year) < 2009 and horizon != '1-Year':
raise ValueError('5-Year estimates are not available for years before 2009')

if serial_filter_list is not None:
serial_filter_list = set(serial_filter_list) # set for faster membership check
Expand All @@ -111,10 +117,10 @@ def load_acs(root_dir, states=None, year=2018, horizon='1-Year',
initialize_and_download(base_datadir, state, year, horizon, survey, download=download)
)

dtypes = {'PINCP': np.float64, 'RT': str, 'SOCP': str, 'SERIALNO': str, 'NAICSP': str}
df_list = []
for file_name in file_names:
df = pd.read_csv(file_name, dtype=dtypes).replace(' ','')
dtype = {'RT': str, 'SOCP': str, 'SERIALNO': str, 'NAICSP': str}
df = pd.read_csv(file_name, na_values=[' '], dtype=dtype).replace(' ','')
if serial_filter_list is not None:
df = df[df['SERIALNO'].isin(serial_filter_list)]
df_list.append(df)
Expand Down
Loading