UDST · mxndrwgrdnr · Apr 4, 2018 · Apr 4, 2018 · Apr 4, 2018 · Apr 4, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -1,5 +1,4 @@
 language: python
-sudo: false
 python:
 - '2.7'
 - '3.5'
@@ -13,13 +12,12 @@ install:
 - conda config --set always_yes yes --set changeps1 no
 - conda update -q conda
 - conda info -a
-- |
-  conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip numexpr numpy pandas scipy pytest
+- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip numexpr numpy pandas scipy pytest tqdm futures
 - source activate test-environment
-- pip install pytest-cov coveralls pep8
+- pip install pytest-cov coveralls pycodestyle
 - pip install .
 script:
-- pep8 synthpop
+- pycodestyle synthpop
 - py.test --cov synthpop --cov-report term-missing
 after_success:
 - coveralls

diff --git a/scripts/sfbay_synth.py b/scripts/sfbay_synth.py
@@ -0,0 +1,151 @@
+"""Synthesize households and persons for the SF Bay Area counties.
+
+For each county in ``counties`` this script runs
+``synthesize_all_in_parallel`` on a ``Starter`` recipe, recodes the
+resulting tables and writes one household CSV and one person CSV to the
+current directory.  It then concatenates the per-county CSVs found there
+into two region-wide files with non-overlapping ids.
+
+The Census API key is read from the ``CENSUS`` environment variable.
+"""
+import os
+import pandas as pd
+from glob import glob
+import warnings
+
+from synthpop.census_helpers import Census
+from synthpop.recipes.starter2 import Starter
+from synthpop.synthesizer import synthesize_all_in_parallel
+
+warnings.filterwarnings('ignore')
+
+counties = [
+    "Napa County", "Santa Clara County", "Solano County", "San Mateo County",
+    "Marin County", "San Francisco County", "Sonoma County",
+    "Contra Costa County", "Alameda County"]
+
+# Columns kept in the per-county outputs; all others are dropped.
+# NOTE: 'hispanic_head' is listed but never created by this script.
+HH_KEEP_COLS = [
+    'persons', 'income', 'age_of_head', 'race_of_head',
+    'hispanic_head', 'workers', 'children', 'cars', 'tenure',
+    'recent_mover', 'building_type', 'serialno', 'state',
+    'county', 'tract', 'block group']
+P_KEEP_COLS = [
+    'household_id', 'member_id',
+    'relate', 'age', 'sex', 'race_id', 'hispanic',
+    'student', 'worker', 'hours',
+    'work_at_home', 'edu', 'earning']
+
+# ESR codes that mark a person as a worker.
+WORKER_ESR_CODES = [1, 2, 4, 5]
+
+
+def _count_per_household(persons, hh_index):
+    """Count rows of `persons` per household id, 0 where there are none."""
+    counts = persons.groupby('household_id').size()
+    return counts.reindex(hh_index, fill_value=0)
+
+
+def _recode_households(hh_all, p_all):
+    """Add derived columns to `hh_all` and return the trimmed table.
+
+    `hh_all` is modified in place before the renamed, trimmed copy is
+    returned.  `p_all` must already have a 'household_id' column and
+    still carry the raw RELP, AGEP, RAC1P and ESR columns.
+    """
+    hh_all.index.name = 'household_id'
+    heads = p_all[p_all.RELP == 0].groupby('household_id')
+    hh_all['age_of_head'] = heads.AGEP.max()
+    hh_all['race_of_head'] = heads.RAC1P.max()
+    # Households without any matching person get 0 rather than NaN.
+    hh_all['workers'] = _count_per_household(
+        p_all[p_all.ESR.isin(WORKER_ESR_CODES)], hh_all.index)
+    hh_all['children'] = _count_per_household(
+        p_all[p_all.AGEP < 18], hh_all.index)
+    # .loc instead of chained indexing (hh_all.tenure[mask] = 1), which
+    # pandas does not guarantee to write back into the frame.
+    hh_all['tenure'] = 2
+    hh_all.loc[hh_all.TEN < 3, 'tenure'] = 1  # tenure coded 1:own, 2:rent
+    hh_all['recent_mover'] = 0
+    hh_all.loc[hh_all.MV < 4, 'recent_mover'] = 1  # 1 if recent mover
+    hh_all = hh_all.rename(columns={
+        'VEH': 'cars', 'HINCP': 'income', 'NP': 'persons',
+        'BLD': 'building_type'})
+    return hh_all[[col for col in hh_all.columns if col in HH_KEEP_COLS]]
+
+
+def _recode_persons(p_all):
+    """Return a renamed copy of `p_all` with derived flags, trimmed."""
+    p_all = p_all.rename(columns={
+        'AGEP': 'age', 'RAC1P': 'race_id', 'NP': 'persons',
+        'SPORDER': 'member_id', 'HISP': 'hispanic', 'RELP': 'relate',
+        'SEX': 'sex', 'WKHP': 'hours', 'SCHL': 'edu', 'PERNP': 'earning'})
+    p_all['student'] = 0
+    p_all.loc[p_all.SCH.isin([2, 3]), 'student'] = 1
+    p_all['work_at_home'] = 0
+    p_all.loc[p_all.JWTR == 11, 'work_at_home'] = 1
+    p_all['worker'] = 0
+    p_all.loc[p_all.ESR.isin(WORKER_ESR_CODES), 'worker'] = 1
+    return p_all[[col for col in p_all.columns if col in P_KEEP_COLS]]
+
+
+def synthesize_county(county):
+    """Synthesize one county and write its household and person CSVs."""
+    starter = Starter(os.environ["CENSUS"], "CA", county)
+    county_dfs = synthesize_all_in_parallel(starter)
+    hh_all = county_dfs[0]
+    p_all = county_dfs[1]
+
+    p_all.index.name = 'person_id'
+    p_all = p_all.rename(columns={'hh_id': 'household_id'})
+
+    hh_all = _recode_households(hh_all, p_all)
+    p_all = _recode_persons(p_all)
+
+    hh_all.to_csv('{0}_hh_synth_parallel.csv'.format(county))
+    p_all.to_csv('{0}_p_synth_parallel.csv'.format(county))
+
+
+def combine_counties():
+    """Concatenate the per-county CSVs in the current directory.
+
+    Household and person ids are offset file by file so they do not
+    overlap, and each person's household_id gets the same offset as its
+    household.  Writes the two combined CSVs.
+    """
+    p_df_list = []
+    hh_df_list = []
+    hh_index_start = 0
+    p_index_start = 0
+
+    # Sorted so that ids are assigned in the same order on every run.
+    for hh_file in sorted(glob('*hh*.csv')):
+        county = hh_file.split('_hh')[0]
+        hh_df = pd.read_csv(hh_file, index_col='household_id', header=0)
+        p_df = pd.read_csv(
+            glob(county + '_p*.csv')[0], index_col='person_id', header=0)
+        print(county + ': {0}'.format(str(hh_df.iloc[0].county)))
+        hh_df.index += hh_index_start
+        p_df['household_id'] += hh_index_start
+        p_df.index += p_index_start
+        hh_df_list.append(hh_df)
+        p_df_list.append(p_df)
+        # max() rather than the last row, so the next offset clears every
+        # id even if a file is not ordered by id.
+        hh_index_start = hh_df.index.max() + 1
+        p_index_start = p_df.index.max() + 1
+
+    hh_all = pd.concat(hh_df_list)
+    p_all = pd.concat(p_df_list)
+    # Number of rows sharing an id with another row.
+    print(hh_all.index.duplicated(keep=False).sum())
+    print(p_all.index.duplicated(keep=False).sum())
+    p_all.to_csv('sfbay_persons_2018_04_08.csv')
+    hh_all.to_csv('sfbay_households_2018_04_08.csv')
+
+
+if __name__ == '__main__':
+    for county in counties:
+        synthesize_county(county)
+    combine_counties()
diff --git a/setup.py b/setup.py
@@ -24,6 +24,7 @@
         'numpy>=1.8.0',
         'pandas>=0.15.0',
         'scipy>=0.13.3',
-        'us>=0.8'
+        'us>=0.8',
+        'tqdm>=4.23'
     ]
 )
diff --git a/synthpop/census_helpers.py b/synthpop/census_helpers.py
@@ -83,7 +83,7 @@ def chunks(l, n):
             """ Yield successive n-sized chunks from l.
             """
             for i in range(0, len(l), n):
-                yield l[i:i+n]
+                yield l[i: i + n]
 
         for census_column_batch in chunks(census_columns, 45):
             census_column_batch = list(census_column_batch)
@@ -200,12 +200,12 @@ def try_fips_lookup(self, state, county=None):
         if county is None:
             try:
                 return getattr(us.states, state).fips
-            except:
+            except (KeyError, NameError, ValueError, AttributeError, IndexError):
                 pass
             return state
 
         try:
             return df.loc[(state, county)]
-        except:
+        except (KeyError, NameError, ValueError, AttributeError, IndexError):
             pass
         return state, county
diff --git a/synthpop/ipf/test/test_ipf.py b/synthpop/ipf/test/test_ipf.py
@@ -2,7 +2,7 @@
 import pytest
 from pandas.util import testing as pdt
 
-from .. import ipf
+from synthpop.ipf import ipf
 
 
 def test_trivial_ipf():

diff --git a/synthpop/ipu/ipu.py b/synthpop/ipu/ipu.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import warnings
 
 
 def _drop_zeros(df):
@@ -99,7 +100,7 @@ def iter_columns(self):
         The returned column contains only the non-zero elements.
 
         """
-        return list(self._everything.values())
+        return self._everything.values()
 
     def get_column(self, key):
         """
@@ -259,9 +260,12 @@ def household_weights(
         iterations += 1
 
         if iterations > max_iterations:
-            raise RuntimeError(
+            warnings.warn(
                 'Maximum number of iterations reached during IPU: {}'.format(
-                    max_iterations))
+                    max_iterations), UserWarning)
+            return (
+                pd.Series(best_weights, index=household_freq.index),
+                best_fit_qual, iterations)
 
     return (
         pd.Series(best_weights, index=household_freq.index),

diff --git a/synthpop/ipu/test/test_ipu.py b/synthpop/ipu/test/test_ipu.py
@@ -2,9 +2,8 @@
 import numpy.testing as npt
 import pandas as pd
 import pytest
-from pandas.util import testing as pdt
 
-from .. import ipu
+from synthpop.ipu import ipu
 
 
 @pytest.fixture(scope='module')
@@ -169,7 +168,7 @@ def test_household_weights(
 def test_household_weights_max_iter(
         household_freqs, person_freqs, household_constraints,
         person_constraints):
-    with pytest.raises(RuntimeError):
+    with pytest.warns(UserWarning):
         ipu.household_weights(
             household_freqs, person_freqs, household_constraints,
             person_constraints, convergence=1e-7, max_iterations=10)

diff --git a/synthpop/recipes/starter2.py b/synthpop/recipes/starter2.py
@@ -72,8 +72,7 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
             merge_columns=['tract', 'county', 'state'],
             block_group_size_attr="B11005_001E",
             tract_size_attr="B08201_001E",
-            tract=tract, year=acsyear)
-        self.h_acs = h_acs
+            tract=tract)
 
         self.h_acs_cat = cat.categorize(h_acs, {
             ("sf_detached", "yes"): "B25032_003E + B25032_014E",
@@ -181,9 +180,8 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
         # that will be in the outputted synthetic population
         self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE',
                             'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')
-        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP',
-                            'ESR', 'RAC1P', 'HISP', 'SEX', 'SPORDER',
-                            'PERNP', 'SCHL', 'WKHP', 'JWTR', 'SCH')
+        self.p_pums_cols = ('serialno', 'SPORDER', 'PUMA00', 'PUMA10', 'RELP', 'AGEP',
+                            'ESR', 'SCHL', 'SCH', 'JWTR', 'PERNP', 'WKHP', 'RAC1P', 'HISP', 'SEX')
 
     def get_geography_name(self):
         # this synthesis is at the block group level for most variables

diff --git a/synthpop/recipes/tests/test_starter.py b/synthpop/recipes/tests/test_starter.py
@@ -1,6 +1,8 @@
 import pytest
-from ...synthesizer import *
-from ..starter import Starter
+
+from synthpop.synthesizer import *
+from synthpop.recipes.starter import Starter
+from synthpop.recipes.starter2 import Starter as Starter2
 
 
 @pytest.fixture
@@ -9,5 +11,11 @@ def key():
 
 
 def test_starter(key):
-    st = Starter(key, "CA", "Napa County")
+    st = Starter(key, "CA", "Alpine County")
+    # just run it for now
     synthesize_all(st, num_geogs=1)
+
+
+# no synthesizer run because it is too memory-intensive for Travis
+def test_starter2(key):
+    Starter2(key, "CA", "Alpine County")