diff --git a/urbansim/models/dcm.py b/urbansim/models/dcm.py
index b649ad1d..515f7983 100644
--- a/urbansim/models/dcm.py
+++ b/urbansim/models/dcm.py
@@ -234,13 +234,19 @@ class MNLDiscreteChoiceModel(DiscreteChoiceModel):
         Whether (and how much) to sample alternatives during prediction.
         Note that this can lead to multiple choosers picking the same
         alternative.
-    choice_column : optional
+    choice_column : str, optional
         Name of the column in the `alternatives` table that choosers
         should choose. e.g. the 'building_id' column. If not provided
         the alternatives index is used.
-    name : optional
+    name : str, optional
         Optional descriptive name for this model that may be used in
         output.
+    normalize : bool, optional, default False
+        Whether to subtract the mean and divide by the standard deviation
+        of each column before fitting the coefficients.
+    l1 : float, optional, default 0.0
+        Amount of L1 (Lasso) regularization used when fitting the coefficients.
+    l2 : float, optional, default 0.0
+        Amount of L2 (Ridge) regularization used when fitting the coefficients.
 
     """
     def __init__(
@@ -251,7 +257,8 @@ def __init__(
             interaction_predict_filters=None,
             estimation_sample_size=None,
             prediction_sample_size=None,
-            choice_column=None, name=None):
+            choice_column=None, name=None,
+            normalize=False, l1=0.0, l2=0.0):
         self._check_prob_choice_mode_compat(probability_mode, choice_mode)
         self._check_prob_mode_interaction_compat(
             probability_mode, interaction_predict_filters)
@@ -270,6 +277,9 @@ def __init__(
         self.choice_column = choice_column
         self.name = name if name is not None else 'MNLDiscreteChoiceModel'
         self.sim_pdf = None
+        self.normalize = normalize
+        self.l1 = l1
+        self.l2 = l2
 
         self.log_likelihoods = None
         self.fit_parameters = None
@@ -308,7 +318,10 @@ def from_yaml(cls, yaml_str=None, str_or_buffer=None):
             estimation_sample_size=cfg.get('estimation_sample_size', None),
             prediction_sample_size=cfg.get('prediction_sample_size', None),
             choice_column=cfg.get('choice_column', None),
-            name=cfg.get('name', None)
+            name=cfg.get('name', None),
+            normalize=cfg.get('normalize', False),
+            l1=cfg.get('l1', 0.0),
+            l2=cfg.get('l2', 0.0),
         )
 
         if cfg.get('log_likelihoods', None):
@@ -420,7 +433,8 @@ def fit(self, choosers, alternatives, current_choice):
                 'the input columns.')
 
         self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
-            model_design.as_matrix(), chosen, self.sample_size)
+            model_design.as_matrix(), chosen, self.sample_size,
+            normalize=self.normalize, l1=self.l1, l2=self.l2)
         self.fit_parameters.index = model_design.columns
 
         logger.debug('finish: fit LCM model {}'.format(self.name))
@@ -534,6 +548,18 @@ def probabilities(self, choosers, alternatives, filter_tables=True):
         coeffs = [self.fit_parameters['Coefficient'][x]
                   for x in model_design.columns]
 
+        normalization_mean = 0.0
+        if 'Normalization Mean' in self.fit_parameters.columns:
+            normalization_mean = self.fit_parameters['Normalization Mean']
+            normalization_mean = [normalization_mean[x]
+                                  for x in model_design.columns]
+
+        normalization_std = 1.0
+        if 'Normalization Std' in self.fit_parameters.columns:
+            normalization_std = self.fit_parameters['Normalization Std']
+            normalization_std = [normalization_std[x]
+                                 for x in model_design.columns]
+
         # probabilities are returned from mnl_simulate as a 2d array
         # with choosers along rows and alternatives along columns
         if self.probability_mode == 'single_chooser':
@@ -544,7 +570,10 @@
 
         probabilities = mnl.mnl_simulate(
             model_design.as_matrix(), coeffs,
-            numalts=numalts, returnprobs=True)
+            numalts,
+            normalization_mean,
+            normalization_std,
+            returnprobs=True)
 
         # want to turn probabilities into a Series with a MultiIndex
         # of chooser IDs and alternative IDs.
@@ -680,7 +709,10 @@ def to_dict(self):
             'fitted': self.fitted,
             'log_likelihoods': self.log_likelihoods,
             'fit_parameters': (yamlio.frame_to_yaml_safe(self.fit_parameters)
-                               if self.fitted else None)
+                               if self.fitted else None),
+            'normalize': self.normalize,
+            'l1': self.l1,
+            'l2': self.l2,
         }
 
     def to_yaml(self, str_or_buffer=None):
diff --git a/urbansim/models/tests/test_dcm.py b/urbansim/models/tests/test_dcm.py
index fa60ba06..e72f3c6b 100644
--- a/urbansim/models/tests/test_dcm.py
+++ b/urbansim/models/tests/test_dcm.py
@@ -45,7 +45,7 @@ def alternatives():
 
 
 @pytest.fixture
-def basic_dcm():
+def simple_dcm():
     model_exp = 'var2 + var1:var3'
     sample_size = 5
     probability_mode = 'full_product'
@@ -71,6 +71,21 @@ def basic_dcm():
     return model
 
 
+@pytest.fixture
+def simple_dcm_fit(simple_dcm, choosers, alternatives):
+    simple_dcm.fit(choosers, alternatives, choosers.thing_id)
+    return simple_dcm
+
+
+@pytest.fixture(params=[(False, 0.0, 0.0), (True, 0.0, 0.0),
+                        (True, 1.0e-2, 0.0), (True, 0.0, 1.0e-2),
+                        (True, 1.0e-2, 1.0e-2)])
+def basic_dcm(request, simple_dcm):
+    model = simple_dcm
+    model.normalize, model.l1, model.l2 = request.param
+
+    return model
+
+
 @pytest.fixture
 def basic_dcm_fit(basic_dcm, choosers, alternatives):
     basic_dcm.fit(choosers, alternatives, choosers.thing_id)
@@ -144,7 +159,7 @@ def test_mnl_dcm(seed, basic_dcm, choosers, alternatives):
     # involved, but can at least do a smoke test.
     assert len(loglik) == 3
     assert len(basic_dcm.fit_parameters) == 2
-    assert len(basic_dcm.fit_parameters.columns) == 3
+    assert len(basic_dcm.fit_parameters.columns) == (5 if basic_dcm.normalize else 3)
 
     filtered_choosers, filtered_alts = basic_dcm.apply_predict_filters(
         choosers, alternatives)
@@ -160,10 +175,17 @@
 
     choices = basic_dcm.predict(choosers.iloc[1:], alternatives)
 
-    pdt.assert_series_equal(
-        choices,
-        pd.Series(
-            ['h', 'c', 'f'], index=pd.Index([1, 3, 4], name='chooser_id')))
+    if basic_dcm.normalize:
+        # normalizing should not change the model, but the normalized fit
+        # hits the coeffrange bounds, so the predicted choices differ
+        pdt.assert_series_equal(
+            choices,
+            pd.Series(
+                ['j', 'b', 'b'], index=pd.Index([1, 3, 4], name='chooser_id')))
+    else:
+        pdt.assert_series_equal(
+            choices,
+            pd.Series(
+                ['h', 'c', 'f'], index=pd.Index([1, 3, 4], name='chooser_id')))
 
     # check that we can do a YAML round-trip
     yaml_str = basic_dcm.to_yaml()
@@ -190,7 +212,7 @@ def test_mnl_dcm_repeated_alts(basic_dcm, choosers, alternatives):
     # involved, but can at least do a smoke test.
     assert len(loglik) == 3
     assert len(basic_dcm.fit_parameters) == 2
-    assert len(basic_dcm.fit_parameters.columns) == 3
+    assert len(basic_dcm.fit_parameters.columns) == (5 if basic_dcm.normalize else 3)
 
     repeated_index = alternatives.index.repeat([1, 2, 3, 2, 4, 3, 2, 1, 5, 8])
     repeated_alts = alternatives.loc[repeated_index].reset_index()
@@ -219,7 +241,10 @@ def test_mnl_dcm_yaml(basic_dcm, choosers, alternatives):
         'choice_column': basic_dcm.choice_column,
         'fitted': False,
         'log_likelihoods': None,
-        'fit_parameters': None
+        'fit_parameters': None,
+        'normalize': basic_dcm.normalize,
+        'l1': basic_dcm.l1,
+        'l2': basic_dcm.l2,
     }
 
     assert yaml.load(basic_dcm.to_yaml()) == expected_dict
@@ -243,13 +268,13 @@ def test_mnl_dcm_yaml(basic_dcm, choosers, alternatives):
     assert new_mod.fitted is True
 
 
-def test_mnl_dcm_prob_mode_single(seed, basic_dcm_fit, choosers, alternatives):
-    basic_dcm_fit.probability_mode = 'single_chooser'
+def test_mnl_dcm_prob_mode_single(seed, simple_dcm_fit, choosers, alternatives):
+    simple_dcm_fit.probability_mode = 'single_chooser'
 
-    filtered_choosers, filtered_alts = basic_dcm_fit.apply_predict_filters(
+    filtered_choosers, filtered_alts = simple_dcm_fit.apply_predict_filters(
         choosers, alternatives)
 
-    probs = basic_dcm_fit.probabilities(choosers.iloc[1:], alternatives)
+    probs = simple_dcm_fit.probabilities(choosers.iloc[1:], alternatives)
 
     pdt.assert_series_equal(
         probs,
@@ -267,21 +292,21 @@ def test_mnl_dcm_prob_mode_single(seed, basic_dcm_fit, choosers, alternatives):
                 [[1], filtered_alts.index.values],
                 names=['chooser_id', 'alternative_id'])))
 
-    sprobs = basic_dcm_fit.summed_probabilities(choosers, alternatives)
+    sprobs = simple_dcm_fit.summed_probabilities(choosers, alternatives)
     pdt.assert_index_equal(
         sprobs.index, filtered_alts.index, check_names=False)
     npt.assert_allclose(sprobs.sum(), len(filtered_choosers))
 
 
 def test_mnl_dcm_prob_mode_single_prediction_sample_size(
-        seed, basic_dcm_fit, choosers, alternatives):
-    basic_dcm_fit.probability_mode = 'single_chooser'
-    basic_dcm_fit.prediction_sample_size = 5
+        seed, simple_dcm_fit, choosers, alternatives):
+    simple_dcm_fit.probability_mode = 'single_chooser'
+    simple_dcm_fit.prediction_sample_size = 5
 
-    filtered_choosers, filtered_alts = basic_dcm_fit.apply_predict_filters(
+    filtered_choosers, filtered_alts = simple_dcm_fit.apply_predict_filters(
         choosers, alternatives)
 
-    probs = basic_dcm_fit.probabilities(choosers.iloc[1:], alternatives)
+    probs = simple_dcm_fit.probabilities(choosers.iloc[1:], alternatives)
 
     pdt.assert_series_equal(
         probs,
@@ -295,7 +320,7 @@ def test_mnl_dcm_prob_mode_single_prediction_sample_size(
                 [[1], ['g', 'j', 'f', 'd', 'a']],
                 names=['chooser_id', 'alternative_id'])))
 
-    sprobs = basic_dcm_fit.summed_probabilities(choosers, alternatives)
+    sprobs = simple_dcm_fit.summed_probabilities(choosers, alternatives)
     pdt.assert_index_equal(
         sprobs.index,
         pd.Index(['d', 'g', 'a', 'c', 'd'], name='alternative_id'))
@@ -320,14 +345,14 @@ def test_mnl_dcm_prob_mode_full_prediction_sample_size(
     npt.assert_allclose(sprobs.sum(), len(filtered_choosers))
 
 
-def test_mnl_dcm_choice_mode_agg(seed, basic_dcm_fit, choosers, alternatives):
-    basic_dcm_fit.probability_mode = 'single_chooser'
-    basic_dcm_fit.choice_mode = 'aggregate'
+def test_mnl_dcm_choice_mode_agg(seed, simple_dcm_fit, choosers, alternatives):
+    simple_dcm_fit.probability_mode = 'single_chooser'
+    simple_dcm_fit.choice_mode = 'aggregate'
 
-    filtered_choosers, filtered_alts = basic_dcm_fit.apply_predict_filters(
+    filtered_choosers, filtered_alts = simple_dcm_fit.apply_predict_filters(
         choosers, alternatives)
 
-    choices = basic_dcm_fit.predict(choosers, alternatives)
+    choices = simple_dcm_fit.predict(choosers, alternatives)
 
     pdt.assert_series_equal(
         choices,
@@ -512,14 +537,20 @@ def test_mnl_dcm_segmented_yaml(grouped_choosers, alternatives):
                 'name': 'x',
                 'fitted': False,
                 'log_likelihoods': None,
-                'fit_parameters': None
+                'fit_parameters': None,
+                'normalize': False,
+                'l1': 0.0,
+                'l2': 0.0,
             },
             'y': {
                 'name': 'y',
                 'model_expression': 'var3 + var1:var2',
                 'fitted': False,
                 'log_likelihoods': None,
-                'fit_parameters': None
+                'fit_parameters': None,
+                'normalize': False,
+                'l1': 0.0,
+                'l2': 0.0,
             }
         }
     }
diff --git a/urbansim/urbanchoice/mnl.py b/urbansim/urbanchoice/mnl.py
index 463eb252..8ba33866 100644
--- a/urbansim/urbanchoice/mnl.py
+++ b/urbansim/urbanchoice/mnl.py
@@ -67,7 +67,7 @@ def get_standard_error(hessian):
 
 
 def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False,
-               stderr=0):
+               stderr=0, l1=0.0, l2=0.0):
     logger.debug('start: calculate MNL log-likelihood')
     numvars = beta.size
     numobs = data.size() // numvars // numalts
@@ -117,11 +117,18 @@ def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False,
     loglik = loglik.get_mat()[0, 0]
     gradarr = np.reshape(gradarr.get_mat(), (1, gradarr.size()))[0]
 
+    # apply the L1 (Lasso) penalty and its subgradient
+    loglik -= l1 * np.abs(beta.get_mat()).sum()
+    gradarr -= l1 * np.sign(beta.get_mat())[0]
+
+    # apply the L2 (Ridge) penalty; its gradient is 2 * l2 * beta
+    loglik -= l2 * np.square(beta.get_mat()).sum()
+    gradarr -= 2.0 * l2 * beta.get_mat()[0]
+
     logger.debug('finish: calculate MNL log-likelihood')
     return -1 * loglik, -1 * gradarr
 
 
-def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
+def mnl_simulate(data, coeff, numalts, normalization_mean=0.0,
+                 normalization_std=1.0, GPU=False, returnprobs=True):
     """
     Get the probabilities for each chooser choosing between `numalts`
     alternatives.
@@ -136,6 +143,10 @@ def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
         The model coefficients corresponding to each column in `data`.
     numalts : int
         The number of alternatives available to each chooser.
+    normalization_mean : 1D array or float, optional
+        The mean subtracted from each column of `data` before applying `coeff`.
+    normalization_std : 1D array or float, optional
+        The standard deviation each column of `data` is divided by.
     GPU : bool, optional
     returnprobs : bool, optional
         If True, return the probabilities for each chooser/alternative instead
@@ -153,6 +164,7 @@ def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
             len(data), numalts))
     atype = 'numpy' if not GPU else 'cuda'
 
+    data = (data.copy() - normalization_mean) / normalization_std
     data = np.transpose(data)
     coeff = np.reshape(np.array(coeff), (1, len(coeff)))
@@ -176,7 +188,7 @@ def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
 
 
 def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
-                 weights=None, lcgrad=False, beta=None):
+                 weights=None, lcgrad=False, beta=None,
+                 normalize=False, l1=0.0, l2=0.0):
     """
     Calculate coefficients of the MNL model.
 
@@ -200,6 +212,8 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
     lcgrad : bool, optional
     beta : 1D array, optional
         Any initial guess for the coefficients.
+    normalize : bool, optional, default False
+        Whether to subtract the mean and divide by the standard deviation
+        of each column before fitting the coefficients.
+    l1 : float, optional, default 0.0
+        Amount of L1 (Lasso) regularization used when fitting the coefficients.
+    l2 : float, optional, default 0.0
+        Amount of L2 (Ridge) regularization used when fitting the coefficients.
 
     Returns
     -------
@@ -224,6 +238,12 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
     numvars = data.shape[1]
     numobs = data.shape[0] // numalts
 
+    if normalize:
+        normalization_mean = data.mean(0)
+        normalization_std = data.std(0, ddof=1)
+
+        data = (data.copy() - normalization_mean) / normalization_std
+
     if chosen is None:
         chosen = np.ones((numobs, numalts))  # used for latent classes
 
@@ -239,7 +259,7 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
     bounds = [coeffrange] * numvars
 
     with log_start_finish('scipy optimization for MNL fit', logger):
-        args = (data, chosen, numalts, weights, lcgrad)
+        args = (data, chosen, numalts, weights, lcgrad, 0, l1, l2)
         bfgs_result = scipy.optimize.fmin_l_bfgs_b(mnl_loglik,
                                                    beta,
                                                    args=args,
@@ -271,5 +291,9 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
                                    'Std. Error': stderr,
                                    'T-Score': beta / stderr})
 
+    if normalize:
+        fit_parameters['Normalization Mean'] = normalization_mean
+        fit_parameters['Normalization Std'] = normalization_std
+
     logger.debug('finish: MNL fit')
     return log_likelihood, fit_parameters
diff --git a/urbansim/urbanchoice/tests/test_mnl.py b/urbansim/urbanchoice/tests/test_mnl.py
index a939e462..50e6869d 100644
--- a/urbansim/urbanchoice/tests/test_mnl.py
+++ b/urbansim/urbanchoice/tests/test_mnl.py
@@ -111,30 +111,54 @@ def choosers_dm(choosers, test_data):
 @pytest.fixture
 def fit_coeffs(dm, chosen, num_alts):
     log_like, fit = mnl.mnl_estimate(dm.as_matrix(), chosen, num_alts)
-    return fit.Coefficient.values
+    return fit
 
 
-def test_mnl_estimate(dm, chosen, num_alts, test_data):
-    log_like, fit = mnl.mnl_estimate(dm.as_matrix(), chosen, num_alts)
-    result = pd.Series(fit.Coefficient.values, index=dm.columns)
+@pytest.fixture
+def fit_normalize(dm, chosen, num_alts):
+    log_like, fit = mnl.mnl_estimate(dm.as_matrix(), chosen, num_alts,
+                                     normalize=True)
+    return fit
+
+
+def test_mnl_estimate(fit_coeffs, fit_normalize, dm, chosen, num_alts, test_data):
+    result = pd.Series(fit_coeffs.Coefficient.values, index=dm.columns)
     result, expected = result.align(test_data['est_expected'])
     npt.assert_allclose(result.values, expected.values, rtol=1e-4)
 
+    l1 = abs(fit_normalize.Coefficient).sum()
+    _, fit = mnl.mnl_estimate(dm.as_matrix(), chosen, num_alts,
+                              normalize=True, l1=0.1)
+    fit_l1 = abs(fit.Coefficient).sum()
+    assert fit_l1 < l1, "the l1 penalty should shrink the coefficients' l1 norm"
+
+    l2 = np.square(fit_normalize.Coefficient).sum()
+    _, fit = mnl.mnl_estimate(dm.as_matrix(), chosen, num_alts,
+                              normalize=True, l2=0.2)
+    fit_l2 = np.square(fit.Coefficient).sum()
+    assert fit_l2 < l2, "the l2 penalty should shrink the coefficients' l2 norm"
+
 
-def test_mnl_simulate(dm, fit_coeffs, num_alts, test_data, choosers_dm):
+def test_mnl_simulate(dm, fit_coeffs, fit_normalize, num_alts, test_data, choosers_dm):
     # check that if all the alternatives have the same numbers
     # we get an even probability distribution
     data = np.array(
         [[10 ** (x + 1) for x in range(len(dm.columns))]] * num_alts)
 
     probs = mnl.mnl_simulate(
-        data, fit_coeffs, num_alts, returnprobs=True)
+        data, fit_coeffs.Coefficient.values, num_alts, returnprobs=True)
     npt.assert_allclose(probs, [[1 / num_alts] * num_alts])
 
     # now test with real data
     probs = mnl.mnl_simulate(
-        choosers_dm.as_matrix(), fit_coeffs, num_alts, returnprobs=True)
+        choosers_dm.as_matrix(), fit_coeffs.Coefficient.values, num_alts,
+        returnprobs=True)
+    results = pd.DataFrame(probs, columns=test_data['sim_expected'].columns)
+    results, expected = results.align(test_data['sim_expected'])
+    npt.assert_allclose(results.as_matrix(), expected.as_matrix(), rtol=1e-4)
+
+    # now test with real data and a normalized fit
+    probs = mnl.mnl_simulate(
+        choosers_dm.as_matrix(), fit_normalize.Coefficient.values, num_alts,
+        returnprobs=True,
+        normalization_mean=fit_normalize['Normalization Mean'].values,
+        normalization_std=fit_normalize['Normalization Std'].values)
     results = pd.DataFrame(probs, columns=test_data['sim_expected'].columns)
     results, expected = results.align(test_data['sim_expected'])
     npt.assert_allclose(results.as_matrix(), expected.as_matrix(), rtol=1e-4)
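
Reviewer note (not part of the patch): with the penalty terms added to `mnl_loglik` above, `scipy.optimize.fmin_l_bfgs_b` now optimizes a penalized log-likelihood rather than the raw one. Writing beta for the coefficient vector and l1, l2 for the penalty weights, the objective and gradient implied by the code are, in LaTeX:

    \ell_{\mathrm{pen}}(\beta) = \ell(\beta) - l_1 \lVert\beta\rVert_1 - l_2 \lVert\beta\rVert_2^2,
    \qquad
    \nabla \ell_{\mathrm{pen}}(\beta) = \nabla \ell(\beta) - l_1 \operatorname{sign}(\beta) - 2\, l_2\, \beta

`mnl_loglik` returns the negated value and gradient because `fmin_l_bfgs_b` minimizes, and `sign(beta)` serves as a subgradient of the L1 term at zero.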
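
Reviewer note (not part of the patch): a minimal sketch of exercising the new keyword arguments directly through the `urbanchoice` layer. The array sizes, random data, and penalty values below are illustrative only; the calls themselves (`mnl_estimate(..., normalize=, l1=, l2=)`, the `Normalization Mean` / `Normalization Std` columns of `fit_parameters`, and `mnl_simulate(..., normalization_mean=, normalization_std=)`) are the ones introduced in this diff.

    import numpy as np

    from urbansim.urbanchoice import mnl

    # illustrative sizes: 50 choosers, 4 alternatives each, 3 variables
    numobs, numalts, numvars = 50, 4, 3
    rng = np.random.RandomState(0)

    # design matrix: one row per chooser/alternative pair, one column per variable
    data = rng.normal(size=(numobs * numalts, numvars))

    # one-hot matrix marking the alternative each chooser picked
    chosen = np.zeros((numobs, numalts))
    chosen[np.arange(numobs), rng.randint(numalts, size=numobs)] = 1

    # fit with normalization and a small amount of both penalties
    log_like, fit = mnl.mnl_estimate(
        data, chosen, numalts, normalize=True, l1=1e-2, l2=1e-2)

    # the normalization constants ride along in fit_parameters, so simulation
    # can apply the same shift and scale that was used during estimation
    probs = mnl.mnl_simulate(
        data, fit.Coefficient.values, numalts,
        normalization_mean=fit['Normalization Mean'].values,
        normalization_std=fit['Normalization Std'].values,
        returnprobs=True)

    print(probs.shape)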