normalize data for dcm #209

Open
wants to merge 12 commits into master
46 changes: 39 additions & 7 deletions urbansim/models/dcm.py
@@ -234,13 +234,19 @@ class MNLDiscreteChoiceModel(DiscreteChoiceModel):
Whether (and how much) to sample alternatives during prediction.
Note that this can lead to multiple choosers picking the same
alternative.
choice_column : optional
choice_column : str, optional
Name of the column in the `alternatives` table that choosers
should choose. e.g. the 'building_id' column. If not provided
the alternatives index is used.
name : optional
name : str, optional
Optional descriptive name for this model that may be used
in output.
normalize : bool, optional, default False
Subtract the mean and divide by the standard deviation of each design
column before fitting the coefficients.
l1 : float, optional, default 0.0
Amount of L1 (Lasso) regularization to apply when fitting the coefficients.
l2 : float, optional, default 0.0
Amount of L2 (Ridge) regularization to apply when fitting the coefficients.

"""
def __init__(
@@ -251,7 +257,8 @@ def __init__(
interaction_predict_filters=None,
estimation_sample_size=None,
prediction_sample_size=None,
choice_column=None, name=None):
choice_column=None, name=None,
normalize=False, l1=0.0, l2=0.0):
self._check_prob_choice_mode_compat(probability_mode, choice_mode)
self._check_prob_mode_interaction_compat(
probability_mode, interaction_predict_filters)
@@ -270,6 +277,9 @@ def __init__(
self.choice_column = choice_column
self.name = name if name is not None else 'MNLDiscreteChoiceModel'
self.sim_pdf = None
self.normalize = normalize
self.l1 = l1
self.l2 = l2

self.log_likelihoods = None
self.fit_parameters = None
@@ -308,7 +318,10 @@ def from_yaml(cls, yaml_str=None, str_or_buffer=None):
estimation_sample_size=cfg.get('estimation_sample_size', None),
prediction_sample_size=cfg.get('prediction_sample_size', None),
choice_column=cfg.get('choice_column', None),
name=cfg.get('name', None)
name=cfg.get('name', None),
normalize=cfg.get('normalize', False),
l1=cfg.get('l1', 0.0),
l2=cfg.get('l2', 0.0),
)

if cfg.get('log_likelihoods', None):
@@ -420,7 +433,8 @@ def fit(self, choosers, alternatives, current_choice):
'the input columns.')

self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
model_design.as_matrix(), chosen, self.sample_size)
model_design.as_matrix(), chosen, self.sample_size,
normalize=self.normalize, l1=self.l1, l2=self.l2)
self.fit_parameters.index = model_design.columns

logger.debug('finish: fit LCM model {}'.format(self.name))
@@ -534,6 +548,18 @@ def probabilities(self, choosers, alternatives, filter_tables=True):
coeffs = [self.fit_parameters['Coefficient'][x]
for x in model_design.columns]

normalization_mean = 0.0
if 'Normalization Mean' in self.fit_parameters.columns:
normalization_mean = self.fit_parameters['Normalization Mean']
normalization_mean = [normalization_mean[x]
for x in model_design.columns]

normalization_std = 1.0
if 'Normalization Std' in self.fit_parameters.columns:
normalization_std = self.fit_parameters['Normalization Std']
normalization_std = [normalization_std[x]
for x in model_design.columns]

# probabilities are returned from mnl_simulate as a 2d array
# with choosers along rows and alternatives along columns
if self.probability_mode == 'single_chooser':
@@ -544,7 +570,10 @@
probabilities = mnl.mnl_simulate(
model_design.as_matrix(),
coeffs,
numalts=numalts, returnprobs=True)
numalts,
normalization_mean,
normalization_std,
returnprobs=True)

# want to turn probabilities into a Series with a MultiIndex
# of chooser IDs and alternative IDs.
@@ -680,7 +709,10 @@ def to_dict(self):
'fitted': self.fitted,
'log_likelihoods': self.log_likelihoods,
'fit_parameters': (yamlio.frame_to_yaml_safe(self.fit_parameters)
if self.fitted else None)
if self.fitted else None),
'normalize': self.normalize,
'l1': self.l1,
'l2': self.l2,
}

def to_yaml(self, str_or_buffer=None):
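Taken together, the dcm.py changes are easiest to see end to end. Below is a minimal usage sketch; the data shapes and column names are assumptions modeled on the test fixtures, not part of the diff.

```python
import pandas as pd
from urbansim.models.dcm import MNLDiscreteChoiceModel

# Toy inputs: five choosers picking among ten alternatives by 'thing_id'.
choosers = pd.DataFrame({
    'var1': range(5, 10),
    'thing_id': ['a', 'c', 'e', 'g', 'i']})
alternatives = pd.DataFrame(
    {'var2': range(10, 20), 'var3': range(20, 30)},
    index=pd.Index(list('abcdefghij'), name='thing_id'))

# The new arguments are opt-in and default to the previous behavior
# (no normalization, no regularization).
model = MNLDiscreteChoiceModel(
    model_expression='var2 + var1:var3',
    sample_size=5,
    normalize=True,  # z-score each design column before fitting
    l1=0.01,         # L1 (Lasso) penalty on the coefficients
    l2=0.01)         # L2 (Ridge) penalty on the coefficients
model.fit(choosers, alternatives, choosers.thing_id)
choices = model.predict(choosers, alternatives)
```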
83 changes: 57 additions & 26 deletions urbansim/models/tests/test_dcm.py
@@ -45,7 +45,7 @@ def alternatives():


@pytest.fixture
def basic_dcm():
def simple_dcm():
model_exp = 'var2 + var1:var3'
sample_size = 5
probability_mode = 'full_product'
@@ -71,6 +71,21 @@ def basic_dcm():
return model


@pytest.fixture
def simple_dcm_fit(simple_dcm, choosers, alternatives):
simple_dcm.fit(choosers, alternatives, choosers.thing_id)
return simple_dcm


@pytest.fixture(params=[(False, 0.0, 0.0), (True, 0.0, 0.0),
(True, 1.0e-2, 0.0), (True, 0.0, 1.0e-2), (True, 1.0e-2, 1.0e-2)])
def basic_dcm(request, simple_dcm):
model = simple_dcm
model.normalize, model.l1, model.l2 = request.param

return model


@pytest.fixture
def basic_dcm_fit(basic_dcm, choosers, alternatives):
basic_dcm.fit(choosers, alternatives, choosers.thing_id)
@@ -144,7 +159,7 @@ def test_mnl_dcm(seed, basic_dcm, choosers, alternatives):
# involved, but can at least do a smoke test.
assert len(loglik) == 3
assert len(basic_dcm.fit_parameters) == 2
assert len(basic_dcm.fit_parameters.columns) == 3
# parentheses matter: without them this assert is vacuously true
# whenever normalize is False
assert len(basic_dcm.fit_parameters.columns) == (5 if basic_dcm.normalize else 3)

filtered_choosers, filtered_alts = basic_dcm.apply_predict_filters(
choosers, alternatives)
@@ -160,10 +175,17 @@

choices = basic_dcm.predict(choosers.iloc[1:], alternatives)

pdt.assert_series_equal(
choices,
pd.Series(
['h', 'c', 'f'], index=pd.Index([1, 3, 4], name='chooser_id')))
if basic_dcm.normalize:
# normalization alone should not change the predictions, but the rescaled
# coefficients hit the limits of coeffrange (see the sketch below)
pdt.assert_series_equal(
choices,
pd.Series(
['j', 'b', 'b'], index=pd.Index([1, 3, 4], name='chooser_id')))
else:
pdt.assert_series_equal(
choices,
pd.Series(
['h', 'c', 'f'], index=pd.Index([1, 3, 4], name='chooser_id')))
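The coeffrange remark above is worth unpacking. A rough numeric illustration (made-up values) of how z-scoring can push the optimum outside `mnl_estimate`'s default `coeffrange=(-3, 3)` box:

```python
import numpy as np

# A fit on raw data corresponds to beta * sigma on z-scored data, so a
# raw optimum near the edge of the box can land outside it after scaling
# and get clipped, which changes the predicted choices.
beta_raw = np.array([1.8, -2.5])  # hypothetical unnormalized optimum
sigma = np.array([0.4, 1.9])      # hypothetical column standard deviations
print(beta_raw * sigma)           # [ 0.72 -4.75] -> second coef clips at -3.0
```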

# check that we can do a YAML round-trip
yaml_str = basic_dcm.to_yaml()
@@ -190,7 +212,7 @@ def test_mnl_dcm_repeated_alts(basic_dcm, choosers, alternatives):
# involved, but can at least do a smoke test.
assert len(loglik) == 3
assert len(basic_dcm.fit_parameters) == 2
assert len(basic_dcm.fit_parameters.columns) == 3
assert len(basic_dcm.fit_parameters.columns) == (5 if basic_dcm.normalize else 3)

repeated_index = alternatives.index.repeat([1, 2, 3, 2, 4, 3, 2, 1, 5, 8])
repeated_alts = alternatives.loc[repeated_index].reset_index()
@@ -219,7 +241,10 @@ def test_mnl_dcm_yaml(basic_dcm, choosers, alternatives):
'choice_column': basic_dcm.choice_column,
'fitted': False,
'log_likelihoods': None,
'fit_parameters': None
'fit_parameters': None,
'normalize': basic_dcm.normalize,
'l1': basic_dcm.l1,
'l2': basic_dcm.l2,
}

assert yaml.load(basic_dcm.to_yaml()) == expected_dict
@@ -243,13 +268,13 @@ def test_mnl_dcm_yaml(basic_dcm, choosers, alternatives):
assert new_mod.fitted is True


def test_mnl_dcm_prob_mode_single(seed, basic_dcm_fit, choosers, alternatives):
basic_dcm_fit.probability_mode = 'single_chooser'
def test_mnl_dcm_prob_mode_single(seed, simple_dcm_fit, choosers, alternatives):
simple_dcm_fit.probability_mode = 'single_chooser'

filtered_choosers, filtered_alts = basic_dcm_fit.apply_predict_filters(
filtered_choosers, filtered_alts = simple_dcm_fit.apply_predict_filters(
choosers, alternatives)

probs = basic_dcm_fit.probabilities(choosers.iloc[1:], alternatives)
probs = simple_dcm_fit.probabilities(choosers.iloc[1:], alternatives)

pdt.assert_series_equal(
probs,
@@ -267,21 +292,21 @@ def test_mnl_dcm_prob_mode_single(seed, basic_dcm_fit, choosers, alternatives):
[[1], filtered_alts.index.values],
names=['chooser_id', 'alternative_id'])))

sprobs = basic_dcm_fit.summed_probabilities(choosers, alternatives)
sprobs = simple_dcm_fit.summed_probabilities(choosers, alternatives)
pdt.assert_index_equal(
sprobs.index, filtered_alts.index, check_names=False)
npt.assert_allclose(sprobs.sum(), len(filtered_choosers))


def test_mnl_dcm_prob_mode_single_prediction_sample_size(
seed, basic_dcm_fit, choosers, alternatives):
basic_dcm_fit.probability_mode = 'single_chooser'
basic_dcm_fit.prediction_sample_size = 5
seed, simple_dcm_fit, choosers, alternatives):
simple_dcm_fit.probability_mode = 'single_chooser'
simple_dcm_fit.prediction_sample_size = 5

filtered_choosers, filtered_alts = basic_dcm_fit.apply_predict_filters(
filtered_choosers, filtered_alts = simple_dcm_fit.apply_predict_filters(
choosers, alternatives)

probs = basic_dcm_fit.probabilities(choosers.iloc[1:], alternatives)
probs = simple_dcm_fit.probabilities(choosers.iloc[1:], alternatives)

pdt.assert_series_equal(
probs,
@@ -295,7 +320,7 @@ def test_mnl_dcm_prob_mode_single_prediction_sample_size(
[[1], ['g', 'j', 'f', 'd', 'a']],
names=['chooser_id', 'alternative_id'])))

sprobs = basic_dcm_fit.summed_probabilities(choosers, alternatives)
sprobs = simple_dcm_fit.summed_probabilities(choosers, alternatives)
pdt.assert_index_equal(
sprobs.index,
pd.Index(['d', 'g', 'a', 'c', 'd'], name='alternative_id'))
@@ -320,14 +345,14 @@ def test_mnl_dcm_prob_mode_full_prediction_sample_size(
npt.assert_allclose(sprobs.sum(), len(filtered_choosers))


def test_mnl_dcm_choice_mode_agg(seed, basic_dcm_fit, choosers, alternatives):
basic_dcm_fit.probability_mode = 'single_chooser'
basic_dcm_fit.choice_mode = 'aggregate'
def test_mnl_dcm_choice_mode_agg(seed, simple_dcm_fit, choosers, alternatives):
simple_dcm_fit.probability_mode = 'single_chooser'
simple_dcm_fit.choice_mode = 'aggregate'

filtered_choosers, filtered_alts = basic_dcm_fit.apply_predict_filters(
filtered_choosers, filtered_alts = simple_dcm_fit.apply_predict_filters(
choosers, alternatives)

choices = basic_dcm_fit.predict(choosers, alternatives)
choices = simple_dcm_fit.predict(choosers, alternatives)

pdt.assert_series_equal(
choices,
@@ -512,14 +537,20 @@ def test_mnl_dcm_segmented_yaml(grouped_choosers, alternatives):
'name': 'x',
'fitted': False,
'log_likelihoods': None,
'fit_parameters': None
'fit_parameters': None,
'normalize': False,
'l1': 0.0,
'l2': 0.0,
},
'y': {
'name': 'y',
'model_expression': 'var3 + var1:var2',
'fitted': False,
'log_likelihoods': None,
'fit_parameters': None
'fit_parameters': None,
'normalize': False,
'l1': 0.0,
'l2': 0.0,
}
}
}
32 changes: 28 additions & 4 deletions urbansim/urbanchoice/mnl.py
@@ -67,7 +67,7 @@ def get_standard_error(hessian):


def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False,
stderr=0):
stderr=0, l1=0.0, l2=0.0):
logger.debug('start: calculate MNL log-likelihood')
numvars = beta.size
numobs = data.size() // numvars // numalts
@@ -117,11 +117,18 @@ def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False,
loglik = loglik.get_mat()[0, 0]
gradarr = np.reshape(gradarr.get_mat(), (1, gradarr.size()))[0]

# L1 (Lasso) term: subgradient of l1 * sum(|beta|) is l1 * sign(beta)
loglik -= l1 * np.abs(beta.get_mat()).sum()
gradarr -= l1 * np.sign(beta.get_mat())[0]

# L2 (Ridge) term: gradient of l2 * sum(beta**2) is 2 * l2 * beta;
# the factor of 2 keeps the gradient consistent with the penalty
# (see the numerical check below the function)
loglik -= l2 * np.square(beta.get_mat()).sum()
gradarr -= 2.0 * l2 * beta.get_mat()[0]

logger.debug('finish: calculate MNL log-likelihood')
return -1 * loglik, -1 * gradarr
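The penalty terms above are easy to sanity-check in isolation. A standalone sketch (plain NumPy and SciPy, not urbansim's PMAT wrappers) confirming the gradients, including the factor of 2 on the L2 term:

```python
import numpy as np
from scipy.optimize import check_grad

l1, l2 = 1.0e-2, 1.0e-2

def penalty(beta):
    # the terms subtracted from the log-likelihood above
    return l1 * np.abs(beta).sum() + l2 * np.square(beta).sum()

def penalty_grad(beta):
    # subgradient of the L1 term plus gradient of the L2 term
    return l1 * np.sign(beta) + 2.0 * l2 * beta

beta = np.array([0.5, -1.2, 2.0, -0.3])  # keep entries away from 0.0
print(check_grad(penalty, penalty_grad, beta))  # ~1e-8, i.e. they agree
```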


def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
def mnl_simulate(data, coeff, numalts, normalization_mean=0.0, normalization_std=1.0, GPU=False,
returnprobs=True):
"""
Get the probabilities for each chooser choosing between `numalts`
alternatives.
@@ -136,6 +143,10 @@ def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
The model coefficients corresponding to each column in `data`.
numalts : int
The number of alternatives available to each chooser.
normalization_mean : float or 1D array, optional
The mean subtracted from each column in `data`; taken from the
'Normalization Mean' fit parameters when estimating with normalize=True.
normalization_std : float or 1D array, optional
The standard deviation each column in `data` is divided by; taken from
the 'Normalization Std' fit parameters when estimating with normalize=True.
GPU : bool, optional
returnprobs : bool, optional
If True, return the probabilities for each chooser/alternative instead
Expand All @@ -153,6 +164,7 @@ def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
len(data), numalts))
atype = 'numpy' if not GPU else 'cuda'

# apply the same normalization that was used at fit time
data = (data.copy() - normalization_mean) / normalization_std
data = np.transpose(data)
coeff = np.reshape(np.array(coeff), (1, len(coeff)))

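Why applying the stored mean and std here reproduces the unnormalized model: mean-centering shifts every alternative's utility by the same constant, which cancels in the softmax, and dividing by the std is absorbed by rescaling the coefficients. A small sketch with toy numbers:

```python
import numpy as np

def softmax(u):
    e = np.exp(u - u.max())
    return e / e.sum()

rng = np.random.default_rng(0)
data = rng.normal(size=(4, 3))      # 4 alternatives, 3 design columns
mu = data.mean(0)
sigma = data.std(0, ddof=1)
beta = np.array([0.5, -1.0, 2.0])   # coefficients fit on the raw data

# utilities shift by the constant mu @ beta, which the softmax ignores
probs_raw = softmax(data @ beta)
probs_norm = softmax(((data - mu) / sigma) @ (beta * sigma))
print(np.allclose(probs_raw, probs_norm))  # True
```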
@@ -176,7 +188,7 @@


def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
weights=None, lcgrad=False, beta=None):
weights=None, lcgrad=False, beta=None, normalize=False, l1=0.0, l2=0.0):
"""
Calculate coefficients of the MNL model.

Expand All @@ -200,6 +212,8 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
lcgrad : bool, optional
beta : 1D array, optional
Any initial guess for the coefficients.
normalize : bool, optional, default False
Subtract the mean and divide by the standard deviation of each column
in `data` before fitting the coefficients.
l1 : float, optional, default 0.0
Amount of L1 (Lasso) regularization to apply when fitting the coefficients.
l2 : float, optional, default 0.0
Amount of L2 (Ridge) regularization to apply when fitting the coefficients.

Returns
-------
Expand All @@ -224,6 +238,12 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
numvars = data.shape[1]
numobs = data.shape[0] // numalts

if normalize:
# z-score each design column, keeping the mean and std so that
# prediction can apply the same transformation
normalization_mean = data.mean(0)
normalization_std = data.std(0, ddof=1)

data = (data.copy() - normalization_mean) / normalization_std

if chosen is None:
chosen = np.ones((numobs, numalts)) # used for latent classes

Expand All @@ -239,7 +259,7 @@ def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
bounds = [coeffrange] * numvars

with log_start_finish('scipy optimization for MNL fit', logger):
args = (data, chosen, numalts, weights, lcgrad)
args = (data, chosen, numalts, weights, lcgrad, 0, l1, l2)  # 0 is stderr
bfgs_result = scipy.optimize.fmin_l_bfgs_b(mnl_loglik,
beta,
args=args,
@@ -271,5 +291,9 @@
'Std. Error': stderr,
'T-Score': beta / stderr})

if normalize:
fit_parameters['Normalization Mean'] = normalization_mean
fit_parameters['Normalization Std'] = normalization_std

logger.debug('finish: MNL fit')
return log_likelihood, fit_parameters
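Finally, a sketch of the round-trip these changes enable (continuing the hypothetical fitted `model` from the first sketch): the new settings survive YAML serialization, and the normalization columns travel with the fit parameters.

```python
yaml_str = model.to_yaml()
new_model = MNLDiscreteChoiceModel.from_yaml(yaml_str)

assert new_model.normalize == model.normalize
assert (new_model.l1, new_model.l2) == (model.l1, model.l2)
if new_model.normalize:
    assert 'Normalization Mean' in new_model.fit_parameters.columns
    assert 'Normalization Std' in new_model.fit_parameters.columns
```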