Skip to content

Commit

Permalink
Introduce BetaGeoNBD Random Variable (#1317)
Browse files Browse the repository at this point in the history
* Add BGNBD distribution and BGNBD Random Variable

* Add BGNBD excel test

* Remove logp in terms of Potential

* Rename BGNBD -> BetaGeoNBD

* Add logp and test matching lifetimes

* Add logp param.type.ndim > 1. Add logp pt.switch. Related tests

* Add test_bg_nbd_sample_prior

* Add _distribution_new_customers and related test. Rename population_dropout|purchase_rate as dropout|purchase_rate to consolidate naming among models

* Adjust test_model_repr expected result to BetaGeoNBD instead of BGNBD

* Improve distribution_new_customer, distribution_new_customer_purchase_rate. Introduce distribution_new_customer_recency_frequency. Improve tests

* Revert @pytest.mark.slow to in test_model_convergence

* Revert distribution changes related to #1269

* Revert more changes related to #1269

* Revert BetaGeoBetaBinomRV changes

* Revert ParetoNBDRV changes

* Docstring cleanup

* Revert changes in ContContract dist

* Clean ContContract changes

* Revert deletion _supp_shape_from_params

* Remove commented chunk on fit_result. Opted for data to standardize with other CLV models

* BetaGeoNBDRV as pre-#1269 definition

* Fix test_numerically_stable_logp

* Overrride ModifiedBetaGeoModel.distribution_new_customer method

* Modify test_bg_nbd tensor parametrization to only vectors in test_bg_nbd. Passing param.type.ndim > 1 now raises NotImplementedError

* Silence mypy in 2 lines

* Adapt BetaGeoNBDRV to #1269

* Fix test_clv_fit_mcmc

* Modify sim_data to reflect the beta-distributed dropout process

* Add reference to BetaGeoNBD

* Delete _logp

* Delete commented weights param in test_bg_nbd

* Ammend BetaGeoNBD docstring

* Fix BetaGeoNBD math

* Fix test_posterior_distributions to include dropout distributions

* Fix #NUM! docstring

* Tweak sim_data

* Add co-author.

Co-authored-by: Colt Allen <[email protected]>

---------

Co-authored-by: Colt Allen <[email protected]>
  • Loading branch information
PabloRoque and ColtAllen authored Jan 14, 2025
1 parent bcc9837 commit 8d94482
Show file tree
Hide file tree
Showing 6 changed files with 657 additions and 70 deletions.
156 changes: 155 additions & 1 deletion pymc_marketing/clv/distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@
from pytensor.graph import vectorize_graph
from pytensor.tensor.random.op import RandomVariable

__all__ = ["BetaGeoBetaBinom", "ContContract", "ContNonContract", "ParetoNBD"]
__all__ = [
"BetaGeoBetaBinom",
"BetaGeoNBD",
"ContContract",
"ContNonContract",
"ParetoNBD",
]


class ContNonContractRV(RandomVariable):
Expand Down Expand Up @@ -587,3 +593,151 @@ def logp_customer_died(t_x_i, x_i, alpha_i, beta_i, gamma_i, delta_i, T_i):
delta > 0,
msg="alpha > 0, beta > 0, gamma > 0, delta > 0",
)


class BetaGeoNBDRV(RandomVariable):
name = "bg_nbd"
signature = "(),(),(),(),()->(2)"

dtype = "floatX"
_print_name = ("BetaGeoNBD", "\\operatorname{BetaGeoNBD}")

def __call__(self, a, b, r, alpha, T, size=None, **kwargs):
return super().__call__(a, b, r, alpha, T, size=size, **kwargs)

@classmethod
def rng_fn(cls, rng, a, b, r, alpha, T, size):
if size is None:
size = np.broadcast_shapes(a.shape, b.shape, r.shape, alpha.shape, T.shape)

a = np.asarray(a)
b = np.asarray(b)
r = np.asarray(r)
alpha = np.asarray(alpha)
T = np.asarray(T)

if size == ():
size = np.broadcast_shapes(a.shape, b.shape, r.shape, alpha.shape, T.shape)

a = np.broadcast_to(a, size)
b = np.broadcast_to(b, size)
r = np.broadcast_to(r, size)
alpha = np.broadcast_to(alpha, size)
T = np.broadcast_to(T, size)

output = np.zeros(shape=size + (2,)) # noqa:RUF005

lam = rng.gamma(shape=r, scale=1 / alpha, size=size)
p = rng.beta(a=a, b=b, size=size)

def sim_data(lam, p, T):
t = 0 # recency
n = 0 # frequency

churn = 0 # BG/NBD assumes all non-repeat customers are active
wait = rng.exponential(scale=1 / lam)

while t + wait < T and not churn:
churn = rng.random() < p
n += 1
t += wait
wait = rng.exponential(scale=1 / lam)

return np.array([t, n])

for index in np.ndindex(*size):
output[index] = sim_data(lam[index], p[index], T[index])

return output


bg_nbd = BetaGeoNBDRV()


class BetaGeoNBD(PositiveContinuous):
r"""Population-level distribution class for a discrete, non-contractual, Beta-Geometric/Negative-Binomial process.
It is based on Fader, et al. in [1]_, [2]_ and [3]_.
.. math::
\mathbb{LL}(r, \alpha, a, b | x, t_x, T) =
D_1 + D_2 + \ln(C_3 + \delta_{x>0} C_4) \text{, where:} \\
\begin{align}
D_1 &= \ln \left[ \Gamma(r+x) \right] - \ln \left[ \Gamma(r) \right] + \ln \left[ \Gamma(a+b) \right] + \ln \left[ \Gamma(b+x) \right] \\
D_2 &= r \ln(\alpha) - (r+x) \ln(\alpha + t_x) \\
C_3 &= \left(\frac{\alpha + t_x}{\alpha + T} \right)^{r+x} \\
C_4 &= \left(\frac{a}{b+x-1} \right) \\
\end{align}
======== ===============================================
Support :math:`t_j >= 0` for :math:`j = 1, \dots,x`
Mean :math:`\mathbb{E}[X(n) | r, \alpha, a, b] = \frac{a+b-1}{a-1} \left[ 1 - \left(\frac{\alpha}{\alpha + T}\right)^r {_2}{F}{_1}(r,b;a+b-1;\frac{t}{\alpha + t}) \right]`
======== ===============================================
References
----------
.. [1] Fader, Peter S., Bruce G.S. Hardie, and Jen Shang (2010),
"Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model
Marketing Science, 24 (Spring), 275-284
.. [2] Implementing the BG/NBD Model for Customer Base Analysis in Excel http://brucehardie.com/notes/004/bgnbd_spreadsheet_note.pdf
.. [3] Overcoming the BG/NBD Model's #NUM! Error Problem https://brucehardie.com/notes/027/bgnbd_num_error.pdf
""" # noqa: E501

rv_op = bg_nbd

@classmethod
def dist(cls, a, b, r, alpha, T, **kwargs):
"""Get the distribution from the parameters."""
return super().dist([a, b, r, alpha, T], **kwargs)

def logp(value, a, b, r, alpha, T):
"""Log-likelihood of the distribution."""
t_x = pt.atleast_1d(value[..., 0])
x = pt.atleast_1d(value[..., 1])

for param in (t_x, x, a, b, r, alpha, T):
if param.type.ndim > 1:
raise NotImplementedError(
f"BetaGeoNBD logp only implemented for vector parameters, got ndim={param.type.ndim}"
)

x_non_zero = x > 0

d1 = (
pt.gammaln(r + x)
- pt.gammaln(r)
+ pt.gammaln(a + b)
+ pt.gammaln(b + x)
- pt.gammaln(b)
- pt.gammaln(a + b + x)
)

d2 = r * pt.log(alpha) - (r + x) * pt.log(alpha + t_x)
c3 = ((alpha + t_x) / (alpha + T)) ** (r + x)
c4 = a / (b + x - 1)

logp = d1 + d2 + pt.log(c3 + pt.switch(x_non_zero, c4, 0))

logp = pt.switch(
pt.or_(
pt.or_(
pt.lt(t_x, 0),
pt.lt(x, 0),
),
pt.gt(t_x, T),
),
-np.inf,
logp,
)

return check_parameters(
logp,
a > 0,
b > 0,
alpha > 0,
r > 0,
msg="a > 0, b > 0, alpha > 0, r > 0",
)
Loading

0 comments on commit 8d94482

Please sign in to comment.