Add HMM-constructing helper functions and Ops

aesara-devs · Apr 4, 2022 · 441ce60 · 441ce60
1 parent 0dd8831
commit 441ce60
Show file tree

Hide file tree

Showing 4 changed files with 912 additions and 3 deletions.
diff --git a/aeppl/dists.py b/aeppl/dists.py
@@ -1,8 +1,17 @@
 import warnings
+from copy import copy
+from typing import Sequence
 
+import aesara
 import aesara.tensor as at
-from aesara.graph.basic import Apply
+import numpy as np
+from aesara.compile.builders import OpFromGraph
+from aesara.graph.basic import Apply, Constant
 from aesara.graph.op import Op
+from aesara.tensor.basic import make_vector
+from aesara.tensor.random.basic import categorical
+from aesara.tensor.random.utils import broadcast_params, normalize_size_param
+from aesara.tensor.var import TensorVariable
 
 from aeppl.abstract import MeasurableVariable
 
@@ -40,3 +49,188 @@ def infer_shape(self, fgraph, node, input_shapes):
 dirac_delta = DiracDelta()
 
 MeasurableVariable.register(DiracDelta)
+
+
+def non_constant(x):
+    """Return a tensor variable for `x` that is never a `Constant`.
+
+    `x` is first converted with `at.as_tensor_variable`.  If the result is a
+    `Constant`, a fresh variable of the same type is returned in its place; the
+    new variable takes the constant's name and, when
+    ``aesara.config.compute_test_value`` is not ``"off"``, the constant's data
+    as its test value.  Any other variable is returned as converted.
+    """
+    var = at.as_tensor_variable(x)
+
+    if not isinstance(var, Constant):
+        return var
+
+    # XXX: This isn't good for `size` parameters, because it could result
+    # in `at.get_vector_length` exceptions.
+    stand_in = var.type()
+    # Give the new variable its own copy of the tag before modifying it.
+    stand_in.tag = copy(stand_in.tag)
+    if aesara.config.compute_test_value != "off":
+        stand_in.tag.test_value = var.data
+    stand_in.name = var.name
+    return stand_in
+
+
+def switching_process(
+    comp_rvs: Sequence[TensorVariable],
+    states: TensorVariable,
+):
+    """Construct a switching process over arbitrary univariate mixtures and a state sequence.
+
+    The graph that is built has the following form:
+
+        at.stack(comp_rvs)[states, *idx]
+
+    where ``idx`` holds one ``at.arange`` per remaining axis of the stacked
+    components, so that `states` picks the mixture component along the first
+    axis while the other axes are indexed position by position.
+
+    Parameters
+    ----------
+    comp_rvs
+        A list containing `MeasurableVariable` objects for each mixture component.
+        The components are broadcast against each other before being stacked.
+    states
+        The hidden state sequence.  It is converted to an ``int64`` tensor and
+        should have a number of states equal to the size of `comp_rvs`.
+
+    Returns
+    -------
+    The indexed (i.e. state-selected) tensor variable.
+
+    """
+
+    state_seq = at.as_tensor(states, dtype=np.int64)
+
+    # Bring all components to a common shape and stack them along a new
+    # leading "component" axis.
+    components = [at.as_tensor(rv) for rv in comp_rvs]
+    stacked = at.stack(at.broadcast_arrays(*components))
+
+    # One `arange` index per non-component axis.
+    trailing_dims = tuple(stacked.shape)[1:]
+    trailing_idx = tuple(at.arange(dim) for dim in trailing_dims)
+
+    return stacked[(state_seq,) + trailing_idx]
+
+
+class DiscreteMarkovChainFactory(OpFromGraph):
+    """An `Op` constructed from an Aesara graph that represents a discrete Markov chain.
+
+    This "composite" `Op` allows us to mark a sub-graph as measurable and
+    assign a `_logprob` dispatch implementation.
+
+    As far as broadcasting is concerned, this `Op` has the following
+    `RandomVariable`-like properties:
+
+        ndim_supp = 1
+        ndims_params = (3, 1)
+
+    The class adds no behavior of its own to `OpFromGraph`; it exists so that
+    instances can be told apart by type.
+
+    TODO: It would be nice to express this as a `Blockwise` `Op`.
+    """
+
+
+# Mark `DiscreteMarkovChainFactory` as a `MeasurableVariable`.
+MeasurableVariable.register(DiscreteMarkovChainFactory)
+
+
+def create_discrete_mc_op(rng, size, Gammas, gamma_0):
+    """Construct a `DiscreteMarkovChainFactory` `Op`.
+
+    The returned `Op` wraps a `Scan` that performs the following:
+
+        state_init = categorical(gamma_0)
+        states[0] = categorical(Gammas[0, state_init])
+        for t in range(1, N):
+            states[t] = categorical(Gammas[t, states[t-1]])
+
+    The Aesara graph representing the above is wrapped in an `OpFromGraph` so
+    that we can easily assign it a specific log-probability.
+
+    TODO: Eventually, AePPL should be capable of parsing more sophisticated
+    `Scan`s and producing nearly the same log-likelihoods, and the use of
+    `OpFromGraph` will no longer be necessary.
+
+    Parameters
+    ----------
+    rng
+        The random number generator variable handed to every `categorical`
+        draw and passed to the `Scan` as a non-sequence.
+    size
+        A symbolic vector whose length can be determined by
+        `at.get_vector_length`.
+    Gammas
+        The transition probability matrices.  Only the type of this variable
+        is used here; a new input variable of the same type is created for the
+        `Op`.
+    gamma_0
+        The initial state probabilities.  As with `Gammas`, only its type is
+        used.
+
+    Returns
+    -------
+    A `DiscreteMarkovChainFactory` with the inputs ``[size, Gammas, gamma_0]``
+    and a single output containing the sampled states.
+
+    """
+
+    # Again, we need to preserve the length of this symbolic vector, so we do
+    # this.
+    size_param = make_vector(
+        *[non_constant(size[i]) for i in range(at.get_vector_length(size))]
+    )
+    size_param.name = "size"
+
+    # We make shallow copies so that unwanted ancestors don't appear in the
+    # graph.
+    Gammas_param = non_constant(Gammas).type()
+    Gammas_param.name = "Gammas_param"
+
+    gamma_0_param = non_constant(gamma_0).type()
+    gamma_0_param.name = "gamma_0_param"
+
+    # Broadcast the batch dimensions of the two parameters against each other;
+    # `(3, 1)` gives the number of core dimensions of each parameter.
+    bcast_Gammas_param, bcast_gamma_0_param = broadcast_params(
+        (Gammas_param, gamma_0_param), (3, 1)
+    )
+
+    # Sample the initial state of each state sequence; it only seeds the
+    # `Scan` below.
+    state_0 = categorical(
+        bcast_gamma_0_param,
+        size=tuple(size_param) + tuple(bcast_gamma_0_param.shape[:-1]),
+        # size=at.join(0, size_param, bcast_gamma_0_param.shape[:-1]),
+        rng=rng,
+    )
+
+    # The sequence length is taken from the third-to-last axis of `Gammas`.
+    N = bcast_Gammas_param.shape[-3]
+    states_shape = tuple(state_0.shape) + (N,)
+
+    # Expand the transition matrices so that there is one `N x M x M` stack
+    # per sampled sequence.
+    bcast_Gammas_param = at.broadcast_to(
+        bcast_Gammas_param, states_shape + tuple(bcast_Gammas_param.shape[-2:])
+    )
+
+    def loop_fn(n, state_nm1, Gammas_inner, rng):
+        # Transition matrices for step `n`.
+        gamma_t = Gammas_inner[..., n, :, :]
+        # For every sequence, pick the row of its transition matrix that
+        # corresponds to its previous state.
+        idx = tuple(at.ogrid[[slice(None, d) for d in tuple(state_0.shape)]]) + (
+            state_nm1.T,
+        )
+        gamma_t = gamma_t[idx]
+        state_n = categorical(gamma_t, rng=rng)
+        # States are carried through the loop transposed (see `state_0.T` in
+        # `outputs_info`).
+        return state_n.T
+
+    res, _ = aesara.scan(
+        loop_fn,
+        outputs_info=[{"initial": state_0.T, "taps": [-1]}],
+        sequences=[at.arange(N)],
+        non_sequences=[bcast_Gammas_param, rng],
+        # strict=True,
+    )
+
+    # `rng` is not listed as an explicit input of the `Op`.
+    return DiscreteMarkovChainFactory(
+        [size_param, Gammas_param, gamma_0_param],
+        [res.T],
+        inline=True,
+        on_unused_input="ignore",
+    )
+
+
+def discrete_markov_chain(
+    Gammas: TensorVariable, gamma_0: TensorVariable, size=None, rng=None, **kwargs
+):
+    """Construct a first-order discrete Markov chain distribution.
+
+    This characterizes vector random variables consisting of state indicator
+    values (i.e. ``0`` to ``M - 1``) that are driven by a discrete Markov chain.
+
+
+    Parameters
+    ----------
+    Gammas
+        An array of transition probability matrices.  `Gammas` takes the
+        shape ``... x N x M x M`` for a state sequence of length ``N`` having
+        ``M``-many distinct states.  Each row, ``r``, in a transition probability
+        matrix gives the probability of transitioning from state ``r`` to each
+        other state.  Anything accepted by `at.as_tensor_variable` can be
+        given; the result must have at least three dimensions.
+    gamma_0
+        The initial state probabilities.  The last dimension should be length ``M``,
+        i.e. the number of distinct states.
+    size
+        The size parameter; it is normalized with `normalize_size_param`.
+    rng
+        The random number generator variable to use.  When ``None``, a new
+        shared variable holding a ``np.random.RandomState`` is created.
+    kwargs
+        Only ``testval`` is used: when given and not ``None``, it is assigned
+        as the test value of the returned variable.
+
+    Returns
+    -------
+    The output variable of the constructed `DiscreteMarkovChainFactory` `Op`.
+    """
+    gamma_0 = at.as_tensor_variable(gamma_0)
+
+    # Convert *before* looking at `ndim`, so that inputs without an `ndim`
+    # attribute (e.g. nested lists) are validated instead of failing with an
+    # `AttributeError`.
+    Gammas = at.as_tensor_variable(Gammas)
+
+    assert Gammas.ndim >= 3, "Gammas must have at least three dimensions"
+
+    size = normalize_size_param(size)
+
+    if rng is None:
+        rng = aesara.shared(np.random.RandomState(), borrow=True)
+
+    DiscreteMarkovChainOp = create_discrete_mc_op(rng, size, Gammas, gamma_0)
+    rv_var = DiscreteMarkovChainOp(size, Gammas, gamma_0)
+
+    testval = kwargs.pop("testval", None)
+
+    if testval is not None:
+        rv_var.tag.test_value = testval
+
+    return rv_var
diff --git a/aeppl/logprob.py b/aeppl/logprob.py
@@ -9,7 +9,7 @@
 from aesara.tensor.slinalg import Cholesky, solve_lower_triangular
 from aesara.tensor.var import TensorVariable
 
-from aeppl.dists import DiracDelta
+from aeppl.dists import DiracDelta, DiscreteMarkovChainFactory
 
 
 class ParameterValueError(ValueError):
@@ -606,3 +606,59 @@ def diracdelta_logprob(op, values, *inputs, **kwargs):
     return at.switch(
         at.isclose(values, const_value, rtol=op.rtol, atol=op.atol), 0.0, -np.inf
     )
+
+
+@_logprob.register(DiscreteMarkovChainFactory)
+def discrete_mc_logp(op, values, *inputs, **kwargs):
+    r"""Create an Aesara graph that computes the log-likelihood for a discrete Markov chain.
+
+    This is the log-likelihood for the joint distribution of states, :math:`S_t`, conditional
+    on state samples, :math:`s_t`, given by the following:
+
+    .. math::
+
+        \int_{S_0} P(S_1 = s_1 \mid S_0) dP(S_0) \prod^{T}_{t=2} P(S_t = s_t \mid S_{t-1} = s_{t-1})
+
+    The first term (i.e. the integral) simply computes the marginal :math:`P(S_1 = s_1)`, so
+    another way to express this result is as follows:
+
+    .. math::
+
+        P(S_1 = s_1) \prod^{T}_{t=2} P(S_t = s_t \mid S_{t-1} = s_{t-1})
+
+    XXX TODO: This does not implement complete broadcasting support!
+
+    Parameters
+    ----------
+    op
+        The `DiscreteMarkovChainFactory` instance.  The number of its
+        ``shared_inputs`` determines how many trailing entries of `inputs` are
+        ignored.
+    values
+        A single-element sequence containing the state sequence.
+    inputs
+        The inputs of the `Op`: the size, `Gammas` and `gamma_0`, followed by
+        the shared inputs.
+
+    Returns
+    -------
+    A vector containing the log of each factor in the product above, i.e. one
+    entry per state in the sequence; the entries are not summed.
+
+    Raises
+    ------
+    NotImplementedError
+        If the state sequence is not a vector, `Gammas` has more than three
+        dimensions, or `gamma_0` has more than one dimension.
+
+    """
+
+    (states,) = values
+    # Drop the trailing shared inputs (e.g. the RNG) and the size.
+    _, Gammas, gamma_0 = inputs[: len(inputs) - len(op.shared_inputs)]
+
+    if states.ndim != 1 or Gammas.ndim > 3 or gamma_0.ndim > 1:
+        raise NotImplementedError()
+
+    n_states = states.shape[0]
+
+    # One transition matrix per element of the state sequence.
+    Gammas_at = at.broadcast_to(Gammas, (n_states,) + tuple(Gammas.shape)[-2:])
+    gamma_0_at = gamma_0
+
+    # Marginal probability of the first state: the initial distribution pushed
+    # through the first transition matrix.
+    Gamma_1_at = Gammas_at[0]
+    P_S_1_at = at.dot(gamma_0_at, Gamma_1_at)[states[0]]
+
+    # `Gammas_at[0]` has already been used for the first state, so the
+    # transition into `states[t]`, for `t >= 1`, is governed by `Gammas_at[t]`.
+    # Indexing with `arange(0, n_states - 1)` instead would reuse the first
+    # matrix and never read the last one when `Gammas` varies over time.
+    P_S_2T_at = Gammas_at[at.arange(1, n_states), states[:-1], states[1:]]
+
+    log_P_S_1T_at = at.concatenate(
+        [at.shape_padright(at.log(P_S_1_at)), at.log(P_S_2T_at)]
+    )
+
+    return log_P_S_1T_at