From f2f5c5661b1d1d33e2d4db24257da7fdeb4d5ac3 Mon Sep 17 00:00:00 2001 From: Tor Erlend Fjelde Date: Fri, 5 Apr 2024 21:12:56 +0100 Subject: [PATCH] flow matching blog: only number equations which are referenced --- _posts/2024-01-20-flow-matching.md | 236 ++++++++++++++--------------- 1 file changed, 118 insertions(+), 118 deletions(-) diff --git a/_posts/2024-01-20-flow-matching.md b/_posts/2024-01-20-flow-matching.md index 4f4126c..3c50b68 100644 --- a/_posts/2024-01-20-flow-matching.md +++ b/_posts/2024-01-20-flow-matching.md @@ -257,12 +257,12 @@ Let $\phi: \mathbb{R}^d \rightarrow \mathbb{R}^d$ be a continuously differentiab Let $q_0(x)$ be a density on $\mathbb{R}^d$ and let $p_1(\cdot)$ be the density induced by the following sampling procedure $$ -\begin{equation} +\begin{equation*} \begin{split} x &\sim q_0 \\ y &= \phi(x), \end{split} -\end{equation} +\end{equation*} $$ which corresponds to transforming the samples of $q_0$ by the mapping $\phi$. @@ -369,9 +369,9 @@ Let's denote the induced parametric density by the flow $\phi_\theta$ as $p_1 \t A natural optimisation objective for learning the parameters $\theta \in \Theta$ is to consider maximising the probability of the data under the model: $$ -\begin{equation} +\begin{equation*} \textrm{argmax}_{\theta}\ \ \mathbb{E}_{x\sim \mathcal{D}} [\log p_1(x)]. -\end{equation} +\end{equation*} $$ Parameterising $\phi_\theta$ as a deep neural network leads to several constraints: @@ -421,18 +421,18 @@ As opposed to auto-regressive flows (Huang et al., 2018, Larochelle and Murray, We can also compose such flows to get a new flow: $$ -\begin{equation} +\begin{equation*} \phi = \phi_K \circ \ldots \circ \phi_2 \circ \phi_1. -\end{equation} +\end{equation*} $$ This can be a useful way to construct move expressive flow! The model's log-likelihood is then given by summing each flow's contribution $$ -\begin{equation} +\begin{equation*} \log q(y) = \log p(\phi^{-1}(y)) + \sum_{k=1}^K \log \det\left[\frac{\partial \phi_k^{-1}}{\partial x_{k+1}}(x_{k+1})\right] -\end{equation} +\end{equation*} $$ with $x_k = \phi_K^{-1} \circ \ldots \circ \phi^{-1}_{k} (y)$. @@ -445,34 +445,34 @@ $\phi(x) = x + \delta \ u(x)$ for some $\delta > 0$ and Lipschitz residual connection $u$. We can re-arrange this to get $$ -\begin{equation} +\begin{equation*} \frac{\phi(x) - x}{\delta} = u(x) -\end{equation} +\end{equation*} $$ which is looking awfully similar to $u$ being a derivative. In fact, letting $\delta = 1/K$ and taking the limit $K \rightarrow \infty$ under certain conditions[^ODE_conditions], a composition of residual flows $\phi_K \circ \cdots \circ \phi_2 \circ \phi_1$ is given by an ordinary differential equation (ODE): $$ -\begin{equation} +\begin{equation*} \frac{\dd x_t}{\dd t} = \lim_{\delta \rightarrow 0} \frac{x_{t+\delta} - x_t}{\delta} = \frac{\phi_t(x_t) - x_t}{\delta} = u_t(x_t) -\end{equation} +\end{equation*} $$ where the *flow* of the ODE $\phi_t: [0,1]\times\mathbb{R}^d\rightarrow\mathbb{R}^d$ is defined such that $$ -\begin{equation} +\begin{equation*} \frac{d\phi_t}{dt} = u_t(\phi_t(x_0)). -\end{equation} +\end{equation*} $$ That is, $\phi_t$ maps initial condition $x_0$ to the ODE solution at time $t$: $$ -\begin{equation} +\begin{equation*} x_t \triangleq \phi_t(x_0) = x_0 + \int_{0}^t u_s(x_s) \dd{s} . 
-\end{equation} +\end{equation*} $$ @@ -484,10 +484,10 @@ Of course, this only defines the map $\phi_t(x)$; for this to be a useful normal As it turns out, the density induced by $\phi_t$ (or equivalently $u_t$) can be computed via the following equation[^FPE] $$ -\begin{equation} +\begin{equation*} \frac{\partial}{\partial_t} p_t(x_t) = - (\nabla \cdot (u_t p_t))(x_t). -\end{equation} +\end{equation*} $$ @@ -497,17 +497,17 @@ This statement on the time-evolution of $p_t$ is generally known as the *Transpo Computing the *total* derivative (as $x_t$ also depends on $t$) in log-space yields[^log_pdf] $$ -\begin{equation} +\begin{equation*} \frac{\dd}{\dd t} \log p_t(x_t) = - (\nabla \cdot u_t)(x_t) -\end{equation} +\end{equation*} $$ resulting in the log density $$ -\begin{equation} +\begin{equation*} \log p_t(x) = \log p_0(x_0) - \int_0^t (\nabla \cdot u_s)(x_s) \dd{s}. -\end{equation} +\end{equation*} $$ Parameterising a vector field neural network $u_\theta: \mathbb{R}_+ \times \mathbb{R^d} \rightarrow \mathbb{R^d}$ therefore induces a parametric log-density @@ -519,9 +519,9 @@ $$ In practice, to compute $\log p_t$ one can either solve both the time evolution of $x_t$ and its log density $\log p_t$ jointly $$ -\begin{equation} +\begin{equation*} \frac{\dd}{\dd t} \Biggl( \begin{aligned} x_t \ \quad \\ \log p_t(x_t) \end{aligned} \Biggr) = \Biggl( \begin{aligned} u_\theta(t, x_t) \quad \\ - \div u_\theta(t, x_t) \end{aligned} \Biggr), -\end{equation} +\end{equation*} $$ or solve only for $x_t$ and then use quadrature methods to estimate $\log p_t(x_t)$. @@ -550,9 +550,9 @@ In contrast to previously where we derived a 'one-shot' (i.e. *discrete*) flow b We have the following two distributions $$ -\begin{equation} +\begin{equation*} p_0 = \mathcal{N}(0, 1) \quad \text{and} \quad p_1 = \mathcal{N}(\mu, 1). -\end{equation} +\end{equation*} $$ @@ -561,9 +561,9 @@ $$ It's not difficult to see that we can continuously bridge between these with a simple linear transformation $$ -\begin{equation} +\begin{equation*} \phi(t, x_0) = x_0 + \mu t -\end{equation} +\end{equation*} $$ which is visualized in the figure below. @@ -591,17 +591,17 @@ which is visualized in the figure below. By linearity, we know that every marginal $p_t$ is a Gaussian, and so $$ -\begin{equation} +\begin{equation*} \mathbb{E}_{p_0}[\phi_t(x_0)] = \mu t -\end{equation} +\end{equation*} $$ which, in particular, implies that $\mathbb{E}_{p_0}[\phi_1(x_0)] = \mu = \mathbb{E}\_{p_1}[x_1]$. Similarly, we have $$ -\begin{equation} +\begin{equation*} \mathrm{Var}_{p_0}[\phi_t(x_0)] = 1 \quad \implies \quad \mathrm{Var}_{p_0}[\phi_1(x_0)] = 1 = \mathrm{Var}_{p_1}[x_1] -\end{equation} +\end{equation*} $$ Hence we have a probability path $p_t = \mathcal{N}(\mu t, 1)$ bridging $p_0$ and $p_1$. @@ -632,33 +632,33 @@ Hence we have a probability path $p_t = \mathcal{N}(\mu t, 1)$ bridging $p_0$ an Now let's determine what the vector field $u_t(x)$ would be in this case. As mentioned earlier, $u(t, x)$ should satisfy the following $$ -\begin{equation} +\begin{equation*} \dv{\phi_t}{t}(x_0) = u_t \big( \phi_t(x_0) \big). -\end{equation} +\end{equation*} $$ Since we have already specified $\phi$, we can plug it in on the left hand side to get $$ -\begin{equation} +\begin{equation*} \dv{\phi_t}{t}(x_0) = \dv{t} \big( x_0 + \mu t \big) = \mu -\end{equation} +\end{equation*} $$ which gives us $$ -\begin{equation} +\begin{equation*} \mu = u_t \big( x_0 + \mu t \big). 
-\end{equation} +\end{equation*} $$ The above needs to hold for *all* $t \in [0, 1]$, and so it's not too difficult to see that one such solution is the constant vector field $$ -\begin{equation} +\begin{equation*} u_t(x) = \mu. -\end{equation} +\end{equation*} $$ We could of course have gone the other way, i.e. define the $u_t$ such that $p_0 \overset{u_t}{\longleftrightarrow} p_1$ and derive the corresponding $\phi_t$ by solving the ODE. @@ -700,24 +700,24 @@ Flow matching is a simulation-free way to train CNF models where we directly for $$ -\begin{equation} +\begin{equation*} \mathcal{L}(\theta)_{} = \mathbb{E}_{t \sim \mathcal{U}[0, 1]} \mathbb{E}_{x \sim p_t}\left[\| u_\theta(t, x) - u(t, x) \|^2 \right]. -\end{equation} +\end{equation*} $$ In the equation above, $u(t, x)$ would be a vector field inducing a *probability path* (or bridge) $p_t$ interpolating the reference $p_0$ to $p_1$, i.e. $$ -\begin{equation} +\begin{equation*} \log p_1(x) = \log p_0(x_0) - \int_0^1 (\nabla \cdot u_t)(x_t) \dd{t}. -\end{equation} +\end{equation*} $$ In words: we're just performing regression on $u_t(x)$ for all $t \in [0, 1]$. @@ -822,9 +822,9 @@ Figure 7: *Different paths with the same endpoint marginals[^interpolation].* First, let's remind ourselves that the transport equation relates a vector field $u_t$ to (the time evolution of) a probability path $p_t$ $$ -\begin{equation} +\begin{equation*} \pdv{p_t(x)}{t} = - \nabla \cdot \big( u_t(x) p_t(x) \big), -\end{equation} +\end{equation*} $$ thus constructing $p_t$ or $u_t$ is *equivalent*. @@ -836,9 +836,9 @@ In addition, as opposed to the marginal $p_t$ , the conditional $p_{t\mid1}$ cou In particular, as we have access to data samples $x_1 \sim q_1$, it sounds pretty reasonable to condition on $z=x_1$, leading to the following marginal probability path $$ -\begin{equation} +\begin{equation*} p_t(x_t) = \int q_1(x_1) ~p_{t\mid 1}(x_t\mid x_1) \dd{x_1}. -\end{equation} +\end{equation*} $$ @@ -847,9 +847,9 @@ In this setting, the conditional probability path $p_{t\mid 1}$ needs to satisfy the boundary conditions $$ -\begin{equation} +\begin{equation*} p_0(x \mid x_1) = p_0 \quad \text{and} \quad p_1(x \mid x_1) = \mathcal{N}(x; x_1, \sigmamin^2 I) \xrightarrow[\sigmamin \rightarrow 0]{} \delta_{x_1}(x) -\end{equation} +\end{equation*} $$ with $\sigmamin > 0$ small, and for whatever reference $p_0$ we choose, typically something "simple" like $p_0(x) = \mathcal{N}(x; 0, I)$, as illustrated in the [figure](#figure-heatmap_with_cond_traj-v3) below. @@ -879,9 +879,9 @@ $$ Lipman et al. (2023) introduced the notion of **Conditional Flow Matching (CFM)** by noticing that this *conditional* vector field $u_t(x \mid x_1)$ @@ -902,9 +902,9 @@ To see why this $u_t$ is the same vector field as the one defined earlier, i.e. the one generating the (marginal) probability path $p_t$, we need to show that the expression above for the marginal vector field $u_t(x)$ satisfies the transport equation $$ -\begin{equation} +\begin{equation*} \pdv{\hlthree{p_t(x)}}{t} = - \nabla \cdot \big( \hltwo{u_t(x)} \hlthree{p_t(x)} \big). -\end{equation} +\end{equation*} $$ Writing out the left-hand side, we have @@ -1039,13 +1039,13 @@ $$ \begin{equation*} \begin{split} u_t \big( \phi_t(x_0) \big) -&= \E_{p_{1|t}}\left[u_t \big( \phi_t(x_0) \mid x_1 \big)\right] \\ -&\approx \frac{1}{n} \sum_{i = 1}^n u_t \big( \phi_t(x_0) \mid x_1^{(i)} \big) \ \text{with } x_1^{(i)} \sim p_{1|t}(x_1 \mid \phi_t(x_0)). 
+&= \E_{p_{1 \mid t}}\left[u_t \big( \phi_t(x_0) \mid x_1 \big)\right] \\ +&\approx \frac{1}{n} \sum_{i = 1}^n u_t \big( \phi_t(x_0) \mid x_1^{(i)} \big) \ \text{with } x_1^{(i)} \sim p_{1 \mid t}(x_1 \mid \phi_t(x_0)). \end{split} \end{equation*} $$ -In practice we don't have access to the posterior $$p_{1|t}(x_1|x_t)$$, but in this specific setting we do have closed-form expressions for everything (Albergo & Vanden-Eijnden, 2022), and so we can visualise the marginal vector field $$u_t\big( \phi_t(x_0)\big)$$ and the conditional vector fields $$u_t \big( \phi_t(x_0) \mid x_1^{(i)} \big)$$ for all our "data" samples $$x_1^{(i)}$$ and see how they compare. +In practice we don't have access to the posterior $$p_{1 \mid t}(x_1 \mid x_t)$$, but in this specific setting we do have closed-form expressions for everything (Albergo & Vanden-Eijnden, 2022), and so we can visualise the marginal vector field $$u_t\big( \phi_t(x_0)\big)$$ and the conditional vector fields $$u_t \big( \phi_t(x_0) \mid x_1^{(i)} \big)$$ for all our "data" samples $$x_1^{(i)}$$ and see how they compare. This is shown in the figure below.
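Before turning to the figure, it may help to see this marginalisation written out as a few lines of code. The following is a minimal NumPy sketch, assuming the Gaussian conditional path $p_t(x \mid x_1) = \mathcal{N}(x; t x_1, (1 - t)^2 I)$ with conditional vector field $u_t(x \mid x_1) = (x_1 - x) / (1 - t)$; the helper names `cond_vf` and `marginal_vf` and the use of an empirical $q_1$ are illustrative choices, not part of the original setup.

```python
import numpy as np

def cond_vf(x, x1, t):
    # Conditional vector field u_t(x | x1) = (x1 - x) / (1 - t) for the assumed
    # Gaussian path p_t(x | x1) = N(x; t * x1, (1 - t)^2 I).
    return (x1 - x) / (1.0 - t)

def marginal_vf(x, t, data):
    # Marginal vector field u_t(x) = E_{x1 ~ p_{1|t}(. | x)}[ u_t(x | x1) ]:
    # for an empirical q_1 the posterior weights are w_i ∝ p_t(x | x1_i).
    log_w = -0.5 * np.sum((x - t * data) ** 2, axis=-1) / (1.0 - t) ** 2
    w = np.exp(log_w - log_w.max())
    w /= w.sum()
    return w @ cond_vf(x, data, t)  # posterior-weighted average of conditional fields
```

For small $t$ the weights are close to uniform (many $x_1^{(i)}$ explain $x$ about equally well), while for $t$ close to 1 they concentrate on a few nearby data points, which is exactly the behaviour discussed below.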
@@ -1088,7 +1088,7 @@ Figure 13: Marginal vector field $u_t(x)$ vs. conditional vector field $u_t(x \m
-From the above figures, we can immediately see how for small $t$, i.e. near 0, the posterior $p_{1|t}(x_1|x_t)$ is quite scattered so the marginalisation giving $u_t$ involves many equally likely data samples $x_1$. In contrast, when $t$ increases and get closer to 1, $p_{1|t}(x_1|x_t)$ gets quite concentrated over much fewer samples $x_1$. +From the above figures, we can immediately see how for small $t$, i.e. near 0, the posterior $p_{1 \mid t}(x_1 \mid x_t)$ is quite scattered, so the marginalisation giving $u_t$ involves many equally likely data samples $x_1$. In contrast, when $t$ increases and gets closer to 1, $p_{1 \mid t}(x_1 \mid x_t)$ gets quite concentrated over much fewer samples $x_1$. @@ -1108,18 +1108,18 @@ where $u_t(x) = \mathbb{E}\_{x_1 \sim p_{1 \mid t}} \left[ u_t(x \mid x_1) \rig with an equivalent loss regressing the *conditional* vector field $u_t(x \mid x_1)$ and marginalising $x_1$ instead: $$ -\begin{equation} +\begin{equation*} \mathcal{L}_{\mathrm{CFM}}(\theta) = \mathbb{E}_{t \sim \mathcal{U}[0, 1], x_1 \sim q, x_t \sim p_t(x \mid x_1)}\left[\| u_\theta(t, x) - u_t(x \mid x_1) \|^2 \right]. -\end{equation} +\end{equation*} $$ These losses are equivalent in the sense that $$ -\begin{equation} +\begin{equation*} \nabla_\theta \mathcal{L}_{\mathrm{FM}}(\theta) = \nabla_\theta \mathcal{L}_{\mathrm{CFM}}(\theta), -\end{equation} +\end{equation*} $$ which implies that we can use $${\mathcal{L}}_{\text{CFM}}$$ instead to train the parametric vector field $u_{\theta}$. @@ -1130,10 +1130,10 @@ Let's develop the inner product term for $${\mathcal{L}}_{\text{FM}}$$ and show that $$ \begin{align} \mathbb{E}_{x \sim p_t} ~\langle u_\theta(t, x), \hltwo{u_t(x)} \rangle -&= \int \langle u_\theta(t, x), \hltwo{\int} u_t(x|x_1) \hltwo{\frac{p_t(x|x_1)q(x_1)}{p_t(x)} dx_1} \rangle p_t(x) \mathrm{d} x \\ -&= \int \langle u_\theta(t, x), \int u_t(x|x_1) p_t(x|x_1)q(x_1) dx_1 \rangle \dd{x} \\ -&= \int \int \langle u_\theta(t, x), u_t(x|x_1) \rangle p_t(x|x_1)q(x_1) dx_1 \dd{x} \\ -&= \mathbb{E}_{q_1(x_1) p(x|x_1)} ~\langle u_\theta(t, x), u_t(x|x_1) \rangle +&= \int \langle u_\theta(t, x), \hltwo{\int} u_t(x \mid x_1) \hltwo{\frac{p_t(x \mid x_1)q(x_1)}{p_t(x)} dx_1} \rangle p_t(x) \mathrm{d} x \\ +&= \int \langle u_\theta(t, x), \int u_t(x \mid x_1) p_t(x \mid x_1)q(x_1) dx_1 \rangle \dd{x} \\ +&= \int \int \langle u_\theta(t, x), u_t(x \mid x_1) \rangle p_t(x \mid x_1)q(x_1) dx_1 \dd{x} \\ +&= \mathbb{E}_{q_1(x_1) p(x \mid x_1)} ~\langle u_\theta(t, x), u_t(x \mid x_1) \rangle \end{align} $$ @@ -1144,7 +1144,7 @@ where in the $\hltwo{\text{first highlighted step}}$ we used the expression of $ The benefit of the CFM loss is that once we define the conditional probability path $p_t(x \mid x_1)$, we can construct an unbiased Monte Carlo estimator of the objective using samples $\big( x_1^{(i)} \big)_{i = 1}^n$ from the data target $q_1$! This estimator can be efficiently computed as it involves an expectation over the joint $q_1(x_1)p_t(x \mid x_1)$ -, of the conditional vector field $u_t (x \mid x_1)$ both being available as opposed to the marginal vector field $u_t$ which involves an expectation over the posterior $p_{1|t}(x_1|x)$. +, of the conditional vector field $u_t (x \mid x_1)$, both of which are available, as opposed to the marginal vector field $u_t$, which involves an expectation over the posterior $p_{1 \mid t}(x_1 \mid x)$. 
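To see how cheap the CFM objective is to optimise, here is a rough PyTorch-style sketch of a single stochastic gradient step on $\mathcal{L}_{\mathrm{CFM}}$. It assumes the Gaussian conditional path with $\mu_t(x_1) = t x_1$ and $\sigma_t(x_1) = 1 - (1 - \sigma_{\min}) t$ (discussed further below); the architecture and the names `net` and `cfm_step` are placeholders rather than anything prescribed by the method.

```python
import torch
import torch.nn as nn

# Toy vector-field network u_theta(t, x) for 2-d data; the architecture is arbitrary.
net = nn.Sequential(nn.Linear(3, 64), nn.SiLU(), nn.Linear(64, 2))
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

def cfm_step(x1, sigma_min=1e-2):
    # One SGD step on L_CFM: sample t ~ U[0, 1] and x0 ~ p_0 = N(0, I), form
    # x_t = t * x1 + (1 - (1 - sigma_min) * t) * x0  ~  p_t(. | x1), and regress
    # u_theta(t, x_t) onto the conditional target u_t(x_t | x1) = x1 - (1 - sigma_min) * x0.
    t = torch.rand(x1.shape[0], 1)
    x0 = torch.randn_like(x1)
    xt = t * x1 + (1 - (1 - sigma_min) * t) * x0
    target = x1 - (1 - sigma_min) * x0
    pred = net(torch.cat([t, xt], dim=-1))
    loss = ((pred - target) ** 2).sum(dim=-1).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()
```

Note that nothing here simulates the ODE or evaluates $p_t$: each step only needs a data sample, a reference sample, and the closed-form conditional target.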
@@ -1155,11 +1155,11 @@ We note that, as opposed to the log-likelihood maximisation loss of CNFs which d To do so we construct a _probability path_ $p_t$ which interpolates between the reference (i.e. noise) distribution $q_0$ and the data distribution $q_1$, i.e. $p_{t=0}=q_0$ and $p_{t=1}=q_1 * \mathrm{N}(0, \sigma^2)$. -Lipman et al. (2023) achieves this by constructing $p_t$ as mixture of simpler probability paths: $p_t \triangleq \int p_t(\cdot|x_1) q_1(x_1) \dd{x}_1$, via a _conditional probability path_ $p_t(\cdot|x_1)$ satisfying $p_1(\cdot|x_1)=\mathrm{N}(x_1, \sigma^2) \xrightarrow[\sigma \rightarrow 0]{} \delta_{x_1}$ and $p_0(\cdot|x_1)=p_0$. +Lipman et al. (2023) achieves this by constructing $p_t$ as a mixture of simpler probability paths: $p_t \triangleq \int p_t(\cdot \mid x_1) q_1(x_1) \dd{x}_1$, via a _conditional probability path_ $p_t(\cdot \mid x_1)$ satisfying $p_1(\cdot \mid x_1)=\mathrm{N}(x_1, \sigma^2) \xrightarrow[\sigma \rightarrow 0]{} \delta_{x_1}$ and $p_0(\cdot \mid x_1)=p_0$. As a result both endpoint constraints are satisfied since one recovers -- at $t=1$ the data distribution $p_1(x) = \int p_1(x|x_1) q_1(x_1) \dd{x}_1 = \int \mathrm{N}(x_1, \sigma^2) q_1(x_1) \dd{x}_1 \xrightarrow[\sigma \rightarrow 0]{} q_1(x)$ -- at $t=0$ the reference distribution $p_0(x) = \int p_0(x|x_1) q_1(x_1) \dd{x}_1 = \int q_0(x) q_1(x_1) \dd{x}_1 = q_0(x)$. +- at $t=1$ the data distribution $p_1(x) = \int p_1(x \mid x_1) q_1(x_1) \dd{x}_1 = \int \mathrm{N}(x_1, \sigma^2) q_1(x_1) \dd{x}_1 \xrightarrow[\sigma \rightarrow 0]{} q_1(x)$ +- at $t=0$ the reference distribution $p_0(x) = \int p_0(x \mid x_1) q_1(x_1) \dd{x}_1 = \int q_0(x) q_1(x_1) \dd{x}_1 = q_0(x)$.
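The mixture construction above also tells us how to sample from $p_t$ itself: draw $x_1 \sim q_1$ and then $x_t \sim p_t(\cdot \mid x_1)$. A small NumPy sketch, again assuming the Gaussian conditional path with $\mu_t(x_1) = t x_1$ and $\sigma_t(x_1) = 1 - (1 - \sigma_{\min}) t$, and using an empirical data set in place of $q_1$ (the function name `sample_pt` is illustrative):

```python
import numpy as np

def sample_pt(t, data, sigma_min=1e-2, rng=np.random.default_rng(0)):
    # Ancestral sampling from the marginal path p_t = ∫ p_t(. | x1) q_1(x1) dx1:
    # first x1 ~ q_1 (here: resampled from an empirical data set), then
    # x_t ~ N(t * x1, (1 - (1 - sigma_min) * t)^2 I).
    x1 = data[rng.integers(len(data), size=len(data))]
    sigma_t = 1.0 - (1.0 - sigma_min) * t
    return t * x1 + sigma_t * rng.standard_normal(x1.shape)
```

At $t = 0$ this returns standard normal noise, and at $t = 1$ it returns data points perturbed by $\sigma_{\min}$-scale noise, matching the two boundary conditions listed above.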
@@ -1168,9 +1168,9 @@ As a result both endpoints constraint are satisfied since ones recovers
-We have defined a probability path $p_t$ in terms of conditional probability path $p_t(\cdot|x_1)$, yet how do we define the latter? +We have defined a probability path $p_t$ in terms of a conditional probability path $p_t(\cdot \mid x_1)$, yet how do we define the latter? We know that the transport equation $\frac{\partial}{\partial_t} p_t(x_t) = - (\nabla \cdot (u_t p_t))(x_t)$ relates a vector field to a probability path $p_t$ (given an initial value $p_{t=0} = q_0$). -As such it is sufficient to construct a _conditional vector field_ $u_t(\cdot|x_1)$ which induces a conditional probability path $p_t(\cdot|x_1)$ with the right boundary conditions. +As such it is sufficient to construct a _conditional vector field_ $u_t(\cdot \mid x_1)$ which induces a conditional probability path $p_t(\cdot \mid x_1)$ with the right boundary conditions. --> @@ -1179,21 +1179,21 @@ As such it is sufficient to construct a _conditional vector field_ $u_t(\cdot|x_ Let's now look at a practical example of a conditional vector field and the corresponding probability path. Suppose we want a conditional vector field which generates a path of Gaussians, i.e. $$ -\begin{equation} +\begin{equation*} p_t(x \mid x_1) = \mathcal{N}(x; \mu_t(x_1), \sigma_t(x_1)^2 \mathrm{I}) -\end{equation} +\end{equation*} $$ for some mean $\mu_t(x_1)$ and standard deviation $\sigma_t(x_1)$. - + One conditional vector field inducing the above-defined conditional probability path is given by the following expression: $$ \begin{equation} \label{eq:gaussian-path} -u_t(x|x_1) = \frac{\dot{\sigma_t}(x_1)}{\sigma_t(x_1)} (x - \mu_t(x_1)) + \dot{\mu_t}(x_1) +u_t(x \mid x_1) = \frac{\dot{\sigma_t}(x_1)}{\sigma_t(x_1)} (x - \mu_t(x_1)) + \dot{\mu_t}(x_1) \end{equation} $$ @@ -1205,17 +1205,17 @@ as shown in the proof below. We have $$ -\begin{equation} +\begin{equation*} \phi_t(x \mid x_1) = \mu_t(x_1) + \sigma_t(x_1) x -\end{equation} +\end{equation*} $$ and we want to determine $u_t(x \mid x_1)$ such that $$ -\begin{equation} +\begin{equation*} \frac{\dd}{\dd t} \phi_t(x) = u_t \big( \phi_t(x) \mid x_1 \big) -\end{equation} +\end{equation*} $$ First note that the LHS is @@ -1232,59 +1232,59 @@ $$ so we have $$ -\begin{equation} +\begin{equation*} \dot{\mu_t}(x_1) + \dot{\sigma_t}(x_1) x = u_t \big( \phi_t(x \mid x_1) \mid x_1 \big) -\end{equation} +\end{equation*} $$ Suppose that $u_t$ is of the form $$ -\begin{equation} +\begin{equation*} u_t\big( \phi_t(x) \mid x_1\big) = h\big(t, \phi_t(x), x_1\big) \dot{\mu_t}(x_1) + g\big(t, \phi_t(x), x_1\big) \dot{\sigma_t}(x_1) -\end{equation} +\end{equation*} $$ for some functions $h$ and $g$. Reading off the components from the previous equation, we then see that we require $$ -\begin{equation} +\begin{equation*} h\big(t, \phi_t(x), x_1\big) = 1 \quad \text{and} \quad g(t, \phi_t(x), x_1) = x -\end{equation} +\end{equation*} $$ The simplest solution to the above is then just $$ -\begin{equation} +\begin{equation*} h(t, x, x_1) = 1 -\end{equation} +\end{equation*} $$ i.e. 
constant function, and $$ -\begin{equation} +\begin{equation*} g(t, x, x_1) = \phi_t^{-1}(x) = \frac{x - \mu_t(x_1)}{\sigma_t(x_1)} -\end{equation} +\end{equation*} $$ such that $$ -\begin{equation} +\begin{equation*} g\big(t, \phi_t(x), x_1) = \phi_t^{-1} \big( \phi_t(x) \big) = x -\end{equation} +\end{equation*} $$ resulting in $$ -\begin{equation} +\begin{equation*} u_t \big( x \mid x_1 \big) = \dot{\mu_t}(x_1) + \dot{\sigma_t}(x_1) \bigg( \frac{x - \mu_t(x_1)}{\sigma_t(x_1)} \bigg) -\end{equation} +\end{equation*} $$ as claimed. @@ -1311,9 +1311,9 @@ $$ so that $$ -\begin{equation} +\begin{equation*} \big( {\hlone{\mu_0(x_1)}} + {\hlthree{\sigma_0(x_1)}} x_1 \big) \sim p_0 \quad \text{and} \quad \big( {\hlone{\mu_1(x_1)}} + {\hlthree{\sigma_1(x_1)}} x_1 \big) \sim \mathcal{N}(x_1, \sigmamin^2 I) -\end{equation} +\end{equation*} $$ @@ -1398,11 +1398,11 @@ remove diffusion vf - $dx_t = -\frac{1}{2}\sqrt{\beta(t)} x_t \dd{t} + \beta(t) \dd{B}_t$ - $\alpha_t = e^{-\frac{1}{2}\int_0^t \beta(s) \dd{s}}$ - $\mu_t = \alpha_{1-t} x_1$ and $\sigma_t^2 = 1 - \alpha_{1-t}^2$ -- $u_t(x|x_1) = -\frac{\sigma'_{1-t}}{\sigma_{1-t}}(x - x_1)$ +- $u_t(x \mid x_1) = -\frac{\sigma'_{1-t}}{\sigma_{1-t}}(x - x_1)$ ### (conditional) OT vf - $\mu_t = t x_1$ and $\sigma_t = 1 - t = (1 - (1 - \sigma_{\min})t$ -- $u_t(x|x_1) = \frac{1}{1 - t}(x_1 - x) = \frac{1}{1 - (1 - \sigma_\min)t}(x_1 - (1 - \sigma_\min)x)$ +- $u_t(x \mid x_1) = \frac{1}{1 - t}(x_1 - x) = \frac{1}{1 - (1 - \sigma_\min)t}(x_1 - (1 - \sigma_\min)x)$
Does not guarantee that the _marginal_ vector field is the OT map!
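Once a vector field is available, whether the learned marginal $u_\theta$ or a conditional field for a fixed $x_1$, generation amounts to integrating the ODE $\frac{\dd x_t}{\dd t} = u_t(x_t)$ from $t = 0$ to $t = 1$. A minimal fixed-step Euler sketch (in practice one would typically reach for an adaptive or higher-order solver; `euler_sample` is an illustrative name):

```python
import numpy as np

def euler_sample(u, x0, n_steps=100):
    # Integrate dx/dt = u(t, x) from t = 0 to t = 1 with fixed Euler steps,
    # starting from a reference sample x0 ~ p_0.  `u` can be a trained u_theta or,
    # for a single conditioning point x1, the conditional field u_t(x | x1).
    x = np.asarray(x0, dtype=float)
    dt = 1.0 / n_steps
    for k in range(n_steps):
        x = x + dt * u(k * dt, x)
    return x
```

For instance, plugging in the conditional OT field `lambda t, x: (x1 - x) / (1 - t)` transports $x_0$ exactly along the straight line to $x_1$ (the field is only ever evaluated at $t \le 1 - \dd{t}$, so the $1/(1 - t)$ factor stays finite).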
@@ -1561,7 +1561,7 @@ More generally, samples from the reference distribution which are arbitrarily cl {% include image.html name="Figure 21" - alt="Variance of conditional vector field over $p_{1|t}$ for both blue and red trajectories for \eqref{eq:mog2mog}." + alt="Variance of conditional vector field over $p_{1 \mid t}$ for both blue and red trajectories for \eqref{eq:mog2mog}." ref="variance_cond_vector_field.png" src="flow-matching/variance_cond_vector_field.png" width=400 @@ -1595,13 +1595,13 @@ There are two immediate consequences:
![forward_samples](https://hackmd.io/_uploads/SyG9OF_IT.png) -*Interpolation path $\phi(x_t|x_0,x_1)$*. +*Interpolation path $\phi(x_t \mid x_0,x_1)$*.
![forward_samples](https://hackmd.io/_uploads/SJusOFuL6.png) -*Inference path $\phi(x_t|x_0)$*. +*Inference path $\phi(x_t \mid x_0)$*.
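The interpolation path in the figure above takes a particularly simple form once we condition on both endpoints, which is exactly the two-sided construction discussed next: $x_t = (1 - t) x_0 + t x_1$, with conditional target $u_t(x_t \mid x_0, x_1) = x_1 - x_0$. A small sketch of how such a training batch could be formed (the straight-line interpolant is one common choice, not the only one; `two_sided_batch` is an illustrative name):

```python
import numpy as np

def two_sided_batch(x0, x1, rng=np.random.default_rng(0)):
    # Two-sided conditioning z = (x0, x1): interpolate x_t = (1 - t) x0 + t x1 and
    # regress u_theta(t, x_t) onto the conditional target u_t(x_t | x0, x1) = x1 - x0.
    t = rng.random((x0.shape[0], 1))
    xt = (1.0 - t) * x0 + t * x1
    target = x1 - x0
    return t, xt, target
```

Here $x_0$ can come from any reference distribution one can sample from, which is one of the advantages mentioned below.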
@@ -1631,7 +1631,7 @@ p_t(x_t) = \int p_t(x_t \mid z) q(z) \dd{z} = \int p_t(x_t \mid x_1) q(x_1) \dd{ $$ e.g. -$$p(x_t | x_1) = \mathcal{N}(x_t \mid x_1, (1-t)^2)$$. +$$p(x_t \mid x_1) = \mathcal{N}(x_t \mid x_1, (1-t)^2)$$. @@ -1655,7 +1655,7 @@ $$p(x_t \mid x_1) = \mathcal{N}(x_t \mid x_1, (1-t)^2)$$. Yet, more generally, we can consider conditioning and marginalising over latent variables $z$, and minimising the following loss: $$ -\mathcal{L}_{\mathrm{CFM}}(\theta) = \mathbb{E}_{(t,z,x_t) \sim \mathcal{U}[0,1] q(z) p(\cdot|z)}[\| u_\theta(t, x_t) - u_t(x_t|z)\|^2]. +\mathcal{L}_{\mathrm{CFM}}(\theta) = \mathbb{E}_{(t,z,x_t) \sim \mathcal{U}[0,1] q(z) p(\cdot \mid z)}[\| u_\theta(t, x_t) - u_t(x_t \mid z)\|^2]. $$ As suggested in Liu et al. (2023), Tong et al. (2023), Albergo & Vanden-Eijnden (2022) and Pooladian et al. (2023), one can condition on *both* endpoints $z=(x_1, x_0)$ of the process, referred to as *two-sided conditioning*. The marginal probability path is defined as: @@ -1693,7 +1693,7 @@ One main advantage being that this allows for non Gaussian reference distributio Choosing a standard normal as noise distribution $q(x_0) = \mathcal{N}(0, \mathrm{I})$, we recover the same _one-sided_ conditional probability path as earlier: $$ -p(x_t \mid x_1) = \int p(x_t \mid x_0, x_1) q(x_0) \dd{x_0} = \mathcal{N}(x_t|tx_1, (1-t)^2). +p(x_t \mid x_1) = \int p(x_t \mid x_0, x_1) q(x_0) \dd{x_0} = \mathcal{N}(x_t \mid tx_1, (1-t)^2). $$ @@ -1949,9 +1949,9 @@ We deeply thank Michael Albergo, Valentin Debortoli and James Thornton for givin [^chainrule]: The property $\phi \circ \phi^{-1} = \Id$ implies, by the chain rule, $$ - \begin{equation} + \begin{equation*} \pdv{\phi}{x} \bigg|_{x = \phi^{-1}(y)} \pdv{\phi^{-1}}{y} \bigg|_{y} = \Id \iff \pdv{\phi}{x} \bigg|_{x = \phi^{-1}(y)} = \bigg( \pdv{\phi^{-1}}{y} \bigg|_{y} \bigg)^{-1} \quad \forall y \in \mathbb{R}^d - \end{equation} + \end{equation*} $$ [^jac_structure]: **Autoregressive** (Papamakarios et al., 2018; Huang et al., 2018) **Low rank residual** (Van Den Berg et al., 2018) Another approach is to construct a flow via a residual connection: $$ - \begin{equation} + \begin{equation*} \phi(x) = x + A h(B x + b) - \end{equation} + \end{equation*} $$ with parameters $A \in \R^{d\times m}$, $B \in \R^{m\times d}$ and $b \in \R^m$. Leveraging Sylvester's determinant identity $\det(I_d + AB)=\det(I_m + BA)$, the determinant computation can be reduced to one of an $m \times m$ matrix, which is advantageous if $m \ll d$. @@ -1971,11 +1971,11 @@ [^log_pdf]: Expanding the divergence in the _transport equation_ we have: $$ - \begin{equation} + \begin{equation*} \frac{\partial}{\partial_t} p_t(x_t) = - (\nabla \cdot (u_t p_t))(x_t) = - p_t(x_t) (\nabla \cdot u_t)(x_t) - \langle \nabla_{x_t} p_t(x_t), u_t(x_t) \rangle. 
- \end{equation} + \end{equation*} $$ Yet since $x_t$ also depends on $t$, to get the _total derivative_ we have $$ 
 Hence, $\frac{\dd}{\dd t} \log p_t(x_t) = \frac{1}{p_t(x_t)} \frac{\dd}{\dd t} p_t(x_t) = - (\nabla \cdot u_t)(x_t).$ [^CFM]: Developing the square in both losses we get: - $$\|u_\theta(t, x) - u_t(x|x_1)\|^2 = \|u_\theta(t, x)\|^2 + \|u_t(x|x_1)\|^2 - 2 \langle u_\theta(t, x), u_t(x|x_1) \rangle,$$ + $$\|u_\theta(t, x) - u_t(x \mid x_1)\|^2 = \|u_\theta(t, x)\|^2 + \|u_t(x \mid x_1)\|^2 - 2 \langle u_\theta(t, x), u_t(x \mid x_1) \rangle,$$ and $$\|u_\theta(t, x) - u_t(x)\|^2 = \|u_\theta(t, x)\|^2 + \|u_t(x)\|^2 - 2 \langle u_\theta(t, x), u_t(x) \rangle.$$ Taking the expectation over the last inner product term: $$ \begin{align} \mathbb{E}_{x \sim p_t} ~\langle u_\theta(t, x), u_t(x) \rangle - &= \int \langle u_\theta(t, x), \int u_t(x|x_1) \frac{p_t(x|x_1)q(x_1)}{p_t(x)} dx_1 \rangle p_t(x) \dd{x} \\ - &= \int \langle u_\theta(t, x), \int u_t(x|x_1) p_t(x|x_1)q(x_1) dx_1 \rangle \dd{x} \\ - &= \int \int \langle u_\theta(t, x), u_t(x|x_1) \rangle p_t(x|x_1)q(x_1) dx_1 \dd{x} \\ - &= \mathbb{E}_{q_1(x_1) p(x|x_1)} ~\langle u_\theta(t, x), u_t(x|x_1) \rangle. + &= \int \langle u_\theta(t, x), \int u_t(x \mid x_1) \frac{p_t(x \mid x_1)q(x_1)}{p_t(x)} dx_1 \rangle p_t(x) \dd{x} \\ + &= \int \langle u_\theta(t, x), \int u_t(x \mid x_1) p_t(x \mid x_1)q(x_1) dx_1 \rangle \dd{x} \\ + &= \int \int \langle u_\theta(t, x), u_t(x \mid x_1) \rangle p_t(x \mid x_1)q(x_1) dx_1 \dd{x} \\ + &= \mathbb{E}_{q_1(x_1) p(x \mid x_1)} ~\langle u_\theta(t, x), u_t(x \mid x_1) \rangle. \end{align} $$ Then we see that the neural network squared norm terms are equal since: $$ \mathbb{E}_{p_t} \|u_\theta(t, x)\|^2 = \int \|u_\theta(t, x)\|^2 p_t(x \mid x_1) q(x_1) \dd{x} \dd{x_1} = \mathbb{E}_{q_1(x_1) p(x \mid x_1)} \|u_\theta(t, x)\|^2 $$ [^ODE_conditions]: A sufficient condition for $\phi_t$ to be invertible is for $u_t$ to be continuous in $t$ and Lipschitz in $x$, by the Picard–Lindelöf theorem.
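Tying the footnotes back to the identity $\frac{\dd}{\dd t} \log p_t(x_t) = - (\nabla \cdot u_t)(x_t)$: the model's log-likelihood can be estimated by integrating the ODE backwards in time while accumulating the divergence. A rough sketch for low-dimensional $x$ using finite-difference divergences (in higher dimensions one would typically use automatic differentiation or a Hutchinson-style trace estimator; all names here are illustrative):

```python
import numpy as np

def log_likelihood(u, x1, n_steps=200, eps=1e-4):
    # Estimate log p_1(x1) = log p_0(x0) - ∫_0^1 (div u_t)(x_t) dt by integrating
    # dx/dt = u(t, x) backwards from t = 1 to t = 0 with Euler steps, accumulating
    # the divergence (here via central finite differences) along the way.
    d = len(x1)
    dt = 1.0 / n_steps
    x = np.asarray(x1, dtype=float)
    div_int = 0.0
    for k in range(n_steps, 0, -1):
        t = k * dt
        div = sum((u(t, x + eps * e)[i] - u(t, x - eps * e)[i]) / (2 * eps)
                  for i, e in enumerate(np.eye(d)))
        div_int += dt * div
        x = x - dt * u(t, x)  # Euler step backwards in time
    log_p0 = -0.5 * float(x @ x) - 0.5 * d * np.log(2 * np.pi)  # reference p_0 = N(0, I)
    return log_p0 - div_int
```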