From e6cacae53e301216c9b059e3392cd4bc2e0607ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Mon, 16 Dec 2024 20:17:55 -0600
Subject: [PATCH] enh: migrate theta to c++ (#955)

---
 include/statsforecast/nelder_mead.h |  17 +-
 nbs/src/theta.ipynb                 | 845 +++++++++-------------------
 python/statsforecast/_modidx.py     |  13 +-
 python/statsforecast/theta.py       | 542 ++++--------------
 src/statsforecast.cpp               |  18 +-
 src/theta.cpp                       | 195 +++++++
 6 files changed, 572 insertions(+), 1058 deletions(-)
 create mode 100644 src/theta.cpp
diff --git a/include/statsforecast/nelder_mead.h b/include/statsforecast/nelder_mead.h
index e79b41314..513da7171 100644
--- a/include/statsforecast/nelder_mead.h
+++ b/include/statsforecast/nelder_mead.h
@@ -8,15 +8,16 @@ using Eigen::VectorXd;
 using RowMajorMatrixXd =
     Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
 
-auto Clamp(const VectorXd &x, const VectorXd &lower, const VectorXd &upper) {
+inline auto Clamp(const VectorXd &x, const VectorXd &lower,
+                  const VectorXd &upper) {
   return x.cwiseMax(lower).cwiseMin(upper);
 }
 
-double StandardDeviation(const VectorXd &x) {
+inline double StandardDeviation(const VectorXd &x) {
   return std::sqrt((x.array() - x.mean()).square().mean());
 }
 
-Eigen::VectorX<Eigen::Index> ArgSort(const VectorXd &v) {
+inline Eigen::VectorX<Eigen::Index> ArgSort(const VectorXd &v) {
   Eigen::VectorX<Eigen::Index> indices(v.size());
   std::iota(indices.begin(), indices.end(), 0);
   std::sort(indices.begin(), indices.end(),
@@ -25,11 +26,11 @@ Eigen::VectorX<Eigen::Index> ArgSort(const VectorXd &v) {
 }
 
 template <typename Func, typename... Args>
-std::tuple<VectorXd, double, int> NelderMead(Func F, const VectorXd &x, const VectorXd &lower,
-                         const VectorXd upper, double init_step,
-                         double zero_pert, double alpha, double gamma,
-                         double rho, double sigma, int max_iter, double tol_std,
-                         bool adaptive, Args &&...args) {
+std::tuple<VectorXd, double, int>
+NelderMead(Func F, const VectorXd &x, const VectorXd &lower,
+           const VectorXd upper, double init_step, double zero_pert,
+           double alpha, double gamma, double rho, double sigma, int max_iter,
+           double tol_std, bool adaptive, Args &&...args) {
   auto x0 = Clamp(x, lower, upper);
   auto n = x0.size();
   if (adaptive) {
diff --git a/nbs/src/theta.ipynb b/nbs/src/theta.ipynb
index f9bc07259..bd86b92b4 100644
--- a/nbs/src/theta.ipynb
+++ b/nbs/src/theta.ipynb
@@ -19,22 +19,15 @@
    "source": [
     "#| export\n",
     "import math\n",
-    "from typing import Tuple\n",
     "\n",
     "import numpy as np\n",
-    "from numba import njit\n",
     "from scipy.stats import norm\n",
     "from statsmodels.tsa.seasonal import seasonal_decompose\n",
     "from statsmodels.tsa.stattools import acf\n",
     "\n",
-    "from statsforecast.utils import (\n",
-    "    CACHE,\n",
-    "    NOGIL,\n",
-    "    _repeat_val_seas,\n",
-    "    _seasonal_naive,\n",
-    "    restrict_to_bounds,\n",
-    "    results,\n",
-    ")"
+    "from statsforecast._lib import theta as _theta\n",
+    "from statsforecast.arima import is_constant\n",
+    "from statsforecast.utils import _repeat_val_seas, _seasonal_naive, results"
    ]
   },
   {
@@ -45,33 +38,6 @@
     "# Theta Model"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "03e43d2e-6341-4efd-81d6-b4ee8cf268e4",
-   "metadata": {},
-   "source": [
-    "## thetacalc"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1b54e61d-3ddc-44c4-9b6e-6b3bd2aa3bd7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "# Global variables \n",
-    "STM = 0\n",
-    "OTM = 1\n",
-    "DSTM = 2\n",
-    "DOTM = 3\n",
-    "TOL = 1.0e-10\n",
-    "HUGEN = 1.0e10\n",
-    "NA = -99999.0\n",
-    "smalno = np.finfo(float).eps"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -80,44 +46,13 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "from fastcore.test import test_eq\n",
     "from statsforecast.utils import AirPassengers as ap"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b45ce636-f541-4f8b-bb6d-b27a6586d65f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def initstate(y, modeltype, initial_smoothed, alpha, theta):\n",
-    "    states = np.zeros((1, 5), dtype=np.float32)\n",
-    "    states[0, 0] = alpha * y[0] + (1 - alpha) * initial_smoothed # level\n",
-    "    states[0, 1] = y[0] #mean y\n",
-    "    if modeltype in [DSTM, DOTM]:\n",
-    "        # dynamic models\n",
-    "        states[0, 2] = y[0] # An\n",
-    "        states[0, 3] = 0 # Bn\n",
-    "        states[0, 4] = y[0] # mu\n",
-    "    else:\n",
-    "        # nodynamic models\n",
-    "        n = len(y)\n",
-    "        Bn = 6 * (2 * np.mean(np.arange(1, n + 1) * y) - (1 + n) * np.mean(y)) / ( n ** 2 - 1)\n",
-    "        An = np.mean(y) - (n + 1) * Bn / 2\n",
-    "        states[0, 2] = An\n",
-    "        states[0, 3] = Bn\n",
-    "        states[0, 4] = initial_smoothed + (1 - 1 / theta) * (An + Bn)\n",
-    "        \n",
-    "    return states"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "04033d29-19eb-4006-8886-79a701deaf8b",
+   "id": "7c3e561e-cffc-4d21-ab04-f142c82b84a1",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -125,150 +60,16 @@
     "initial_smoothed = ap[0] / 2\n",
     "alpha = 0.5\n",
     "theta = 2\n",
-    "initstate(ap, modeltype=STM, initial_smoothed=initial_smoothed, alpha=alpha, theta=theta)\n",
-    "initstate(ap, modeltype=OTM, initial_smoothed=initial_smoothed, alpha=alpha, theta=theta)\n",
-    "initstate(ap, modeltype=DSTM, initial_smoothed=initial_smoothed, alpha=alpha, theta=theta)\n",
-    "initstate(ap, modeltype=DOTM, initial_smoothed=initial_smoothed, alpha=alpha, theta=theta)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "467c7ef6-267d-4d6a-9268-55a0c1ca24f6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def thetacalc(y: np.ndarray,\n",
-    "              states: np.ndarray, # states\n",
-    "              modeltype: int, \n",
-    "              initial_smoothed: float, \n",
-    "              alpha: float,\n",
-    "              theta: float, \n",
-    "              e: np.ndarray, \n",
-    "              amse: np.ndarray, \n",
-    "              nmse: int) -> float:\n",
-    "    denom = np.zeros(nmse)\n",
-    "    f = np.zeros(nmse)\n",
-    "    # update first state\n",
-    "    states[0, :] = initstate(y=y, modeltype=modeltype, \n",
-    "                             initial_smoothed=initial_smoothed, \n",
-    "                             alpha=alpha, theta=theta) \n",
-    "    \n",
-    "    amse[:nmse] = 0.\n",
-    "    e[0] = y[0] - states[0, 4]\n",
-    "    n = len(y)\n",
-    "    for i in range(1, n):\n",
-    "        # one step forecast \n",
-    "        thetafcst(states=states, i=i, modeltype=modeltype, f=f, h=nmse, alpha=alpha, theta=theta)\n",
-    "        if math.fabs(f[0] - NA) < TOL:\n",
-    "            mse = NA\n",
-    "            return mse\n",
-    "        e[i] = y[i] - f[0]\n",
-    "        for j in range(nmse):\n",
-    "            if (i + j) < n:\n",
-    "                denom[j] += 1.\n",
-    "                tmp = y[i + j] - f[j]\n",
-    "                amse[j] = (amse[j] * (denom[j] - 1.0) + (tmp * tmp)) / denom[j]\n",
-    "        # update state\n",
-    "        thetaupdate(states=states, i=i, modeltype=modeltype, \n",
-    "                    alpha=alpha, theta=theta, y=y[i], usemu=0)\n",
-    "    mean_y = np.mean(np.abs(y))\n",
-    "    if math.fabs(mean_y - 0.) < TOL:\n",
-    "        mean_y = TOL\n",
-    "    mse = np.sum(e[3:] ** 2) / mean_y\n",
-    "    return mse"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e2d34a68-2641-4eeb-b025-17f07472bbdc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def thetafcst(states, i, \n",
-    "              modeltype, \n",
-    "              f, h, \n",
-    "              alpha, theta):\n",
-    "    # obs:\n",
-    "    # forecast are obtained in a recursive manner\n",
-    "    # this is not standard, for example in ets\n",
-    "    #forecasts\n",
-    "    new_states = np.zeros((i + h, states.shape[1]), dtype=np.float32)\n",
-    "    new_states[:i] = states[:i]\n",
-    "    for i_h in range(h):\n",
-    "        thetaupdate(states=new_states, i=i + i_h, modeltype=modeltype, \n",
-    "                    alpha=alpha, theta=theta, y=0, usemu=1)\n",
-    "        f[i_h] = new_states[i + i_h, 4]  # mu is the forecast"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5a88b4c7-fad0-489f-b7c3-eb0e6db34f12",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def thetaupdate(states, i,\n",
-    "                modeltype, # kind of model \n",
-    "                alpha, theta,\n",
-    "                y, usemu):\n",
-    "    # states\n",
-    "    # level, meany, An, Bn, mu\n",
-    "    # get params\n",
-    "    level = states[i - 1, 0]\n",
-    "    meany = states[i - 1, 1]\n",
-    "    An = states[i - 1, 2]\n",
-    "    Bn = states[i - 1, 3]\n",
-    "    # update mu\n",
-    "    states[i, 4] = level + (1 - 1 / theta) * (An * ((1 - alpha) ** i) + Bn * (1 - (1 - alpha)**(i + 1)) / alpha)\n",
-    "    if usemu:\n",
-    "        y = states[i, 4]\n",
-    "    # update level\n",
-    "    states[i, 0] = alpha * y + (1 - alpha) * level\n",
-    "    # update meany\n",
-    "    states[i, 1] = (i * meany + y) / (i + 1)\n",
-    "    # update Bn and An\n",
-    "    if modeltype in [DSTM, DOTM]:\n",
-    "        # dynamic models\n",
-    "        states[i, 3] = ((i - 1) * Bn + 6 * (y - meany) / (i + 1)) / (i + 2)\n",
-    "        states[i, 2] = states[i, 1] - states[i, 3] * (i + 2) / 2\n",
-    "    else:\n",
-    "        states[i, 2] = An\n",
-    "        states[i, 3] = Bn\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6ff6588c-b9fd-4892-bf66-cac0ebe4a47c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def thetaforecast(states, n, modeltype, \n",
-    "                  f, h, alpha, theta):\n",
-    "    # compute forecasts\n",
-    "    new_states = thetafcst(\n",
-    "        states=states, i=n, modeltype=modeltype, \n",
-    "        f=f, h=h, \n",
-    "        alpha=alpha,\n",
-    "        theta=theta\n",
-    "    ) \n",
-    "    return new_states"
+    "_theta.init_state(ap, _theta.ModelType.STM, initial_smoothed, alpha, theta)\n",
+    "_theta.init_state(ap, _theta.ModelType.OTM, initial_smoothed, alpha, theta)\n",
+    "_theta.init_state(ap, _theta.ModelType.DSTM, initial_smoothed, alpha, theta)\n",
+    "_theta.init_state(ap, _theta.ModelType.DOTM, initial_smoothed, alpha, theta)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2585519c-6c14-45d3-9b28-ac4ef984d55e",
+   "id": "e92decc0-aa29-46ff-bf09-a6f7b2d8fa9e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -280,19 +81,24 @@
     "initial_smoothed = ap[0] / 2\n",
     "alpha = 0.5\n",
     "theta = 2.\n",
-    "init_states = np.zeros((len(ap), 5), dtype=np.float32)\n",
-    "mse = thetacalc(\n",
-    "    y=ap,\n",
-    "    states=init_states, \n",
-    "    modeltype=STM, \n",
-    "    initial_smoothed=initial_smoothed, alpha=alpha, theta=theta,\n",
-    "    e=e_, amse=amse_, nmse=3\n",
+    "init_states = np.zeros((len(ap), 5))\n",
+    "mse = _theta.calc(\n",
+    "    ap,\n",
+    "    init_states, \n",
+    "    _theta.ModelType.STM, \n",
+    "    initial_smoothed,\n",
+    "    alpha,\n",
+    "    theta,\n",
+    "    e_,\n",
+    "    amse_,\n",
+    "    3,\n",
     ")\n",
     "#verify we recover the fitted values\n",
     "np.testing.assert_array_equal(\n",
     "    ap - e_,\n",
     "    init_states[:, -1]\n",
     ")\n",
+    "\n",
     "#verify we get same fitted values than R\n",
     "# use stm(AirPassengers, s=F, estimation=F, h = 12)\n",
     "# to recover\n",
@@ -301,18 +107,20 @@
     "    np.array([101.1550, 107.9061, 449.1692]), \n",
     "    decimal=2\n",
     ")\n",
+    "\n",
     "# recover mse\n",
-    "test_eq(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
+    "assert math.isclose(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
     "\n",
     "# test forecasts\n",
     "h = 5\n",
-    "fcsts = np.zeros(h, dtype=np.float32)\n",
-    "thetaforecast(\n",
-    "    states=init_states, n=len(ap), \n",
-    "    modeltype=STM, \n",
-    "    f=fcsts, h=h, \n",
-    "    alpha=alpha,\n",
-    "    theta=theta\n",
+    "fcsts = np.zeros(h)\n",
+    "_theta.forecast(\n",
+    "    init_states,\n",
+    "    len(ap), \n",
+    "    _theta.ModelType.STM, \n",
+    "    fcsts,\n",
+    "    alpha,\n",
+    "    theta,\n",
     ")\n",
     "# test same forecast than R's\n",
     "np.testing.assert_array_almost_equal(\n",
@@ -325,7 +133,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b4aa2268-6181-4870-813d-ef26e9c75605",
+   "id": "600c15d8-a49c-493e-9753-a8f59bfc84b3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -337,13 +145,17 @@
     "initial_smoothed = ap[0] / 2\n",
     "alpha = 0.5\n",
     "theta = 2.\n",
-    "init_states = np.zeros((len(ap), 5), dtype=np.float32)\n",
-    "mse = thetacalc(\n",
-    "    y=ap,\n",
-    "    states=init_states, \n",
-    "    modeltype=OTM, \n",
-    "    initial_smoothed=initial_smoothed, alpha=alpha, theta=theta,\n",
-    "    e=e_, amse=amse_, nmse=3\n",
+    "init_states = np.zeros((len(ap), 5))\n",
+    "mse = _theta.calc(\n",
+    "    ap,\n",
+    "    init_states, \n",
+    "    _theta.ModelType.OTM, \n",
+    "    initial_smoothed,\n",
+    "    alpha,\n",
+    "    theta,\n",
+    "    e_,\n",
+    "    amse_,\n",
+    "    3,\n",
     ")\n",
     "#verify we recover the fitted values\n",
     "np.testing.assert_array_equal(\n",
@@ -359,17 +171,18 @@
     "    decimal=2\n",
     ")\n",
     "# recover mse\n",
-    "test_eq(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
+    "assert math.isclose(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
     "\n",
     "# test forecasts\n",
     "h = 5\n",
-    "fcsts = np.zeros(h, dtype=np.float32)\n",
-    "thetaforecast(\n",
-    "    states=init_states, n=len(ap), \n",
-    "    modeltype=OTM, \n",
-    "    f=fcsts, h=h, \n",
-    "    alpha=alpha,\n",
-    "    theta=theta\n",
+    "fcsts = np.zeros(h)\n",
+    "_theta.forecast(\n",
+    "    init_states,\n",
+    "    len(ap), \n",
+    "    _theta.ModelType.OTM, \n",
+    "    fcsts,\n",
+    "    alpha,\n",
+    "    theta,\n",
     ")\n",
     "# test same forecast than R's\n",
     "np.testing.assert_array_almost_equal(\n",
@@ -382,7 +195,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e98a7f30-bffa-4f14-828b-af1da9fe5e47",
+   "id": "1ee39aac-0500-4207-b440-dd36807675a3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -394,13 +207,17 @@
     "initial_smoothed = ap[0] / 2\n",
     "alpha = 0.5\n",
     "theta = 2.\n",
-    "init_states = np.zeros((len(ap), 5), dtype=np.float32)\n",
-    "mse = thetacalc(\n",
-    "    y=ap,\n",
-    "    states=init_states, \n",
-    "    modeltype=DSTM, \n",
-    "    initial_smoothed=initial_smoothed, alpha=alpha, theta=theta,\n",
-    "    e=e_, amse=amse_, nmse=3\n",
+    "init_states = np.zeros((len(ap), 5))\n",
+    "mse = _theta.calc(\n",
+    "    ap,\n",
+    "    init_states, \n",
+    "    _theta.ModelType.DSTM, \n",
+    "    initial_smoothed,\n",
+    "    alpha,\n",
+    "    theta,\n",
+    "    e_,\n",
+    "    amse_,\n",
+    "    3,\n",
     ")\n",
     "#verify we recover the fitted values\n",
     "np.testing.assert_array_equal(\n",
@@ -416,17 +233,18 @@
     "    decimal=2\n",
     ")\n",
     "# recover mse\n",
-    "test_eq(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
+    "assert math.isclose(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
     "\n",
     "# test forecasts\n",
     "h = 5\n",
-    "fcsts = np.zeros(h, dtype=np.float32)\n",
-    "thetaforecast(\n",
-    "    states=init_states, n=len(ap), \n",
-    "    modeltype=DSTM, \n",
-    "    f=fcsts, h=h, \n",
-    "    alpha=alpha,\n",
-    "    theta=theta\n",
+    "fcsts = np.zeros(h)\n",
+    "_theta.forecast(\n",
+    "    init_states,\n",
+    "    len(ap), \n",
+    "    _theta.ModelType.DSTM, \n",
+    "    fcsts,\n",
+    "    alpha,\n",
+    "    theta\n",
     ")\n",
     "# test same forecast than R's\n",
     "np.testing.assert_array_almost_equal(\n",
@@ -439,7 +257,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3f69faca-9587-4b81-b58a-4a6b0e7e2378",
+   "id": "ce095f83-4a1d-4c60-a931-5e3f366833a0",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -451,13 +269,17 @@
     "initial_smoothed = ap[0] / 2\n",
     "alpha = 0.5\n",
     "theta = 2.\n",
-    "init_states = np.zeros((len(ap), 5), dtype=np.float32)\n",
-    "mse = thetacalc(\n",
-    "    y=ap,\n",
-    "    states=init_states, \n",
-    "    modeltype=DOTM, \n",
-    "    initial_smoothed=initial_smoothed, alpha=alpha, theta=theta,\n",
-    "    e=e_, amse=amse_, nmse=3\n",
+    "init_states = np.zeros((len(ap), 5))\n",
+    "mse = _theta.calc(\n",
+    "    ap,\n",
+    "    init_states, \n",
+    "    _theta.ModelType.DOTM, \n",
+    "    initial_smoothed,\n",
+    "    alpha,\n",
+    "    theta,\n",
+    "    e_,\n",
+    "    amse_,\n",
+    "    3\n",
     ")\n",
     "#verify we recover the fitted values\n",
     "np.testing.assert_array_equal(\n",
@@ -473,17 +295,18 @@
     "    decimal=2\n",
     ")\n",
     "# recover mse\n",
-    "test_eq(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
+    "assert math.isclose(np.sum(e_[3:] ** 2) / np.mean(np.abs(ap)), mse)\n",
     "\n",
     "# test forecasts\n",
     "h = 5\n",
-    "fcsts = np.zeros(h, dtype=np.float32)\n",
-    "thetaforecast(\n",
-    "    states=init_states, n=len(ap), \n",
-    "    modeltype=DOTM, \n",
-    "    f=fcsts, h=h, \n",
-    "    alpha=alpha,\n",
-    "    theta=theta\n",
+    "fcsts = np.zeros(h)\n",
+    "_theta.forecast(\n",
+    "    init_states,\n",
+    "    len(ap), \n",
+    "    _theta.ModelType.DOTM, \n",
+    "    fcsts,\n",
+    "    alpha,\n",
+    "    theta\n",
     ")\n",
     "# test same forecast than R's\n",
     "np.testing.assert_array_almost_equal(\n",
@@ -501,42 +324,50 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def initparamtheta(initial_smoothed: float, alpha: float, theta: float,\n",
-    "                   y: np.ndarray,\n",
-    "                   modeltype: str):\n",
-    "    if modeltype in ['STM', 'DSTM']:\n",
-    "        if np.isnan(initial_smoothed):\n",
+    "def initparamtheta(\n",
+    "    initial_smoothed: float,\n",
+    "    alpha: float,\n",
+    "    theta: float,\n",
+    "    y: np.ndarray,\n",
+    "    modeltype: _theta.ModelType,\n",
+    "):\n",
+    "    if modeltype in [_theta.ModelType.STM, _theta.ModelType.DSTM]:\n",
+    "        if math.isnan(initial_smoothed):\n",
     "            initial_smoothed = y[0] / 2\n",
-    "            optimize_level = 1\n",
+    "            optimize_level = True\n",
     "        else:\n",
-    "            optimize_level = 0\n",
-    "        if np.isnan(alpha):\n",
+    "            optimize_level = False\n",
+    "        if math.isnan(alpha):\n",
     "            alpha = 0.5\n",
-    "            optimize_alpha = 1\n",
+    "            optimize_alpha = True\n",
     "        else:\n",
-    "            optimize_alpha = 0\n",
-    "        theta = 2. # no optimize\n",
-    "        optimize_theta = 0\n",
-    "    elif modeltype in ['OTM', 'DOTM']:\n",
-    "        if np.isnan(initial_smoothed):\n",
+    "            optimize_alpha = False\n",
+    "        theta = 2.0 # no optimize\n",
+    "        optimize_theta = False\n",
+    "    else:\n",
+    "        if math.isnan(initial_smoothed):\n",
     "            initial_smoothed = y[0] / 2\n",
-    "            optimize_level = 1\n",
+    "            optimize_level = True\n",
     "        else:\n",
-    "            optimize_level = 0\n",
-    "        if np.isnan(alpha):\n",
+    "            optimize_level = False\n",
+    "        if math.isnan(alpha):\n",
     "            alpha = 0.5\n",
-    "            optimize_alpha = 1\n",
+    "            optimize_alpha = True\n",
     "        else:\n",
-    "            optimize_alpha = 0\n",
-    "        if np.isnan(theta):\n",
-    "            theta = 2.\n",
-    "            optimize_theta = 1\n",
+    "            optimize_alpha = False\n",
+    "        if math.isnan(theta):\n",
+    "            theta = 2.0\n",
+    "            optimize_theta = True\n",
     "        else:\n",
-    "            optimize_theta = 0\n",
-    "    return {'initial_smoothed': initial_smoothed, 'optimize_initial_smoothed': optimize_level,\n",
-    "            'alpha': alpha, 'optimize_alpha': optimize_alpha,\n",
-    "            'theta': theta, 'optimize_theta': optimize_theta}"
+    "            optimize_theta = False\n",
+    "    return {\n",
+    "        'initial_smoothed': initial_smoothed,\n",
+    "        'optimize_initial_smoothed': optimize_level,\n",
+    "        'alpha': alpha,\n",
+    "        'optimize_alpha': optimize_alpha,\n",
+    "        'theta': theta,\n",
+    "        'optimize_theta': optimize_theta,\n",
+    "    }"
    ]
   },
   {
@@ -547,28 +378,39 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "initparamtheta(initial_smoothed=np.nan, alpha=np.nan, theta=np.nan,\n",
-    "               y=ap,\n",
-    "               modeltype='DOTM')"
+    "initparamtheta(\n",
+    "    initial_smoothed=np.nan,\n",
+    "    alpha=np.nan,\n",
+    "    theta=np.nan,\n",
+    "    y=ap,\n",
+    "    modeltype=_theta.ModelType.DOTM,\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ca89edd3-1d38-4a8d-b931-cde5b8f192d8",
+   "id": "6c95a65d-0eff-4196-963d-648a8f1cf768",
    "metadata": {},
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def switch_theta(x: str):\n",
-    "    return {'STM': 0, 'OTM': 1, 'DSTM': 2, 'DOTM': 3}[x]"
+    "def switch_theta(model: str) -> _theta.ModelType:\n",
+    "    if model == 'STM':\n",
+    "        return _theta.ModelType.STM\n",
+    "    if model == 'OTM':\n",
+    "        return _theta.ModelType.OTM\n",
+    "    if model == 'DSTM':\n",
+    "        return _theta.ModelType.DSTM\n",
+    "    if model == 'DOTM':\n",
+    "        return _theta.ModelType.DOTM\n",
+    "    raise ValueError(f'Invalid model type: {model}.')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a3f204a3-15e4-4d37-a571-824f6855c9bd",
+   "id": "e5d7cb36-af32-4028-9be0-05274c7bcc00",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -576,209 +418,6 @@
     "switch_theta('STM')"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ad1e5d88-c650-4780-9926-41b58cf2710b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def pegelsresid_theta(y: np.ndarray, \n",
-    "                      modeltype: str, \n",
-    "                      initial_smoothed: float, alpha: float,\n",
-    "                      theta: float, \n",
-    "                      nmse: int):\n",
-    "    states = np.zeros((len(y), 5), dtype=np.float32)\n",
-    "    e = np.full_like(y, fill_value=np.nan)\n",
-    "    amse = np.full(nmse, fill_value=np.nan)\n",
-    "    mse = thetacalc(y=y, states=states, \n",
-    "                    modeltype=switch_theta(modeltype), \n",
-    "                    initial_smoothed=initial_smoothed, alpha=alpha, theta=theta, \n",
-    "                    e=e, amse=amse, nmse=nmse)\n",
-    "    if not np.isnan(mse):\n",
-    "        if np.abs(mse + 99999) < 1e-7:\n",
-    "            mse = np.nan\n",
-    "    return amse, e, states, mse"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6ffc4052-c1d0-4350-a55d-1482c164907c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| export\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def theta_target_fn(\n",
-    "        optimal_param,\n",
-    "        init_level,\n",
-    "        init_alpha,\n",
-    "        init_theta,\n",
-    "        opt_level,\n",
-    "        opt_alpha,\n",
-    "        opt_theta,\n",
-    "        y,\n",
-    "        modeltype,\n",
-    "        nmse\n",
-    "    ):\n",
-    "    states = np.zeros((len(y), 5), dtype=np.float32)\n",
-    "    j = 0\n",
-    "    if opt_level:\n",
-    "        level = optimal_param[j]\n",
-    "        j+=1\n",
-    "    else:\n",
-    "        level = init_level\n",
-    "        \n",
-    "    if opt_alpha:\n",
-    "        alpha = optimal_param[j]\n",
-    "        j+=1\n",
-    "    else:\n",
-    "        alpha = init_alpha\n",
-    "        \n",
-    "    if opt_theta:\n",
-    "        theta = optimal_param[j]\n",
-    "        j+=1\n",
-    "    else:\n",
-    "        theta = init_theta\n",
-    "        \n",
-    "    e = np.full_like(y, fill_value=np.nan)\n",
-    "    amse = np.full(nmse, fill_value=np.nan)\n",
-    "    mse = thetacalc(y=y, states=states, \n",
-    "                    modeltype=switch_theta(modeltype), \n",
-    "                    initial_smoothed=level, alpha=alpha, theta=theta, \n",
-    "                    e=e, amse=amse, nmse=nmse)\n",
-    "    if mse < -1e10: \n",
-    "        mse = -1e10 \n",
-    "    if math.isnan(mse): \n",
-    "        mse = -np.inf\n",
-    "    if math.fabs(mse + 99999) < 1e-7: \n",
-    "        mse = -np.inf\n",
-    "    return mse"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e11e1340-8cf9-4e2e-a76e-bcfb43885622",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def nelder_mead_theta(\n",
-    "        x0: np.ndarray, \n",
-    "        args: Tuple = (), \n",
-    "        lower: np.ndarray = np.empty(0), \n",
-    "        upper: np.ndarray = np.empty(0), \n",
-    "        init_step: float = 0.05,\n",
-    "        zero_pert: float = 0.0001,\n",
-    "        alpha: float = 1.,\n",
-    "        gamma: float = 2.,\n",
-    "        rho: float = 0.5,\n",
-    "        sigma: float = 0.5,\n",
-    "        max_iter: int = 2_000,\n",
-    "        tol_std: float = 1e-10,\n",
-    "        adaptive: bool = False,\n",
-    "    ):\n",
-    "    #We are trying to minimize the function fn(x, args)\n",
-    "    #with initial point x0.\n",
-    "    #Step 0:\n",
-    "    # get x1, ..., x_{n+1}\n",
-    "    # the original article suggested a simplex where an initial point is given as x0\n",
-    "    # with the others generated with a fixed step along each dimension in turn.\n",
-    "    bounds = len(lower) and len(upper)\n",
-    "    if bounds:\n",
-    "        x0 = restrict_to_bounds(x0, lower, upper)\n",
-    "    \n",
-    "    n = x0.size\n",
-    "    if adaptive:\n",
-    "        gamma = 1. + 2. / n\n",
-    "        rho = 0.75 - 1. / (2. * n)\n",
-    "        sigma = 1. - 1. / n\n",
-    "    simplex = np.full((n + 1, n), fill_value=np.nan, dtype=np.float64) #each row is x_j\n",
-    "    simplex[:] = x0\n",
-    "    # perturb simplex using `init_step`\n",
-    "    diag = np.copy(np.diag(simplex))\n",
-    "    diag[diag == 0.] = zero_pert\n",
-    "    diag[diag != 0.] *= (1 + init_step)\n",
-    "    np.fill_diagonal(simplex, diag)\n",
-    "    # restrict simplex to bounds if passed\n",
-    "    if bounds:\n",
-    "        for j in range(n + 1):\n",
-    "            simplex[j] = restrict_to_bounds(simplex[j], lower, upper)\n",
-    "    # array of the value of f\n",
-    "    f_simplex = np.full(n + 1, fill_value=np.nan)\n",
-    "    for j in range(n + 1):\n",
-    "        f_simplex[j] = theta_target_fn(simplex[j], *args)\n",
-    "    for it in range(max_iter):\n",
-    "        #Step1: order of f_simplex\n",
-    "        #print(simplex)\n",
-    "        #print(f_simplex)\n",
-    "        order_f = f_simplex.argsort()\n",
-    "        best_idx = order_f[0]\n",
-    "        worst_idx = order_f[-1]\n",
-    "        second_worst_idx = order_f[-2]\n",
-    "        #Check whether method should stop.\n",
-    "        if np.std(f_simplex) < tol_std:\n",
-    "            break\n",
-    "        #calculate centroid except argmax f_simplex\n",
-    "        x_o = simplex[np.delete(order_f, -1)].sum(axis=0) / n\n",
-    "        #Step2: Reflection, Compute reflected point\n",
-    "        x_r = x_o + alpha * (x_o - simplex[worst_idx])\n",
-    "        # restrict x_r to bounds if passed\n",
-    "        if bounds:\n",
-    "            x_r = restrict_to_bounds(x_r, lower, upper)\n",
-    "        f_r = theta_target_fn(x_r, *args)\n",
-    "        if f_simplex[best_idx] <= f_r < f_simplex[second_worst_idx]:\n",
-    "            simplex[worst_idx] = x_r\n",
-    "            f_simplex[worst_idx] = f_r\n",
-    "            continue\n",
-    "        #Step3: Expansion, reflected point is the best point so far\n",
-    "        if f_r < f_simplex[best_idx]:\n",
-    "            x_e = x_o + gamma * (x_r - x_o)\n",
-    "            # restrict x_e to bounds if passed\n",
-    "            if bounds:\n",
-    "                x_e = restrict_to_bounds(x_e, lower, upper)\n",
-    "            f_e = theta_target_fn(x_e, *args)\n",
-    "            if f_e < f_r:\n",
-    "                simplex[worst_idx] = x_e\n",
-    "                f_simplex[worst_idx] = f_e\n",
-    "            else:\n",
-    "                simplex[worst_idx] = x_r\n",
-    "                f_simplex[worst_idx] = f_r\n",
-    "            continue\n",
-    "        #Step4: outside Contraction\n",
-    "        if f_simplex[second_worst_idx] <= f_r < f_simplex[worst_idx]:\n",
-    "            x_oc = x_o + rho * (x_r - x_o)\n",
-    "            if bounds:\n",
-    "                x_oc = restrict_to_bounds(x_oc, lower, upper)\n",
-    "            f_oc = theta_target_fn(x_oc, *args)\n",
-    "            if f_oc <= f_r:\n",
-    "                simplex[worst_idx] = x_oc\n",
-    "                f_simplex[worst_idx] = f_oc\n",
-    "                continue\n",
-    "        #step 5 inside contraction\n",
-    "        else:\n",
-    "            x_ic = x_o - rho * (x_r - x_o)\n",
-    "            # restrict x_c to bounds if passed\n",
-    "            if bounds:\n",
-    "                x_ic = restrict_to_bounds(x_ic, lower, upper)\n",
-    "            f_ic = theta_target_fn(x_ic, *args)\n",
-    "            if f_ic < f_simplex[worst_idx]:\n",
-    "                simplex[worst_idx] = x_ic\n",
-    "                f_simplex[worst_idx] = f_ic\n",
-    "                continue\n",
-    "        #step 6: shrink\n",
-    "        simplex[np.delete(order_f, 0)] = simplex[best_idx] + sigma * (simplex[np.delete(order_f, 0)] - simplex[best_idx])\n",
-    "        for i in np.delete(order_f, 0):\n",
-    "            simplex[i] = restrict_to_bounds(simplex[i], lower, upper)\n",
-    "            f_simplex[i] = theta_target_fn(simplex[i], *args)\n",
-    "    return results(simplex[best_idx], f_simplex[best_idx], it + 1, simplex)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -788,14 +427,36 @@
    "source": [
     "#| exporti\n",
     "def optimize_theta_target_fn(\n",
-    "        init_par, optimize_params, y, \n",
-    "        modeltype, nmse\n",
-    "    ):\n",
-    "    x0 = [init_par[key] for key, val in optimize_params.items() if val]\n",
-    "    x0 = np.array(x0, dtype=np.float32)\n",
-    "    if not len(x0):\n",
+    "    init_par: dict[str, float],\n",
+    "    optimize_params: dict[str, bool],\n",
+    "    y: np.ndarray, \n",
+    "    modeltype: _theta.ModelType,\n",
+    "    nmse: int,\n",
+    "):\n",
+    "    x0 = []\n",
+    "    lower = []\n",
+    "    upper = []\n",
+    "    lower_bounds = {\n",
+    "        'initial_smoothed': -1e10,\n",
+    "        'alpha': 0.1,\n",
+    "        'theta': 1.0,\n",
+    "    }\n",
+    "    upper_bounds = {\n",
+    "        'initial_smoothed': 1e10,\n",
+    "        'alpha': 0.99,\n",
+    "        'theta': 1e10,\n",
+    "    }\n",
+    "    for param, optim in optimize_params.items():\n",
+    "        if optim:\n",
+    "            x0.append(init_par[param])\n",
+    "            lower.append(lower_bounds[param])\n",
+    "            upper.append(upper_bounds[param])\n",
+    "    if not x0:\n",
     "        return\n",
-    "    \n",
+    "    x0 = np.array(x0)\n",
+    "    lower = np.array(lower)\n",
+    "    upper = np.array(upper)\n",
+    "\n",
     "    init_level = init_par['initial_smoothed']\n",
     "    init_alpha = init_par['alpha']\n",
     "    init_theta = init_par['theta']\n",
@@ -803,50 +464,22 @@
     "    opt_level = optimize_params['initial_smoothed']\n",
     "    opt_alpha = optimize_params['alpha']\n",
     "    opt_theta = optimize_params['theta']\n",
-    "    \n",
-    "    res = nelder_mead_theta(\n",
-    "        x0, \n",
-    "        args=(\n",
-    "            init_level,\n",
-    "            init_alpha,\n",
-    "            init_theta,\n",
-    "            opt_level,\n",
-    "            opt_alpha,\n",
-    "            opt_theta,\n",
-    "            y,\n",
-    "            modeltype,\n",
-    "            nmse\n",
-    "        ),\n",
-    "        tol_std=1e-4, \n",
-    "        lower=np.array([-1e10, 0.1, 1.0]),\n",
-    "        upper=np.array([1e10, 0.99, 1e10]),\n",
-    "        max_iter=1_000,\n",
-    "        adaptive=True,\n",
+    "\n",
+    "    opt_res = _theta.optimize(\n",
+    "        x0,\n",
+    "        lower,\n",
+    "        upper,\n",
+    "        init_level,\n",
+    "        init_alpha,\n",
+    "        init_theta,\n",
+    "        opt_level,\n",
+    "        opt_alpha,\n",
+    "        opt_theta,\n",
+    "        y,\n",
+    "        modeltype,\n",
+    "        nmse\n",
     "    )\n",
-    "    return res"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f9172367-0b40-4e54-9fbb-aaad181d865e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| export\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def is_constant(x):\n",
-    "    return np.all(x[0] == x)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6ab84bc3-22be-4489-a140-dcd974313a94",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "is_constant(ap)"
+    "    return results(*opt_res, None)  # type: ignore"
    ]
   },
   {
@@ -858,26 +491,39 @@
    "source": [
     "#| exporti\n",
     "def thetamodel(\n",
-    "        y: np.ndarray, m: int, \n",
-    "        modeltype: str, \n",
-    "        initial_smoothed: float, alpha: float,\n",
-    "        theta: float, nmse: int\n",
-    "    ):\n",
+    "    y: np.ndarray,\n",
+    "    m: int, \n",
+    "    modeltype: str, \n",
+    "    initial_smoothed: float,\n",
+    "    alpha: float,\n",
+    "    theta: float,\n",
+    "    nmse: int\n",
+    "):\n",
+    "    y = y.astype(np.float64, copy=False)\n",
+    "    model_type = switch_theta(modeltype)\n",
     "    #initial parameters\n",
-    "    par = initparamtheta(initial_smoothed=initial_smoothed, \n",
-    "                         alpha=alpha, theta=theta, \n",
-    "                         y=y, modeltype=modeltype)\n",
+    "    par = initparamtheta(\n",
+    "        initial_smoothed=initial_smoothed, \n",
+    "        alpha=alpha,\n",
+    "        theta=theta, \n",
+    "        y=y,\n",
+    "        modeltype=model_type,\n",
+    "    )\n",
     "    optimize_params = {key.replace('optimize_', ''): val for key, val in par.items() if 'optim' in key}\n",
     "    par = {key: val for key, val in par.items() if 'optim' not in key}\n",
     "    # parameter optimization\n",
     "    fred = optimize_theta_target_fn(\n",
-    "        init_par=par, optimize_params=optimize_params, y=y, \n",
-    "        modeltype=modeltype, nmse=nmse\n",
+    "        init_par=par,\n",
+    "        optimize_params=optimize_params,\n",
+    "        y=y, \n",
+    "        modeltype=model_type,\n",
+    "        nmse=nmse,\n",
     "    )\n",
     "    if fred is not None:\n",
     "        fit_par = fred.x\n",
     "    j = 0\n",
     "    if optimize_params['initial_smoothed']:\n",
+    "        par['initial_smoothed'] = fit_par[j]\n",
     "        j += 1\n",
     "    if optimize_params['alpha']:\n",
     "        par['alpha'] = fit_par[j]\n",
@@ -886,11 +532,15 @@
     "        par['theta'] = fit_par[j]\n",
     "        j += 1\n",
     "    \n",
-    "    amse, e, states, mse = pegelsresid_theta(\n",
-    "        y=y, modeltype=modeltype,\n",
-    "        nmse=nmse, **par\n",
+    "    amse, e, states, mse = _theta.pegels_resid(\n",
+    "        y,\n",
+    "        model_type,\n",
+    "        par['initial_smoothed'],\n",
+    "        par['alpha'],\n",
+    "        par['theta'],\n",
+    "        nmse,\n",
     "    )\n",
-    "    \n",
+    "\n",
     "    return dict(mse=mse, amse=amse, fit=fred, residuals=e,\n",
     "                m=m, states=states, par=par, n=len(y), \n",
     "                modeltype=modeltype, mean_y=np.mean(y))"
@@ -905,7 +555,9 @@
    "source": [
     "#| hide\n",
     "res = thetamodel(\n",
-    "    y=ap, m=12, modeltype='STM',\n",
+    "    y=ap,\n",
+    "    m=12,\n",
+    "    modeltype='STM',\n",
     "    initial_smoothed=np.nan,\n",
     "    alpha=np.nan,\n",
     "    theta=np.nan, \n",
@@ -943,19 +595,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| exporti\n",
+    "#| export\n",
     "def forecast_theta(obj, h, level=None):\n",
-    "    forecast = np.full(h, fill_value=np.nan)\n",
+    "    forecast = np.empty(h)\n",
     "    n = obj['n']\n",
     "    states = obj['states']\n",
     "    alpha=obj['par']['alpha']\n",
     "    theta=obj['par']['theta']\n",
-    "    thetaforecast(\n",
-    "        states=states, n=n, modeltype=switch_theta(obj['modeltype']), \n",
-    "        h=h, f=forecast, alpha=alpha, theta=theta\n",
+    "    _theta.forecast(\n",
+    "        states,\n",
+    "        n,\n",
+    "        switch_theta(obj['modeltype']), \n",
+    "        forecast,\n",
+    "        alpha,\n",
+    "        theta,\n",
     "    )\n",
     "    res = {'mean': forecast}\n",
-    "    \n",
+    "\n",
     "    if level is not None:\n",
     "        sigma = np.std(obj['residuals'][3:], ddof=1)\n",
     "        mean_y = obj['mean_y']\n",
@@ -966,7 +622,7 @@
     "            max_q = min_q + lv / 100\n",
     "            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=1)\n",
     "            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=1)\n",
-    "            \n",
+    "\n",
     "    if obj.get('decompose', False):\n",
     "        seas_forecast = _repeat_val_seas(obj['seas_forecast']['mean'], h=h)\n",
     "        for key in res:\n",
@@ -994,14 +650,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| exporti\n",
+    "#| export\n",
     "def auto_theta(\n",
-    "        y, m, model=None, \n",
-    "        initial_smoothed=None, alpha=None, \n",
-    "        theta=None,\n",
-    "        nmse=3,\n",
-    "        decomposition_type='multiplicative'\n",
-    "    ):\n",
+    "    y,\n",
+    "    m,\n",
+    "    model=None, \n",
+    "    initial_smoothed=None,\n",
+    "    alpha=None, \n",
+    "    theta=None,\n",
+    "    nmse=3,\n",
+    "    decomposition_type='multiplicative'\n",
+    "):\n",
     "    # converting params to floats \n",
     "    # to improve numba compilation\n",
     "    if initial_smoothed is None:\n",
@@ -1041,7 +700,7 @@
     "    \n",
     "    # validate model\n",
     "    if model not in [None, 'STM', 'OTM', 'DSTM', 'DOTM']:\n",
-    "        raise ValueError('Invalid model type')\n",
+    "        raise ValueError(f'Invalid model type: {model}.')\n",
     "\n",
     "    n = len(y)\n",
     "    npars = 3 \n",
@@ -1064,7 +723,7 @@
     "                best_ic = fit_ic\n",
     "    if np.isinf(best_ic):\n",
     "        raise Exception('no model able to be fitted')\n",
-    "        \n",
+    "\n",
     "    if decompose:\n",
     "        if decomposition_type == 'multiplicative':\n",
     "            model['residuals'] = model['residuals'] * y_decompose\n",
@@ -1395,7 +1054,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| exporti\n",
+    "#| export\n",
     "def forward_theta(fitted_model, y):\n",
     "    m = fitted_model['m']\n",
     "    model = fitted_model['modeltype']\n",
@@ -1417,7 +1076,7 @@
    "source": [
     "#| hide\n",
     "res = auto_theta(ap, m=12)\n",
-    "test_eq(\n",
+    "np.testing.assert_allclose(\n",
     "    forecast_theta(forward_theta(res, ap), h=12)['mean'],\n",
     "    forecast_theta(res, h=12)['mean']\n",
     ")\n",
@@ -1425,7 +1084,7 @@
     "forecast_theta(forward_theta(res, inttermitent_series), h=12, level=[80,90])\n",
     "res_transfer = forward_theta(res, inttermitent_series)\n",
     "for key in res_transfer['par']:\n",
-    "    test_eq(res['par'][key], res_transfer['par'][key])"
+    "    assert res['par'][key] == res_transfer['par'][key]"
    ]
   }
  ],
diff --git a/python/statsforecast/_modidx.py b/python/statsforecast/_modidx.py
index 857925a4f..d51e60e23 100644
--- a/python/statsforecast/_modidx.py
+++ b/python/statsforecast/_modidx.py
@@ -763,21 +763,10 @@
                                      'statsforecast.theta.forecast_theta': ('src/theta.html#forecast_theta', 'statsforecast/theta.py'),
                                      'statsforecast.theta.forward_theta': ('src/theta.html#forward_theta', 'statsforecast/theta.py'),
                                      'statsforecast.theta.initparamtheta': ('src/theta.html#initparamtheta', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.initstate': ('src/theta.html#initstate', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.is_constant': ('src/theta.html#is_constant', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.nelder_mead_theta': ( 'src/theta.html#nelder_mead_theta',
-                                                                                'statsforecast/theta.py'),
                                      'statsforecast.theta.optimize_theta_target_fn': ( 'src/theta.html#optimize_theta_target_fn',
                                                                                        'statsforecast/theta.py'),
-                                     'statsforecast.theta.pegelsresid_theta': ( 'src/theta.html#pegelsresid_theta',
-                                                                                'statsforecast/theta.py'),
                                      'statsforecast.theta.switch_theta': ('src/theta.html#switch_theta', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.theta_target_fn': ('src/theta.html#theta_target_fn', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.thetacalc': ('src/theta.html#thetacalc', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.thetafcst': ('src/theta.html#thetafcst', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.thetaforecast': ('src/theta.html#thetaforecast', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.thetamodel': ('src/theta.html#thetamodel', 'statsforecast/theta.py'),
-                                     'statsforecast.theta.thetaupdate': ('src/theta.html#thetaupdate', 'statsforecast/theta.py')},
+                                     'statsforecast.theta.thetamodel': ('src/theta.html#thetamodel', 'statsforecast/theta.py')},
             'statsforecast.utils': { 'statsforecast.utils.ConformalIntervals': ( 'src/utils.html#conformalintervals',
                                                                                  'statsforecast/utils.py'),
                                      'statsforecast.utils.ConformalIntervals.__init__': ( 'src/utils.html#conformalintervals.__init__',
diff --git a/python/statsforecast/theta.py b/python/statsforecast/theta.py
index 9999ae6ef..545ed89b7 100644
--- a/python/statsforecast/theta.py
+++ b/python/statsforecast/theta.py
@@ -1,220 +1,57 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/src/theta.ipynb.
 
 # %% auto 0
-__all__ = ['theta_target_fn', 'is_constant']
+__all__ = ['forecast_theta', 'auto_theta', 'forward_theta']
 
 # %% ../../nbs/src/theta.ipynb 1
 import math
-from typing import Tuple
 
 import numpy as np
-from numba import njit
 from scipy.stats import norm
 from statsmodels.tsa.seasonal import seasonal_decompose
 from statsmodels.tsa.stattools import acf
 
-from statsforecast.utils import (
-    CACHE,
-    NOGIL,
-    _repeat_val_seas,
-    _seasonal_naive,
-    restrict_to_bounds,
-    results,
-)
+from ._lib import theta as _theta
+from .arima import is_constant
+from .utils import _repeat_val_seas, _seasonal_naive, results
 
-# %% ../../nbs/src/theta.ipynb 4
-# Global variables
-STM = 0
-OTM = 1
-DSTM = 2
-DOTM = 3
-TOL = 1.0e-10
-HUGEN = 1.0e10
-NA = -99999.0
-smalno = np.finfo(float).eps
-
-# %% ../../nbs/src/theta.ipynb 6
-@njit(nogil=NOGIL, cache=CACHE)
-def initstate(y, modeltype, initial_smoothed, alpha, theta):
-    states = np.zeros((1, 5), dtype=np.float32)
-    states[0, 0] = alpha * y[0] + (1 - alpha) * initial_smoothed  # level
-    states[0, 1] = y[0]  # mean y
-    if modeltype in [DSTM, DOTM]:
-        # dynamic models
-        states[0, 2] = y[0]  # An
-        states[0, 3] = 0  # Bn
-        states[0, 4] = y[0]  # mu
-    else:
-        # nodynamic models
-        n = len(y)
-        Bn = (
-            6
-            * (2 * np.mean(np.arange(1, n + 1) * y) - (1 + n) * np.mean(y))
-            / (n**2 - 1)
-        )
-        An = np.mean(y) - (n + 1) * Bn / 2
-        states[0, 2] = An
-        states[0, 3] = Bn
-        states[0, 4] = initial_smoothed + (1 - 1 / theta) * (An + Bn)
-
-    return states
-
-# %% ../../nbs/src/theta.ipynb 8
-@njit(nogil=NOGIL, cache=CACHE)
-def thetacalc(
-    y: np.ndarray,
-    states: np.ndarray,  # states
-    modeltype: int,
+# %% ../../nbs/src/theta.ipynb 9
+def initparamtheta(
     initial_smoothed: float,
     alpha: float,
     theta: float,
-    e: np.ndarray,
-    amse: np.ndarray,
-    nmse: int,
-) -> float:
-    denom = np.zeros(nmse)
-    f = np.zeros(nmse)
-    # update first state
-    states[0, :] = initstate(
-        y=y,
-        modeltype=modeltype,
-        initial_smoothed=initial_smoothed,
-        alpha=alpha,
-        theta=theta,
-    )
-
-    amse[:nmse] = 0.0
-    e[0] = y[0] - states[0, 4]
-    n = len(y)
-    for i in range(1, n):
-        # one step forecast
-        thetafcst(
-            states=states,
-            i=i,
-            modeltype=modeltype,
-            f=f,
-            h=nmse,
-            alpha=alpha,
-            theta=theta,
-        )
-        if math.fabs(f[0] - NA) < TOL:
-            mse = NA
-            return mse
-        e[i] = y[i] - f[0]
-        for j in range(nmse):
-            if (i + j) < n:
-                denom[j] += 1.0
-                tmp = y[i + j] - f[j]
-                amse[j] = (amse[j] * (denom[j] - 1.0) + (tmp * tmp)) / denom[j]
-        # update state
-        thetaupdate(
-            states=states,
-            i=i,
-            modeltype=modeltype,
-            alpha=alpha,
-            theta=theta,
-            y=y[i],
-            usemu=0,
-        )
-    mean_y = np.mean(np.abs(y))
-    if math.fabs(mean_y - 0.0) < TOL:
-        mean_y = TOL
-    mse = np.sum(e[3:] ** 2) / mean_y
-    return mse
-
-# %% ../../nbs/src/theta.ipynb 9
-@njit(nogil=NOGIL, cache=CACHE)
-def thetafcst(states, i, modeltype, f, h, alpha, theta):
-    # obs:
-    # forecast are obtained in a recursive manner
-    # this is not standard, for example in ets
-    # forecasts
-    new_states = np.zeros((i + h, states.shape[1]), dtype=np.float32)
-    new_states[:i] = states[:i]
-    for i_h in range(h):
-        thetaupdate(
-            states=new_states,
-            i=i + i_h,
-            modeltype=modeltype,
-            alpha=alpha,
-            theta=theta,
-            y=0,
-            usemu=1,
-        )
-        f[i_h] = new_states[i + i_h, 4]  # mu is the forecast
-
-# %% ../../nbs/src/theta.ipynb 10
-@njit(nogil=NOGIL, cache=CACHE)
-def thetaupdate(states, i, modeltype, alpha, theta, y, usemu):  # kind of model
-    # states
-    # level, meany, An, Bn, mu
-    # get params
-    level = states[i - 1, 0]
-    meany = states[i - 1, 1]
-    An = states[i - 1, 2]
-    Bn = states[i - 1, 3]
-    # update mu
-    states[i, 4] = level + (1 - 1 / theta) * (
-        An * ((1 - alpha) ** i) + Bn * (1 - (1 - alpha) ** (i + 1)) / alpha
-    )
-    if usemu:
-        y = states[i, 4]
-    # update level
-    states[i, 0] = alpha * y + (1 - alpha) * level
-    # update meany
-    states[i, 1] = (i * meany + y) / (i + 1)
-    # update Bn and An
-    if modeltype in [DSTM, DOTM]:
-        # dynamic models
-        states[i, 3] = ((i - 1) * Bn + 6 * (y - meany) / (i + 1)) / (i + 2)
-        states[i, 2] = states[i, 1] - states[i, 3] * (i + 2) / 2
-    else:
-        states[i, 2] = An
-        states[i, 3] = Bn
-
-# %% ../../nbs/src/theta.ipynb 11
-@njit(nogil=NOGIL, cache=CACHE)
-def thetaforecast(states, n, modeltype, f, h, alpha, theta):
-    # compute forecasts
-    new_states = thetafcst(
-        states=states, i=n, modeltype=modeltype, f=f, h=h, alpha=alpha, theta=theta
-    )
-    return new_states
-
-# %% ../../nbs/src/theta.ipynb 16
-@njit(nogil=NOGIL, cache=CACHE)
-def initparamtheta(
-    initial_smoothed: float, alpha: float, theta: float, y: np.ndarray, modeltype: str
+    y: np.ndarray,
+    modeltype: _theta.ModelType,
 ):
-    if modeltype in ["STM", "DSTM"]:
-        if np.isnan(initial_smoothed):
+    if modeltype in [_theta.ModelType.STM, _theta.ModelType.DSTM]:
+        if math.isnan(initial_smoothed):
             initial_smoothed = y[0] / 2
-            optimize_level = 1
+            optimize_level = True
         else:
-            optimize_level = 0
-        if np.isnan(alpha):
+            optimize_level = False
+        if math.isnan(alpha):
             alpha = 0.5
-            optimize_alpha = 1
+            optimize_alpha = True
         else:
-            optimize_alpha = 0
+            optimize_alpha = False
         theta = 2.0  # no optimize
-        optimize_theta = 0
-    elif modeltype in ["OTM", "DOTM"]:
-        if np.isnan(initial_smoothed):
+        optimize_theta = False
+    else:
+        if math.isnan(initial_smoothed):
             initial_smoothed = y[0] / 2
-            optimize_level = 1
+            optimize_level = True
         else:
-            optimize_level = 0
-        if np.isnan(alpha):
+            optimize_level = False
+        if math.isnan(alpha):
             alpha = 0.5
-            optimize_alpha = 1
+            optimize_alpha = True
         else:
-            optimize_alpha = 0
-        if np.isnan(theta):
+            optimize_alpha = False
+        if math.isnan(theta):
             theta = 2.0
-            optimize_theta = 1
+            optimize_theta = True
         else:
-            optimize_theta = 0
+            optimize_theta = False
     return {
         "initial_smoothed": initial_smoothed,
         "optimize_initial_smoothed": optimize_level,
@@ -224,217 +61,49 @@ def initparamtheta(
         "optimize_theta": optimize_theta,
     }
 
-# %% ../../nbs/src/theta.ipynb 18
-@njit(nogil=NOGIL, cache=CACHE)
-def switch_theta(x: str):
-    return {"STM": 0, "OTM": 1, "DSTM": 2, "DOTM": 3}[x]
-
-# %% ../../nbs/src/theta.ipynb 20
-@njit(nogil=NOGIL, cache=CACHE)
-def pegelsresid_theta(
+# %% ../../nbs/src/theta.ipynb 11
+def switch_theta(model: str) -> _theta.ModelType:
+    if model == "STM":
+        return _theta.ModelType.STM
+    if model == "OTM":
+        return _theta.ModelType.OTM
+    if model == "DSTM":
+        return _theta.ModelType.DSTM
+    if model == "DOTM":
+        return _theta.ModelType.DOTM
+    raise ValueError(f"Invalid model type: {model}.")
+
+# %% ../../nbs/src/theta.ipynb 13
+def optimize_theta_target_fn(
+    init_par: dict[str, float],
+    optimize_params: dict[str, bool],
     y: np.ndarray,
-    modeltype: str,
-    initial_smoothed: float,
-    alpha: float,
-    theta: float,
+    modeltype: _theta.ModelType,
     nmse: int,
 ):
-    states = np.zeros((len(y), 5), dtype=np.float32)
-    e = np.full_like(y, fill_value=np.nan)
-    amse = np.full(nmse, fill_value=np.nan)
-    mse = thetacalc(
-        y=y,
-        states=states,
-        modeltype=switch_theta(modeltype),
-        initial_smoothed=initial_smoothed,
-        alpha=alpha,
-        theta=theta,
-        e=e,
-        amse=amse,
-        nmse=nmse,
-    )
-    if not np.isnan(mse):
-        if np.abs(mse + 99999) < 1e-7:
-            mse = np.nan
-    return amse, e, states, mse
-
-# %% ../../nbs/src/theta.ipynb 21
-@njit(nogil=NOGIL, cache=CACHE)
-def theta_target_fn(
-    optimal_param,
-    init_level,
-    init_alpha,
-    init_theta,
-    opt_level,
-    opt_alpha,
-    opt_theta,
-    y,
-    modeltype,
-    nmse,
-):
-    states = np.zeros((len(y), 5), dtype=np.float32)
-    j = 0
-    if opt_level:
-        level = optimal_param[j]
-        j += 1
-    else:
-        level = init_level
-
-    if opt_alpha:
-        alpha = optimal_param[j]
-        j += 1
-    else:
-        alpha = init_alpha
-
-    if opt_theta:
-        theta = optimal_param[j]
-        j += 1
-    else:
-        theta = init_theta
-
-    e = np.full_like(y, fill_value=np.nan)
-    amse = np.full(nmse, fill_value=np.nan)
-    mse = thetacalc(
-        y=y,
-        states=states,
-        modeltype=switch_theta(modeltype),
-        initial_smoothed=level,
-        alpha=alpha,
-        theta=theta,
-        e=e,
-        amse=amse,
-        nmse=nmse,
-    )
-    if mse < -1e10:
-        mse = -1e10
-    if math.isnan(mse):
-        mse = -np.inf
-    if math.fabs(mse + 99999) < 1e-7:
-        mse = -np.inf
-    return mse
-
-# %% ../../nbs/src/theta.ipynb 22
-@njit(nogil=NOGIL, cache=CACHE)
-def nelder_mead_theta(
-    x0: np.ndarray,
-    args: Tuple = (),
-    lower: np.ndarray = np.empty(0),
-    upper: np.ndarray = np.empty(0),
-    init_step: float = 0.05,
-    zero_pert: float = 0.0001,
-    alpha: float = 1.0,
-    gamma: float = 2.0,
-    rho: float = 0.5,
-    sigma: float = 0.5,
-    max_iter: int = 2_000,
-    tol_std: float = 1e-10,
-    adaptive: bool = False,
-):
-    # We are trying to minimize the function fn(x, args)
-    # with initial point x0.
-    # Step 0:
-    # get x1, ..., x_{n+1}
-    # the original article suggested a simplex where an initial point is given as x0
-    # with the others generated with a fixed step along each dimension in turn.
-    bounds = len(lower) and len(upper)
-    if bounds:
-        x0 = restrict_to_bounds(x0, lower, upper)
-
-    n = x0.size
-    if adaptive:
-        gamma = 1.0 + 2.0 / n
-        rho = 0.75 - 1.0 / (2.0 * n)
-        sigma = 1.0 - 1.0 / n
-    simplex = np.full(
-        (n + 1, n), fill_value=np.nan, dtype=np.float64
-    )  # each row is x_j
-    simplex[:] = x0
-    # perturb simplex using `init_step`
-    diag = np.copy(np.diag(simplex))
-    diag[diag == 0.0] = zero_pert
-    diag[diag != 0.0] *= 1 + init_step
-    np.fill_diagonal(simplex, diag)
-    # restrict simplex to bounds if passed
-    if bounds:
-        for j in range(n + 1):
-            simplex[j] = restrict_to_bounds(simplex[j], lower, upper)
-    # array of the value of f
-    f_simplex = np.full(n + 1, fill_value=np.nan)
-    for j in range(n + 1):
-        f_simplex[j] = theta_target_fn(simplex[j], *args)
-    for it in range(max_iter):
-        # Step1: order of f_simplex
-        # print(simplex)
-        # print(f_simplex)
-        order_f = f_simplex.argsort()
-        best_idx = order_f[0]
-        worst_idx = order_f[-1]
-        second_worst_idx = order_f[-2]
-        # Check whether method should stop.
-        if np.std(f_simplex) < tol_std:
-            break
-        # calculate centroid except argmax f_simplex
-        x_o = simplex[np.delete(order_f, -1)].sum(axis=0) / n
-        # Step2: Reflection, Compute reflected point
-        x_r = x_o + alpha * (x_o - simplex[worst_idx])
-        # restrict x_r to bounds if passed
-        if bounds:
-            x_r = restrict_to_bounds(x_r, lower, upper)
-        f_r = theta_target_fn(x_r, *args)
-        if f_simplex[best_idx] <= f_r < f_simplex[second_worst_idx]:
-            simplex[worst_idx] = x_r
-            f_simplex[worst_idx] = f_r
-            continue
-        # Step3: Expansion, reflected point is the best point so far
-        if f_r < f_simplex[best_idx]:
-            x_e = x_o + gamma * (x_r - x_o)
-            # restrict x_e to bounds if passed
-            if bounds:
-                x_e = restrict_to_bounds(x_e, lower, upper)
-            f_e = theta_target_fn(x_e, *args)
-            if f_e < f_r:
-                simplex[worst_idx] = x_e
-                f_simplex[worst_idx] = f_e
-            else:
-                simplex[worst_idx] = x_r
-                f_simplex[worst_idx] = f_r
-            continue
-        # Step4: outside Contraction
-        if f_simplex[second_worst_idx] <= f_r < f_simplex[worst_idx]:
-            x_oc = x_o + rho * (x_r - x_o)
-            if bounds:
-                x_oc = restrict_to_bounds(x_oc, lower, upper)
-            f_oc = theta_target_fn(x_oc, *args)
-            if f_oc <= f_r:
-                simplex[worst_idx] = x_oc
-                f_simplex[worst_idx] = f_oc
-                continue
-        # step 5 inside contraction
-        else:
-            x_ic = x_o - rho * (x_r - x_o)
-            # restrict x_c to bounds if passed
-            if bounds:
-                x_ic = restrict_to_bounds(x_ic, lower, upper)
-            f_ic = theta_target_fn(x_ic, *args)
-            if f_ic < f_simplex[worst_idx]:
-                simplex[worst_idx] = x_ic
-                f_simplex[worst_idx] = f_ic
-                continue
-        # step 6: shrink
-        simplex[np.delete(order_f, 0)] = simplex[best_idx] + sigma * (
-            simplex[np.delete(order_f, 0)] - simplex[best_idx]
-        )
-        for i in np.delete(order_f, 0):
-            simplex[i] = restrict_to_bounds(simplex[i], lower, upper)
-            f_simplex[i] = theta_target_fn(simplex[i], *args)
-    return results(simplex[best_idx], f_simplex[best_idx], it + 1, simplex)
-
-# %% ../../nbs/src/theta.ipynb 23
-def optimize_theta_target_fn(init_par, optimize_params, y, modeltype, nmse):
-    x0 = [init_par[key] for key, val in optimize_params.items() if val]
-    x0 = np.array(x0, dtype=np.float32)
-    if not len(x0):
+    x0 = []
+    lower = []
+    upper = []
+    lower_bounds = {
+        "initial_smoothed": -1e10,
+        "alpha": 0.1,
+        "theta": 1.0,
+    }
+    upper_bounds = {
+        "initial_smoothed": 1e10,
+        "alpha": 0.99,
+        "theta": 1e10,
+    }
+    for param, optim in optimize_params.items():
+        if optim:
+            x0.append(init_par[param])
+            lower.append(lower_bounds[param])
+            upper.append(upper_bounds[param])
+    if not x0:
         return
+    x0 = np.array(x0)
+    lower = np.array(lower)
+    upper = np.array(upper)
 
     init_level = init_par["initial_smoothed"]
     init_alpha = init_par["alpha"]
@@ -444,33 +113,23 @@ def optimize_theta_target_fn(init_par, optimize_params, y, modeltype, nmse):
     opt_alpha = optimize_params["alpha"]
     opt_theta = optimize_params["theta"]
 
-    res = nelder_mead_theta(
+    opt_res = _theta.optimize(
         x0,
-        args=(
-            init_level,
-            init_alpha,
-            init_theta,
-            opt_level,
-            opt_alpha,
-            opt_theta,
-            y,
-            modeltype,
-            nmse,
-        ),
-        tol_std=1e-4,
-        lower=np.array([-1e10, 0.1, 1.0]),
-        upper=np.array([1e10, 0.99, 1e10]),
-        max_iter=1_000,
-        adaptive=True,
+        lower,
+        upper,
+        init_level,
+        init_alpha,
+        init_theta,
+        opt_level,
+        opt_alpha,
+        opt_theta,
+        y,
+        modeltype,
+        nmse,
     )
-    return res
-
-# %% ../../nbs/src/theta.ipynb 24
-@njit(nogil=NOGIL, cache=CACHE)
-def is_constant(x):
-    return np.all(x[0] == x)
+    return results(*opt_res, None)  # type: ignore
 
-# %% ../../nbs/src/theta.ipynb 26
+# %% ../../nbs/src/theta.ipynb 14
 def thetamodel(
     y: np.ndarray,
     m: int,
@@ -480,13 +139,15 @@ def thetamodel(
     theta: float,
     nmse: int,
 ):
+    y = y.astype(np.float64, copy=False)
+    model_type = switch_theta(modeltype)
     # initial parameters
     par = initparamtheta(
         initial_smoothed=initial_smoothed,
         alpha=alpha,
         theta=theta,
         y=y,
-        modeltype=modeltype,
+        modeltype=model_type,
     )
     optimize_params = {
         key.replace("optimize_", ""): val for key, val in par.items() if "optim" in key
@@ -497,13 +158,14 @@ def thetamodel(
         init_par=par,
         optimize_params=optimize_params,
         y=y,
-        modeltype=modeltype,
+        modeltype=model_type,
         nmse=nmse,
     )
     if fred is not None:
         fit_par = fred.x
     j = 0
     if optimize_params["initial_smoothed"]:
+        par["initial_smoothed"] = fit_par[j]
         j += 1
     if optimize_params["alpha"]:
         par["alpha"] = fit_par[j]
@@ -512,7 +174,14 @@ def thetamodel(
         par["theta"] = fit_par[j]
         j += 1
 
-    amse, e, states, mse = pegelsresid_theta(y=y, modeltype=modeltype, nmse=nmse, **par)
+    amse, e, states, mse = _theta.pegels_resid(
+        y,
+        model_type,
+        par["initial_smoothed"],
+        par["alpha"],
+        par["theta"],
+        nmse,
+    )
 
     return dict(
         mse=mse,
@@ -527,7 +196,7 @@ def thetamodel(
         mean_y=np.mean(y),
     )
 
-# %% ../../nbs/src/theta.ipynb 28
+# %% ../../nbs/src/theta.ipynb 16
 def compute_pi_samples(
     n, h, states, sigma, alpha, theta, mean_y, seed=0, n_samples=200
 ):
@@ -546,21 +215,20 @@ def compute_pi_samples(
         A = mean_y - B * (i + 2) / 2
     return samples
 
-# %% ../../nbs/src/theta.ipynb 29
+# %% ../../nbs/src/theta.ipynb 17
 def forecast_theta(obj, h, level=None):
-    forecast = np.full(h, fill_value=np.nan)
+    forecast = np.empty(h)
     n = obj["n"]
     states = obj["states"]
     alpha = obj["par"]["alpha"]
     theta = obj["par"]["theta"]
-    thetaforecast(
-        states=states,
-        n=n,
-        modeltype=switch_theta(obj["modeltype"]),
-        h=h,
-        f=forecast,
-        alpha=alpha,
-        theta=theta,
+    _theta.forecast(
+        states,
+        n,
+        switch_theta(obj["modeltype"]),
+        forecast,
+        alpha,
+        theta,
     )
     res = {"mean": forecast}
 
@@ -591,7 +259,7 @@ def forecast_theta(obj, h, level=None):
                 res[key] = res[key] + seas_forecast
     return res
 
-# %% ../../nbs/src/theta.ipynb 31
+# %% ../../nbs/src/theta.ipynb 19
 def auto_theta(
     y,
     m,
@@ -650,7 +318,7 @@ def auto_theta(
 
     # validate model
     if model not in [None, "STM", "OTM", "DSTM", "DOTM"]:
-        raise ValueError("Invalid model type")
+        raise ValueError(f"Invalid model type: {model}.")
 
     n = len(y)
     npars = 3
@@ -691,7 +359,7 @@ def auto_theta(
         model["seas_forecast"] = dict(seas_forecast)
     return model
 
-# %% ../../nbs/src/theta.ipynb 41
+# %% ../../nbs/src/theta.ipynb 29
 def forward_theta(fitted_model, y):
     m = fitted_model["m"]
     model = fitted_model["modeltype"]
diff --git a/src/statsforecast.cpp b/src/statsforecast.cpp
index edbbed6b5..1fd5b1003 100644
--- a/src/statsforecast.cpp
+++ b/src/statsforecast.cpp
@@ -2,18 +2,20 @@
 
 namespace py = pybind11;
 
-namespace ets
-{
-  void init(py::module_ &);
+namespace ets {
+void init(py::module_ &);
 }
 
-namespace arima
-{
-  void init(py::module_ &);
+namespace arima {
+void init(py::module_ &);
 }
 
-PYBIND11_MODULE(_lib, m)
-{
+namespace theta {
+void init(py::module_ &);
+}
+
+PYBIND11_MODULE(_lib, m) {
   arima::init(m);
   ets::init(m);
+  theta::init(m);
 }
diff --git a/src/theta.cpp b/src/theta.cpp
new file mode 100644
index 000000000..cca71a3f8
--- /dev/null
+++ b/src/theta.cpp
@@ -0,0 +1,195 @@
+#include <pybind11/pybind11.h>
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+
+#include "nelder_mead.h"
+
+namespace theta {
+namespace py = pybind11;
+using Eigen::VectorXd;
+using RowMajorMatrixXd =
+    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+enum class ModelType { STM, OTM, DSTM, DOTM };
+constexpr double HUGE_N = 1e10;
+constexpr double NA = -99999.0;
+constexpr double TOL = 1e-10;
+
+Eigen::Vector<double, 5> init_state(const Eigen::Ref<const VectorXd> &y,
+                                    ModelType model_type,
+                                    double initial_smoothed, double alpha,
+                                    double theta) {
+  double An, Bn, mu;
+  if (model_type == ModelType::DSTM || model_type == ModelType::DOTM) {
+    An = y[0];
+    Bn = double{};
+    mu = y[0];
+  } else {
+    size_t n = y.size();
+    double y_mean = y.array().mean();
+    double weighted_avg = y.dot(VectorXd::LinSpaced(y.size(), 1, y.size())) / n;
+    Bn = (6 * (2 * weighted_avg - (n + 1) * y_mean)) / (n * n - 1);
+    An = y_mean - (n + 1) * Bn / 2;
+    mu = initial_smoothed + (1 - 1 / theta) * (An + Bn);
+  }
+  return {alpha * y[0] + (1 - alpha) * initial_smoothed, y[0], An, Bn, mu};
+}
+
+void update(Eigen::Ref<RowMajorMatrixXd> states, size_t i, ModelType model_type,
+            double alpha, double theta, double y, bool usemu) {
+  double level = states(i - 1, 0);
+  double meany = states(i - 1, 1);
+  double An = states(i - 1, 2);
+  double Bn = states(i - 1, 3);
+  states(i, 4) =
+      level + (1 - 1 / theta) * (An * std::pow(1 - alpha, i) +
+                                 Bn * (1 - std::pow(1 - alpha, i + 1)) / alpha);
+  if (usemu) {
+    y = states(i, 4);
+  }
+  states(i, 0) = alpha * y + (1 - alpha) * level;
+  states(i, 1) = (i * meany + y) / (i + 1);
+  if (model_type == ModelType::DSTM || model_type == ModelType::DOTM) {
+    states(i, 3) = ((i - 1) * Bn + 6 * (y - meany) / (i + 1)) / (i + 2);
+    states(i, 2) = states(i, 1) - states(i, 3) * (i + 2) / 2;
+  } else {
+    states(i, 2) = An;
+    states(i, 3) = Bn;
+  }
+}
+
+void forecast(const Eigen::Ref<const RowMajorMatrixXd> &states, size_t i,
+              ModelType model_type, Eigen::Ref<VectorXd> f, double alpha,
+              double theta) {
+  size_t h = f.size();
+  RowMajorMatrixXd new_states = RowMajorMatrixXd::Zero(i + h, states.cols());
+  std::copy(states.data(), states.data() + i * states.cols(),
+            new_states.data());
+  for (size_t j = 0; j < h; ++j) {
+    update(new_states, i + j, model_type, alpha, theta, double{}, true);
+    f[j] = new_states(i + j, 4);
+  }
+}
+
+double calc(const Eigen::Ref<const VectorXd> &y,
+            Eigen::Ref<RowMajorMatrixXd> states, ModelType model_type,
+            double initial_smoothed, double alpha, double theta,
+            Eigen::Ref<VectorXd> e, Eigen::Ref<VectorXd> amse, size_t nmse) {
+  VectorXd denom = VectorXd::Zero(nmse);
+  VectorXd f = VectorXd::Zero(nmse);
+  auto init_states = init_state(y, model_type, initial_smoothed, alpha, theta);
+  std::ranges::copy(init_states, states.row(0).begin());
+  std::fill_n(amse.begin(), nmse, double{});
+  e[0] = y[0] - states(0, 4);
+  size_t n = y.size();
+  for (size_t i = 1; i < n; ++i) {
+    forecast(states, i, model_type, f, alpha, theta);
+    if (std::abs(f[0] - NA) < TOL) {
+      return NA;
+    }
+    e[i] = y[i] - f[0];
+    for (size_t j = 0; j < nmse; ++j) {
+      if (i + j < n) {
+        denom[j] += 1.0;
+        double tmp = y[i + j] - f[j];
+        amse[j] = (amse[j] * (denom[j] - 1.0) + tmp * tmp) / denom[j];
+      }
+    }
+    update(states, i, model_type, alpha, theta, y[i], false);
+  }
+  double mean_y = y.array().abs().mean();
+  if (mean_y < TOL) {
+    mean_y = TOL;
+  }
+  return e.tail(e.size() - 3).array().square().sum() / mean_y;
+}
+
+std::tuple<VectorXd, VectorXd, RowMajorMatrixXd, double>
+pegels_resid(const Eigen::Ref<const VectorXd> &y, ModelType model_type,
+             double initial_smoothed, double alpha, double theta, size_t nmse) {
+  RowMajorMatrixXd states = RowMajorMatrixXd::Zero(y.size(), 5);
+  VectorXd e = VectorXd::Zero(y.size());
+  VectorXd amse = VectorXd::Zero(nmse);
+  double mse = calc(y, states, model_type, initial_smoothed, alpha, theta, e,
+                    amse, nmse);
+  if (!std::isnan(mse) && std::abs(mse + 99999) < 1e-7) {
+    mse = std::numeric_limits<double>::quiet_NaN();
+  }
+  return {amse, e, states, mse};
+}
+
+double theta_target_fn(const VectorXd &params, double init_level,
+                       double init_alpha, double init_theta, bool opt_level,
+                       bool opt_alpha, bool opt_theta, const VectorXd &y,
+                       ModelType model_type, size_t nmse) {
+  RowMajorMatrixXd states = RowMajorMatrixXd::Zero(y.size(), 5);
+  size_t j = 0;
+  double level, alpha, theta;
+  if (opt_level) {
+    level = params[j++];
+  } else {
+    level = init_level;
+  }
+  if (opt_alpha) {
+    alpha = params[j++];
+  } else {
+    alpha = init_alpha;
+  }
+  if (opt_theta) {
+    theta = params[j++];
+  } else {
+    theta = init_theta;
+  }
+  VectorXd e = VectorXd::Zero(y.size());
+  VectorXd amse = VectorXd::Zero(nmse);
+  double mse = calc(y, states, model_type, level, alpha, theta, e, amse, nmse);
+  mse = std::max(mse, -1e10);
+  if (std::isnan(mse) || std::abs(mse + 99999) < 1e-7) {
+    mse = -std::numeric_limits<double>::infinity();
+  }
+  return mse;
+}
+
+std::tuple<VectorXd, double, int>
+optimize(const Eigen::Ref<const VectorXd> &x0,
+         const Eigen::Ref<const VectorXd> &lower,
+         const Eigen::Ref<const VectorXd> &upper, double init_level,
+         double init_alpha, double init_theta, bool opt_level, bool opt_alpha,
+         bool opt_theta, const Eigen::Ref<const VectorXd> &y,
+         ModelType model_type, size_t nmse) {
+  double init_step = 0.05;
+  double zero_pert = 1e-4;
+  double alpha = 1.0;
+  double gamma = 2.0;
+  double rho = 0.5;
+  double sigma = 0.5;
+  int max_iter = 1'000;
+  double tol_std = 1e-4;
+  bool adaptive = true;
+  return nm::NelderMead(theta_target_fn, x0, lower, upper, init_step, zero_pert,
+                        alpha, gamma, rho, sigma, max_iter, tol_std, adaptive,
+                        init_level, init_alpha, init_theta, opt_level,
+                        opt_alpha, opt_theta, y, model_type, nmse);
+}
+
+void init(py::module_ &m) {
+  py::module_ theta = m.def_submodule("theta");
+  theta.attr("HUGE_N") = HUGE_N;
+  theta.attr("NA") = NA;
+  theta.attr("TOL") = TOL;
+  py::enum_<ModelType>(theta, "ModelType")
+      .value("STM", ModelType::STM)
+      .value("OTM", ModelType::OTM)
+      .value("DSTM", ModelType::DSTM)
+      .value("DOTM", ModelType::DOTM);
+  theta.def("init_state", &init_state);
+  theta.def("calc", &calc);
+  theta.def("forecast", &forecast);
+  theta.def("update", &update);
+  theta.def("optimize", &optimize);
+  theta.def("pegels_resid", &pegels_resid);
+}
+
+} // namespace theta