diff --git a/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp b/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp
new file mode 100644
index 0000000..08249a0
--- /dev/null
+++ b/tessellate_ipu/core/vertex/tile_hessenberg_vertex.cpp
@@ -0,0 +1,204 @@
+// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
+#include <poplar/HalfFloat.hpp>
+#include <poplar/Vertex.hpp>
+
+#include "intrinsics_utils.hpp"
+
+using namespace poplar;
+
+/* popc -O2 -I tessellate/tile/vertex\
+     tessellate/tile/vertex/tile_qr_vertex.cpp \
+     -o tessellate/tile/vertex/tile_qr_vertex.gp
+*/
+static constexpr size_t MIN_ALIGN = 8;
+
+/*
+  The code here is just a minor modification of tile_qr_vertex.cpp
+*/
+
+/**
+ * @brief Vertex computing the correction vector in the Hessenberg algorithm.
+ */
+class HessenbergCorrectionVectorVertex : public MultiVertex {
+ public:
+  using T = float;
+  Input<Vector<T, poplar::VectorLayout::SPAN, MIN_ALIGN>> Rcol;      // (N,) R column.
+  Input<Vector<T, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>> sdiag;  // (N,) R diag. sign.
+  Input<Vector<int, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+    cidx;
+
+  Output<Vector<T, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+      v;  // (N,) QR correction vector (not normalized)
+  Output<Vector<T, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+      vrescale;  // (1,) QR correction vector rescaling (2 / norm)
+
+
+  // Use static variables as easy sync. mechanisms between worker threads.
+  // see: https://graphcore.slack.com/archives/C013LPHPX61/p1647443852989259
+  static T shared_partial_sqnorms[6];
+
+  HessenbergCorrectionVectorVertex();
+
+  bool compute(unsigned wid) {
+    const unsigned num_workers = 6;
+    const unsigned step = num_workers * 2;
+    const unsigned size = Rcol.size();
+
+    const unsigned col_idx = cidx[0];
+
+    const unsigned col_idx_rem = col_idx % 2;
+    const unsigned col_idx_mrem = col_idx - col_idx_rem;
+    const unsigned col_idx_prem = col_idx + col_idx_rem;
+
+    const T initial_rcol_val = Rcol[col_idx];
+    const T initial_rcol_val_sq = initial_rcol_val * initial_rcol_val;
+
+    // FIRST: SET SHARED STATE between workers.
+    shared_partial_sqnorms[wid] = -1;
+
+    const float2 zeros_f2{0, 0};
+    float2* ptr_outdata_f2 = reinterpret_cast<float2*>(v.data()) + wid;
+    // Push to col_idx_prem, may write one zero too much, but does not matter!
+    float2* ptr_outdata_end_f2 = reinterpret_cast<float2*>(&v[col_idx_prem]);
+    // First chunk of v initialized with zeros.
+    while (ptr_outdata_f2 < ptr_outdata_end_f2) {
+      ipu::store_postinc(&ptr_outdata_f2, zeros_f2, num_workers);
+    }
+
+    float2 partials_f2{0, 0};
+    const float2* ptr_indata_f2 =
+        reinterpret_cast<const float2*>(&Rcol[col_idx_prem]) + wid;
+    ptr_outdata_f2 = reinterpret_cast<float2*>(&v[col_idx_prem]) + wid;
+    ptr_outdata_end_f2 = reinterpret_cast<float2*>(&v[size]);
+    // Copy Rcol data and accumulate squared norm.
+    while (ptr_outdata_f2 < ptr_outdata_end_f2) {
+      const float2 v = ipu::load_postinc(&ptr_indata_f2, num_workers);
+      partials_f2 += v * v;
+      ipu::store_postinc(&ptr_outdata_f2, v, num_workers);
+    }
+    T partial = partials_f2[0] + partials_f2[1];
+
+    // GLOBAL STATE shared by all workers.
+    shared_partial_sqnorms[wid] = partial;
+
+    if (wid == 0) {
+      // Special case of odd R column index.
+      // On thread 0: correction to squared normed depending on `col_idx_rem`
+      T norm_squared = partial + col_idx_rem * initial_rcol_val_sq;
+      // Accumulate & wait.
+      for (unsigned w = 1; w < num_workers; ++w) {
+        // Avoid compiler optimizer with volatile pointer.
+        volatile T* ptr_partial = &shared_partial_sqnorms[w];
+        while (*ptr_partial < 0) {
+        }
+        norm_squared += shared_partial_sqnorms[w];
+      }
+
+      // Compute the norm.
+      const T norm = std::sqrt(norm_squared);
+      // Change the entry of v that corresponds to the diagonal element of R.
+      const auto update_vidx_val = initial_rcol_val - norm * sdiag[col_idx];
+      // Re-writing the full new value is faster than updating.
+      v[col_idx] = update_vidx_val;
+
+      // Update the squared norm of v.
+      norm_squared -= initial_rcol_val_sq;
+      norm_squared += update_vidx_val * update_vidx_val;
+
+      // Vector rescaling for QR householder update.
+      vrescale[0] = T(2) / norm_squared;
+    }
+    return true;
+  }
+};
+
+float HessenbergCorrectionVectorVertex::shared_partial_sqnorms[6] = {-1};
+
+/**
+ * @brief Vertex implementing the inplace householder (row) update in the QR
+ * algorithm. NOTE: the vertex is only updating the sub-slice of x corresponding
+ * to v.
+ *
+ * More specifically: x[end-len(v)+i] -= scale1[0] * scale2[0] * v[i]
+ *
+ * NOTE: poplar::constraint here to make sure x and v are not part of the same
+ * memory bank, allowing simultaneous loads (see `ld2x64pace` instruction).
+ */
+class [[poplar::constraint(
+    "elem(*x) != elem(*v)")]] HessenbergHouseholderRowUpdateVertex
+    : public MultiVertex {
+ public:
+  using T = float;
+  // Using `uint16` seems to be generating more efficient loops?
+  using IndexType = unsigned short;
+
+  InOut<Vector<T, poplar::VectorLayout::ONE_PTR, 8>> x;  // (N,) row of Q or R
+  Input<Vector<T, poplar::VectorLayout::SPAN, 8>>
+      v;  // (M,) v correction vector
+
+  // Passing 2 scaling factors is more efficient for the QR implementation.
+  // Avoids another full pass on the v vector in the vertex it is constructed.
+  Input<Vector<T, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+      scale1;  // (1,) first scaling factor.
+  Input<Vector<T, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+      scale2;  // (1,) 2nd scaling factor.
+  Input<Vector<int, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+      start_idx_;
+
+  Input<Vector<IndexType, poplar::VectorLayout::ONE_PTR, MIN_ALIGN>>
+      worker_offsets;  // (7,) threads work size + 1.
+
+
+
+  bool compute(unsigned wid) {
+    // Always assuming size % 2 == 0
+    constexpr unsigned ptr_step = 1;
+    const IndexType wstart = worker_offsets[wid];
+    const IndexType wend = worker_offsets[wid + 1];
+    const IndexType wsize = wend - wstart;
+
+    IndexType start_idx = start_idx_[0];  // X start idx. Must be a multiple of 4 (for bank
+                        // alignment aspects).
+
+    // Set the $TAS register with the proper scale.
+    const T s = -scale1[0] * scale2[0];
+    // __builtin_ipu_put_tas(s);
+    __ipu_and_ipumodel_tas tas;
+    tas.put(s);
+
+    // Nothing to do in this worker thread.
+    if (wstart == wend) {
+      return true;
+    }
+    // X and v IO pointers.
+    const float2* ptr_inxdata_f2 =
+        reinterpret_cast<const float2*>(&x[start_idx]) + wstart;
+    float2* ptr_outxdata_f2 = reinterpret_cast<float2*>(&x[start_idx]) + wstart;
+    const float2* ptr_vdata_f2 =
+        reinterpret_cast<const float2*>(&v[0]) + wstart;
+
+    float2 xin, vin, rtmp, rout;
+    // First vectors loading.
+    xin = ipu::load_postinc(&ptr_inxdata_f2, ptr_step);
+    vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step);
+    // TODO: use ld2x64pace + tapack instructions.
+    for (IndexType idx = 1; idx != wsize; ++idx) {
+      rtmp = tas.f32v2axpy(xin, vin);
+      // rtmp = __builtin_ipu_f32v2axpy(xin, vin);
+      // Grouping here seems to help the compiler optimising loads?
+      xin = ipu::load_postinc(&ptr_inxdata_f2, ptr_step);
+      vin = ipu::load_postinc(&ptr_vdata_f2, ptr_step);
+      rout = tas.f32v2axpy(rtmp, rtmp);
+      // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
+      ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step);
+    }
+    // Finish the loop, getting the last computation.
+    // rtmp = __builtin_ipu_f32v2axpy(xin, vin);
+    // rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
+    rtmp = tas.f32v2axpy(xin, vin);
+    rout = tas.f32v2axpy(rtmp, rtmp);
+    ipu::store_postinc(&ptr_outxdata_f2, rout, ptr_step);
+
+    return true;
+  }
+};
diff --git a/tessellate_ipu/linalg/tile_linalg_hessenberg.py b/tessellate_ipu/linalg/tile_linalg_hessenberg.py
index 3560c44..a9c88a6 100644
--- a/tessellate_ipu/linalg/tile_linalg_hessenberg.py
+++ b/tessellate_ipu/linalg/tile_linalg_hessenberg.py
@@ -1,26 +1,216 @@
 # Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+import math
+import os
 from typing import Any, Tuple
 
 import jax.lax
-
-from tessellate_ipu import TileShardedArray, tile_data_barrier, tile_map, tile_put_replicated, tile_put_sharded
+import numpy as np
+from jax.core import ShapedArray
+
+from tessellate_ipu import (
+    TileShardedArray,
+    create_ipu_tile_primitive,
+    tile_data_barrier,
+    tile_map,
+    tile_put_replicated,
+    tile_put_sharded,
+)
+from tessellate_ipu.core.tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets
 
 from .tile_linalg_qr import dot_product1d_p
-from .tile_linalg_qr import ipu_qr_shard_inputs as ipu_hessenberg_shard_inputs
-from .tile_linalg_qr import qr_correction_vector_p, qr_householder_row_update_p
 
 Array = Any
 
-# Heavily based on ipu_qr_iterations in tile_linalg_qr.py
-# The body of the for-loop computes
-# v = Householder(R[i])         # v is chosen to annihilate the elements below the first lower diagonal
-# R = R - 2 *  v.reshape(-1, 1) @ (v.reshape(1, -1)  @ R)
-# R = R - 2 * (R @ v.reshape(-1, 1)) @ v.reshape(1, -1)  # Not present in QR algorithm
-# Q = Q - 2 * (Q @ v.reshape(-1, 1)) @ v.reshape(1, -1)
+# The code here is heavily based on tile_linalg_qr.py
+
+
+def get_hessenberg_vertex_gp_filename() -> str:
+    return os.path.join(os.path.dirname(__file__), "../core", "vertex", "tile_hessenberg_vertex.cpp")
+
+
+"""Vertex computing Hessenberg correction vector.
+"""
+hessenberg_correction_vector_p = create_ipu_tile_primitive(
+    "hessenberg_correction_vector",
+    "HessenbergCorrectionVectorVertex",
+    inputs=["Rcol", "sdiag", "cidx"],
+    outputs={"v": 0, "vrescale": ShapedArray((1,), dtype=np.float32)},
+    gp_filename=get_hessenberg_vertex_gp_filename(),
+    perf_estimate=1000,
+)
+
+"""Vertex Hessenberg HouseHolder performing row inplace update: x -= scale1[0] * scale2[0] * v
+"""
+hessenberg_householder_row_update_p = create_ipu_tile_primitive(
+    "hessenberg_householder_row_update",
+    "HessenbergHouseholderRowUpdateVertex",
+    inputs=["x", "v", "scale1", "scale2", "start_idx_"],
+    outputs={"x": 0},
+    constants={
+        "worker_offsets": lambda inavals, *_: make_ipu_vector1d_worker_offsets(
+            inavals[1].size, vector_size=2, wdtype=np.uint16
+        )
+    },
+    gp_filename=get_hessenberg_vertex_gp_filename(),
+    perf_estimate=1000,
+)
+
+
+def ipu_hessenberg_shard_inputs(x: Array, xsdiag: Array) -> Tuple[TileShardedArray, TileShardedArray, TileShardedArray]:
+    """IPU QR initial sharding of input arrays across IPU tiles.
+
+    Args:
+        x: X array.
+        sdiag: X diagonal sign.
+    Returns:
+        Tile sharded Q, R, sdiag.
+    """
+    assert x.shape[0] == x.shape[1]
+    N = x.shape[0]
+    n_tiles = 1472
+
+    # Sharding R and Q
+    if N <= 736:
+        Q_tiles = list(range(N))
+        R_tiles = list(range(N, 2 * N))
+    else:
+        n_per_tile = math.ceil(N / float(n_tiles))
+        full_tiles = N % n_tiles
+        if full_tiles == 0:
+            full_tiles = n_tiles
+
+        Q_tiles = [i for i in range(full_tiles) for _ in range(n_per_tile)] + [
+            i for i in range(full_tiles, n_tiles) for _ in range(n_per_tile - 1)
+        ]
+        R_tiles = Q_tiles
+
+    # TODO: on-device construction of identity
+    Q = tile_put_sharded(np.identity(N, dtype=x.dtype), Q_tiles)
+    R = tile_put_sharded(x, R_tiles)
+    # Replicate once on all tiles. Faster then for the looping.
+    sdiag_full = tile_put_replicated(xsdiag.T, R_tiles)
+    return Q, R, sdiag_full
+
+
+def ipu_hessenberg_body(
+    i: int, carry: Tuple[TileShardedArray, TileShardedArray, TileShardedArray]
+) -> Tuple[TileShardedArray, TileShardedArray, TileShardedArray]:
+    """
+    The body of the for-loop that operates on rows of R. It computes
+    v = Householder(R[i])         # v is chosen to annihilate the elements below the first lower diagonal
+    R = R - 2 *  v.reshape(-1, 1) @ (v.reshape(1, -1)  @ R)
+    R = R - 2 * (R @ v.reshape(-1, 1)) @ v.reshape(1, -1)  # Not present in QR algorithm
+    Q = Q - 2 * (Q @ v.reshape(-1, 1)) @ v.reshape(1, -1)
+    """
+
+    Q, R, sdiag_full = carry
+
+    # Extract the i-th col of R and the i-th element of sdiag_full
+    # Using the gather_p primitive avoids inefficient general-case processing
+
+    dim_numbers = jax.lax.GatherDimensionNumbers(offset_dims=tuple(), collapsed_slice_dims=(0,), start_index_map=(0,))
+
+    i_rep = tile_put_replicated(jax.numpy.array([[i]], dtype=np.uint32), R.tiles)
+
+    Rcol = tile_map(
+        jax.lax.gather_p,
+        R,
+        i_rep,
+        dimension_numbers=dim_numbers,
+        slice_sizes=(1,),
+        mode=jax.lax.GatherScatterMode.PROMISE_IN_BOUNDS,
+        unique_indices=False,
+        indices_are_sorted=False,
+        fill_value=None,
+    )  # => TileShardedArray() (Num_tiles, 1)
+
+    # This determines also where the computation of v (Householder correction vector) takes place
+    # For now, the tile is picked arbitrarily. Are there better choices? R.tiles[0]?
+    Rcol_replicated = tile_put_replicated(Rcol.array, tiles=[736])  # type:ignore
+
+    sdiag = tile_map(
+        jax.lax.gather_p,
+        sdiag_full,
+        i_rep,
+        dimension_numbers=dim_numbers,
+        slice_sizes=(1,),
+        mode=jax.lax.GatherScatterMode.PROMISE_IN_BOUNDS,
+        unique_indices=False,
+        indices_are_sorted=False,
+        fill_value=None,
+    )  # => TileShardedArray() (Num_tiles, 1)
+
+    sdiag_rep = tile_put_replicated(sdiag.array, Rcol_replicated.tiles)  # type:ignore
+
+    # Smart-indexing
+    # start_idx = (i // 2) * 2
+    start_idx = 0
+
+    start_idxQ = tile_put_replicated(start_idx, Q.tiles)
+    start_idxR = tile_put_replicated(start_idx, R.tiles)
+
+    # Correction vector. Computed on the tile where Rcol is located
+    v, vrescale = tile_map(
+        hessenberg_correction_vector_p, Rcol_replicated, sdiag_rep, tile_put_replicated(i + 1, Rcol_replicated.tiles)
+    )  # type:ignore
+
+    # Replicate to all Q and R tiles.
+    vQ = tile_put_replicated(v.array, Q.tiles)  # 0
+    vR = tile_put_replicated(v.array, R.tiles)  # 0
+    # v normalization factor to pass to householder update.
+    vrescaleQ = tile_put_replicated(vrescale.array, Q.tiles)  # 0
+    vrescaleR = tile_put_replicated(vrescale.array, R.tiles)  # 0
+
+    # Transpose R so that we can use hessenberg_householder_row_update_p() to compute R @ ...
+    RT = tile_put_sharded(R.array.T, R.tiles)
+
+    # w = R^T @ v
+    w = tile_map(
+        # dot_product1d_indexed_p, vR, RT, start_idxR
+        dot_product1d_p,
+        vR,
+        RT,
+    )  # this returns size 12 array (6 worker threads)
+    w = tile_map(jax.lax.reduce_sum_p, w, axes=(0,))  # type:ignore
+    # Inplace update of R.
+    RT = tile_map(  # type:ignore
+        hessenberg_householder_row_update_p, RT, vR, w, vrescaleR, start_idxR  # type:ignore
+    )
+
+    # We compute the Q updates.
+    # It is done here and is followed by tile_data_barrier() because this induces the Poplar
+    # to schedule it in parallel to the RT updates, when RT and Q are mapped on disjoint tiles.
+    # w = Q @ v
+    # w = tile_map(dot_product1d_indexed_p, vQ, Q, start_idxQ)
+    w = tile_map(dot_product1d_p, vQ, Q)
+    w = tile_map(jax.lax.reduce_sum_p, w, axes=(0,))  # type:ignore
+    # Inplace update of Q.
+    Q = tile_map(
+        hessenberg_householder_row_update_p, Q, vQ, w, vrescaleQ, start_idxQ  # type:ignore
+    )
+    RT, Q = tile_data_barrier(RT, Q)
+
+    # Transpose the RT matrix so that we can use hessenberg_householder_row_update_p() to compute ... @ R
+    R = tile_put_sharded(RT.array.T, RT.tiles)
+
+    # w = R^T @ v
+    w = tile_map(
+        # dot_product1d_indexed_p, vR, R, start_idxR
+        dot_product1d_p,
+        vR,
+        R,
+    )  # this returns size 12 array (6 worker threads)
+    w = tile_map(jax.lax.reduce_sum_p, w, axes=(0,))  # type:ignore
+    # Inplace update of R.
+    R = tile_map(  # type:ignore
+        hessenberg_householder_row_update_p, R, vR, w, vrescaleR, start_idxR  # type:ignore
+    )
+
+    return (Q, R, sdiag_full)
 
 
 def ipu_hessenberg_iterations(
-    Q: TileShardedArray, RT: TileShardedArray, sdiag_full: TileShardedArray
+    Q: TileShardedArray, R: TileShardedArray, sdiag_full: TileShardedArray
 ) -> Tuple[TileShardedArray, TileShardedArray]:
     """IPU Hessenberg algorithm iterations.
 
@@ -29,63 +219,12 @@ def ipu_hessenberg_iterations(
         RT: Initial R.T sharded array.
         sdiag_full: Diagonal sign (replicated).
     Returns:
-        (Q, RT) after N-1 iterations.
+        (Q, RT) after N-2 iterations.
     """
-    assert len(Q) == len(RT)
+    assert len(Q) == len(R)
     N = len(Q)
-    # Sharding of R and Q on tiles.
-    Q_tiles = Q.tiles
-    R_tiles = RT.tiles
-
-    for cidx in range(N - 2):
-        # From which column to start computation: skipping zeros. Must be a multiple of 2 for proper vectorization.
-        start_idx = (cidx // 2) * 2
-        # Extract the proper R column (no tile copy, pure view).
-        Rcol = RT[cidx]
-        sdiag = sdiag_full[cidx]
-        # Correction vector. NOTE: computed on a single tile, changing at every loop.
-        v, vrescale = tile_map(qr_correction_vector_p, Rcol, sdiag, col_idx=cidx + 1)  # type:ignore
-
-        # Replicate to all Q and R tiles.
-        vQ = tile_put_replicated(v.array[0], Q_tiles)
-        vR = tile_put_replicated(v.array[0], R_tiles)
-        # v normalization factor to pass to householder update.
-        vrescaleQ = tile_put_replicated(vrescale.array[0], Q_tiles)
-        vrescaleR = tile_put_replicated(vrescale.array[0], R_tiles)
-
-        # Using "smart" slicing to reduce compute to do.
-        # w = R^T @ v
-        w = tile_map(
-            dot_product1d_p, vR[:, start_idx:], RT[:, start_idx:]
-        )  # this returns size 12 array (6 worker threads)
-        w = tile_map(jax.lax.reduce_sum_p, w, axes=(0,))  # type:ignore
-        # Inplace update of R.
-        RT = tile_map(  # type:ignore
-            qr_householder_row_update_p, RT, vR[:, start_idx:], w, vrescaleR, start_idx=start_idx  # type:ignore
-        )
-
-        # w = Q @ v
-        w = tile_map(dot_product1d_p, vQ[:, start_idx:], Q[:, start_idx:])
-        w = tile_map(jax.lax.reduce_sum_p, w, axes=(0,))  # type:ignore
-        # Inplace update of Q.
-        Q = tile_map(
-            qr_householder_row_update_p, Q, vQ[:, start_idx:], w, vrescaleQ, start_idx=start_idx  # type:ignore
-        )
-        RT, Q = tile_data_barrier(RT, Q)
-
-        R = tile_put_sharded(RT.array.T, RT.tiles)
-        # Using "smart" slicing to reduce compute to do.
-        # w = R^T @ v
-        w = tile_map(
-            dot_product1d_p, vR[:, start_idx:], R[:, start_idx:]
-        )  # this returns size 12 array (6 worker threads)
-        w = tile_map(jax.lax.reduce_sum_p, w, axes=(0,))  # type:ignore
-        # Inplace update of R.
-        R = tile_map(  # type:ignore
-            qr_householder_row_update_p, R, vR[:, start_idx:], w, vrescaleR, start_idx=start_idx  # type:ignore
-        )
 
-        RT = tile_put_sharded(R.array.T, R.tiles)
+    Q, R, sdiag_full = jax.lax.fori_loop(0, N - 2, ipu_hessenberg_body, (Q, R, sdiag_full))
 
     return (Q, R)
 
@@ -102,6 +241,6 @@ def ipu_hessenberg(x: Array) -> Tuple[Array, Array]:
         Q, R^T matrices (as tile sharded arrays).
     """
     # Initialize Q, RT, sdiag.
-    Q, RT, sdiag_full = ipu_hessenberg_shard_inputs(x, jax.numpy.sign(jax.numpy.diag(x)))
+    Q, R, sdiag_full = ipu_hessenberg_shard_inputs(x, jax.numpy.sign(jax.numpy.diag(x)))
     # IPU QR iterations.
-    return ipu_hessenberg_iterations(Q, RT, sdiag_full)
+    return ipu_hessenberg_iterations(Q, R, sdiag_full)
diff --git a/tests/linalg/test_tile_linalg_hessenberg.py b/tests/linalg/test_tile_linalg_hessenberg.py
index 262e4d1..cbd59c1 100644
--- a/tests/linalg/test_tile_linalg_hessenberg.py
+++ b/tests/linalg/test_tile_linalg_hessenberg.py
@@ -69,4 +69,4 @@ def hessenberg_decomposition_fn(x, xsdiag):
 
         start, end = np.asarray(start)[0], np.asarray(end)[0]
         hessenberg_cycle_count = end[0] - start[0]
-        assert hessenberg_cycle_count <= 105000
+        assert hessenberg_cycle_count <= 150000