diff --git a/runtime/cudaq/spin/spin_op.cpp b/runtime/cudaq/spin/spin_op.cpp
index f1acaf1d70..a898efd0f1 100644
--- a/runtime/cudaq/spin/spin_op.cpp
+++ b/runtime/cudaq/spin/spin_op.cpp
@@ -21,7 +21,6 @@
 #include <complex>
 #include <fstream>
 #include <iostream>
-#include <map>
 #include <random>
 #include <set>
 #include <utility>
@@ -62,48 +61,52 @@ actionOnBra(spin_op &term, const std::string &bitConfiguration) {
   return std::make_pair(newConfiguration, coeff);
 }
 
-std::pair<std::complex<double>, std::vector<bool>>
-mult(std::vector<bool> row, std::vector<bool> other_row,
-     std::complex<double> &rowCoeff, std::complex<double> &otherCoeff) {
-  // This is term_i * otherTerm_j
-  std::vector<bool> tmp(row.size()), tmp2(row.size());
-  std::size_t numQubits = row.size() / 2;
+std::pair<std::vector<bool>, std::complex<double>>
+mult(const std::vector<bool> &p1, const std::vector<bool> &p2,
+     const std::complex<double> &p1Coeff, const std::complex<double> &p2Coeff) {
+  auto [minSize, maxSize] = std::minmax({p1.size(), p2.size()});
+  std::size_t minNumQubits = minSize / 2;
+  std::size_t maxNumQubits = maxSize / 2;
+  std::vector<bool> result(maxSize, false);
+  int yCount = 0;
+  int cPhase = 0;
+
+  for (std::size_t i = 0; i < minNumQubits; ++i) {
+    bool p1_x = p1[i];
+    bool p1_z = p1[i + (p1.size() / 2)];
+    bool p2_x = p2[i];
+    bool p2_z = p2[i + (p2.size() / 2)];
+
+    // Compute the resulting Pauli operator
+    result[i] = p1_x ^ p2_x;
+    result[i + maxNumQubits] = p1_z ^ p2_z;
+
+    yCount +=
+        (p1_x & p1_z) + (p2_x & p2_z) - (result[i] & result[i + maxNumQubits]);
+    cPhase += p1_x & p2_z;
+  }
 
-  for (std::size_t i = 0; i < 2 * numQubits; i++)
-    tmp[i] = row[i] ^ other_row[i];
+  const std::vector<bool> &big = p1.size() < p2.size() ? p2 : p1;
+  for (std::size_t i = minNumQubits; i < maxNumQubits; ++i) {
+    result[i] = big[i];
+    result[i + maxNumQubits] = big[i + maxNumQubits];
+  }
 
-  for (std::size_t i = 0; i < numQubits; i++)
-    tmp2[i] = (row[i] && other_row[numQubits + i]) ||
-              (row[i + numQubits] && other_row[i]);
+  // Normalize the phase to a value in the range [0, 3]
+  int phaseFactor = (2 * cPhase + yCount) % 4;
+  if (phaseFactor < 0)
+    phaseFactor += 4;
 
-  int orig_phase = 0, other_phase = 0;
-  for (std::size_t i = 0; i < numQubits; i++) {
-    if (row[i] && row[i + numQubits])
-      orig_phase++;
+  // Phase correction factors based on the total phase
+  using namespace std::complex_literals;
+  std::array<std::complex<double>, 4> phase{1.0, -1i, -1.0, 1i};
+  std::complex<double> resultCoeff = p1Coeff * phase[phaseFactor] * p2Coeff;
 
-    if (other_row[i] && other_row[i + numQubits])
-      other_phase++;
-  }
+  // Handle the "-0" issue
+  if (std::abs(resultCoeff.real()) < 1e-12)
+    resultCoeff.real(0);
 
-  int sum = 0;
-  for (auto a : tmp2)
-    if (a)
-      sum++;
-
-  auto _phase = orig_phase + other_phase + 2 * sum;
-  // Based on the phase, figure out an extra coeff to apply
-  for (std::size_t i = 0; i < numQubits; i++)
-    if (tmp[i] && tmp[i + numQubits])
-      _phase -= 1;
-
-  _phase %= 4;
-  std::complex<double> imaginary(0, 1);
-  std::array<std::complex<double>, 4> phaseCoeffArr{1.0, -1. * imaginary, -1.0,
-                                                    imaginary};
-  auto phase_coeff = phaseCoeffArr[_phase];
-  auto coeff = rowCoeff;
-  coeff *= phase_coeff * otherCoeff;
-  return std::make_pair(coeff, tmp);
+  return {result, resultCoeff};
 }
 } // namespace details
 
@@ -425,56 +428,47 @@ spin_op &spin_op::operator-=(const spin_op &v) noexcept {
 }
 
 spin_op &spin_op::operator*=(const spin_op &v) noexcept {
-  spin_op copy = v;
-  if (v.num_qubits() > num_qubits())
-    expandToNQubits(copy.num_qubits());
-  else if (v.num_qubits() < num_qubits())
-    copy.expandToNQubits(num_qubits());
-
-  std::unordered_map<std::vector<bool>, std::complex<double>> newTerms;
-  std::size_t ourRow = 0, theirRow = 0;
-  std::vector<std::complex<double>> composedCoeffs(num_terms() *
-                                                   copy.num_terms());
-  std::vector<std::vector<bool>> composition(num_terms() * copy.num_terms());
-  std::map<std::size_t, std::pair<std::size_t, std::size_t>> indexMap;
-  auto nElements = composition.size();
-  for (std::size_t i = 0; i < nElements; i++) {
-    auto pair = std::make_pair(ourRow, theirRow);
-    indexMap.emplace(i, pair);
-    if (theirRow == copy.num_terms() - 1) {
-      theirRow = 0;
-      ourRow++;
-    } else
-      theirRow++;
-  }
+  using term_and_coeff = std::pair<std::vector<bool>, std::complex<double>>;
+  std::size_t numTerms = num_terms() * v.num_terms();
+  std::vector<term_and_coeff> result(numTerms);
+  std::size_t min = std::min(num_terms(), v.num_terms());
+
+  // Put the `unordered_map` iterators into vectors to minimize pointer chasing
+  // when doing the cartesian product of the spin operators' terms.
+  using Iter =
+      std::unordered_map<spin_op_term, std::complex<double>>::const_iterator;
+  std::vector<Iter> thisTermIt;
+  std::vector<Iter> otherTermIt;
+  thisTermIt.reserve(terms.size());
+  otherTermIt.reserve(v.terms.size());
+  for (auto it = terms.begin(); it != terms.end(); ++it)
+    thisTermIt.push_back(it);
+  for (auto it = v.terms.begin(); it != v.terms.end(); ++it)
+    otherTermIt.push_back(it);
+
 #if defined(_OPENMP)
   // Threshold to start OpenMP parallelization.
   // 16 ~ 4-term * 4-term
   constexpr std::size_t spin_op_omp_threshold = 16;
-#pragma omp parallel for shared(composition) if (nElements >                   \
-                                                     spin_op_omp_threshold)
+#pragma omp parallel for shared(result) if (numTerms > spin_op_omp_threshold)
 #endif
-  for (std::size_t i = 0; i < nElements; i++) {
-    auto [j, k] = indexMap[i];
-    auto s = terms.begin();
-    auto t = copy.terms.begin();
-    std::advance(s, j);
-    std::advance(t, k);
-    auto res = details::mult(s->first, t->first, s->second, t->second);
-    composition[i] = res.second;
-    composedCoeffs[i] = res.first;
+  for (std::size_t i = 0; i < numTerms; ++i) {
+    Iter s = thisTermIt[i % min];
+    Iter t = otherTermIt[i / min];
+    if (terms.size() > v.terms.size()) {
+      s = thisTermIt[i / min];
+      t = otherTermIt[i % min];
+    }
+    result[i] = details::mult(s->first, t->first, s->second, t->second);
   }
 
-  for (std::size_t i = 0; i < nElements; i++) {
-    auto iter = newTerms.find(composition[i]);
-    if (iter == newTerms.end())
-      newTerms.emplace(composition[i], composedCoeffs[i]);
-    else
-      iter->second += composedCoeffs[i];
+  terms.clear();
+  terms.reserve(numTerms);
+  for (auto &&[term, coeff] : result) {
+    auto [it, created] = terms.emplace(term, coeff);
+    if (!created)
+      it->second += coeff;
   }
-
-  terms = newTerms;
-
   return *this;
 }
 
diff --git a/unittests/spin_op/SpinOpTester.cpp b/unittests/spin_op/SpinOpTester.cpp
index d8176c89b3..51b24e3942 100644
--- a/unittests/spin_op/SpinOpTester.cpp
+++ b/unittests/spin_op/SpinOpTester.cpp
@@ -10,6 +10,89 @@
 
 #include "cudaq/spin_op.h"
 
+enum Pauli : int8_t { I = 0, X, Y, Z };
+constexpr Pauli paulis[4] = {Pauli::I, Pauli::X, Pauli::Y, Pauli::Z};
+
+// Function to multiply two single-qubit Pauli operators
+static std::pair<std::complex<double>, Pauli> multiply_paulis(Pauli a,
+                                                              Pauli b) {
+  using namespace std::complex_literals;
+  // I    X    Y    Z
+  constexpr std::complex<double> table[4][4] = {
+      {1., 1., 1, 1},    // I
+      {1., 1., 1i, -1i}, // X
+      {1., -1i, 1, 1i},  // Y
+      {1., 1i, -1i, 1}   // Z
+  };
+  if (a == b)
+    return {1.0, Pauli::I};
+  if (a == Pauli::I)
+    return {1.0, b};
+  if (b == Pauli::I)
+    return {1.0, a};
+  return {table[a][b], paulis[a ^ b]};
+}
+
+// Function to multiply two multi-qubit Pauli words
+static std::pair<std::complex<double>, std::vector<Pauli>>
+multiply_pauli_words(const std::vector<Pauli> &a, const std::vector<Pauli> &b,
+                     bool verbose = false) {
+  std::complex<double> phase = 1.0;
+  std::string info;
+  std::vector<Pauli> result(a.size(), Pauli::I);
+  for (size_t i = 0; i < a.size(); ++i) {
+    auto [p, r] = multiply_paulis(a[i], b[i]);
+    phase *= p;
+    result[i] = r;
+  }
+  return {phase, result};
+}
+
+// Generates a pauli word out of a binary representation of it.
+static std::vector<Pauli> generate_pauli_word(int64_t id, int64_t num_qubits) {
+  constexpr int64_t mask = 0x3;
+  std::vector<Pauli> word(num_qubits, Pauli::I);
+  for (int64_t i = 0; i < num_qubits; ++i) {
+    assert((id & mask) < 4);
+    word[i] = paulis[id & mask];
+    id >>= 2;
+  }
+  return word;
+}
+
+static std::string generate_pauli_string(const std::vector<Pauli> &word) {
+  constexpr char paulis_name[4] = {'I', 'X', 'Y', 'Z'};
+  std::string result(word.size(), 'I');
+  for (int64_t i = 0; i < word.size(); ++i)
+    result[i] = paulis_name[word[i]];
+  return result;
+}
+
+static cudaq::spin_op generate_cudaq_spin(int64_t id, int64_t num_qubits,
+                                          bool addI = true) {
+  constexpr int64_t mask = 0x3;
+  cudaq::spin_op result;
+  for (int64_t i = 0; i < num_qubits; ++i) {
+    switch (paulis[id & mask]) {
+    case Pauli::I:
+      if (addI)
+        result *= cudaq::spin::i(i);
+      break;
+    case Pauli::X:
+      result *= cudaq::spin::x(i);
+      break;
+    case Pauli::Y:
+      result *= cudaq::spin::y(i);
+      break;
+    case Pauli::Z:
+      result *= cudaq::spin::z(i);
+      break;
+    }
+    id >>= 2;
+  }
+  return result;
+}
+
 using namespace cudaq::spin;
 
 TEST(SpinOpTester, checkConstruction) {
@@ -82,61 +165,26 @@ TEST(SpinOpTester, checkBug178) {
 }
 
 TEST(SpinOpTester, checkMultiplication) {
-  auto mult = x(0) * y(1);
-  mult.dump();
-  auto mult3 = y(0) * y(1);
-  mult3.dump();
-
-  auto tmp = 2 * y(1);
-  tmp.dump();
-
-  auto mult2 = x(3) * tmp;
-  mult2.dump();
-
-  std::cout << "X * Z: -iY\n";
-  (x(3) * z(3)).dump();
-  EXPECT_EQ(y(3), x(3) * z(3));
-  EXPECT_EQ((x(3) * z(3)).get_coefficient(), std::complex<double>(0, -1));
-
-  std::cout << "X * X: I\n";
-  (x(2) * x(2)).dump();
-  EXPECT_EQ(cudaq::spin_op(), x(2) * x(2));
-  EXPECT_EQ((x(2) * x(2)).get_coefficient(), std::complex<double>(1, 0));
-
-  std::cout << "Y * Y: I\n";
-  (y(14) * y(14)).dump();
-  EXPECT_EQ(cudaq::spin_op(), y(14) * y(14));
-  EXPECT_EQ((y(14) * y(14)).get_coefficient(), std::complex<double>(1, 0));
-
-  std::cout << "Z * Z: I\n";
-  (z(0) * z(0)).dump();
-  EXPECT_EQ(cudaq::spin_op(), z(0) * z(0));
-  EXPECT_EQ((z(0) * z(0)).get_coefficient(), std::complex<double>(1, 0));
-
-  std::cout << "X * Y: iZ\n";
-  (x(3) * y(3)).dump();
-  EXPECT_EQ(z(3), x(3) * y(3));
-  EXPECT_EQ((x(3) * y(3)).get_coefficient(), std::complex<double>(0, 1));
-
-  std::cout << "I * I: I\n";
-  (i(2) * i(2)).dump();
-  EXPECT_EQ(i(2), i(2) * i(2));
-  EXPECT_EQ((i(2) * i(2)).get_coefficient(), std::complex<double>(1, 0));
-
-  std::cout << "I * Z: Z\n";
-  (i(3) * i(3)).dump();
-  EXPECT_EQ(z(3), i(3) * z(3));
-  EXPECT_EQ((i(3) * z(3)).get_coefficient(), std::complex<double>(1, 0));
-
-  auto tmp2 = 2 * x(0) * x(1) * y(2) * y(3) + 3 * y(0) * y(1) * x(2) * x(3);
-  std::cout << "START\n";
-  tmp2 = tmp2 * tmp2;
-  tmp2.dump();
-
-  EXPECT_EQ(2, tmp2.num_terms());
-  auto expected =
-      13 * i(0) * i(1) * i(2) * i(3) + 12 * z(0) * z(1) * z(2) * z(3);
-  EXPECT_EQ(expected, tmp2);
+  for (int num_qubits = 1; num_qubits <= 4; ++num_qubits) {
+    int64_t num_words = std::pow(4, num_qubits);
+    for (int64_t i = 0; i < num_words; ++i) {
+      for (int64_t j = 0; j < num_words; ++j) {
+        // Expected result:
+        std::vector<Pauli> a_word = generate_pauli_word(i, num_qubits);
+        std::vector<Pauli> b_word = generate_pauli_word(j, num_qubits);
+        auto [phase, result] = multiply_pauli_words(a_word, b_word);
+
+        // Result:
+        cudaq::spin_op a_spin = generate_cudaq_spin(i, num_qubits);
+        cudaq::spin_op b_spin = generate_cudaq_spin(j, num_qubits, false);
+        cudaq::spin_op result_spin = a_spin * b_spin;
+
+        // Check result
+        EXPECT_EQ(generate_pauli_string(result), result_spin.to_string(false));
+        EXPECT_EQ(phase, result_spin.get_coefficient());
+      }
+    }
+  }
 }
 
 TEST(SpinOpTester, canBuildDeuteron) {