diff --git a/src/intrinsics/native/divmod.rs b/src/intrinsics/native/divmod.rs
index 33557bd..92495ea 100644
--- a/src/intrinsics/native/divmod.rs
+++ b/src/intrinsics/native/divmod.rs
@@ -108,11 +108,9 @@ pub fn udivmod4(
         return;
     }
 
-    const N_UDWORD_BITS: u32 = 128;
-
-    let mut dividend = *a;
-    let mut divisor = *b;
-    let mut quotient: U256;
+    let dividend = *a;
+    let divisor = *b;
+    let quotient: U256;
     let mut remainder: U256;
 
     if divisor > dividend {
@@ -156,34 +154,199 @@ pub fn udivmod4(
         return;
     }
 
-    // 0 <= shift <= 127.
-    let shift = divisor.high().leading_zeros() - dividend.high().leading_zeros();
-
-    divisor <<= shift;
-    quotient = U256::ZERO;
-    for _ in 0..=shift {
-        *quotient.low_mut() <<= 1;
-        // Branch free version of.
-        // if (dividend.all >= divisor.all)
-        // {
-        //    dividend.all -= divisor.all;
-        //    carry = 1;
-        // }
-        let s = ((*divisor
-            .wrapping_sub(dividend)
-            .wrapping_sub(U256::ONE)
-            .high() as i128)
-            >> (N_UDWORD_BITS - 1)) as u128;
-        *quotient.low_mut() |= s & 1;
-        dividend -= divisor & U256::from_words(s, s);
-        divisor >>= 1;
-    }
+    (quotient, remainder) = div_mod_knuth(&dividend, &divisor);
+
     if let Some(rem) = rem {
-        rem.write(dividend);
+        rem.write(remainder);
     }
     res.write(quotient);
 }
 
+
+// See Knuth, TAOCP, Volume 2, section 4.3.1, Algorithm D.
+// https://skanthak.homepage.t-online.de/division.html
+#[inline]
+pub fn div_mod_knuth(u: &U256, v: &U256) -> (U256, U256) {
+    const N_UDWORD_BITS: u32 = 128;
+
+    #[inline]
+    fn full_shl(a: &U256, shift: u32) -> [u128; 3] {
+        debug_assert!(shift < N_UDWORD_BITS);
+        let mut u = [0_u128; 3];
+        let u_lo = a.low() << shift;
+        let u_hi = a >> N_UDWORD_BITS - shift;
+        u[0] = u_lo;
+        u[1] = *u_hi.low();
+        u[2] = *u_hi.high();
+
+        u
+    }
+
+    #[inline]
+    fn full_shr(u: &[u128; 3], shift: u32) -> U256 {
+        debug_assert!(shift < N_UDWORD_BITS);
+        let mut res = U256::ZERO;
+        *res.low_mut() = u[0] >> shift;
+        *res.high_mut() = u[1] >> shift;
+        // carry
+        if shift > 0 {
+            let sh = N_UDWORD_BITS - shift;
+            *res.low_mut() |= u[1] << sh;
+            *res.high_mut() |= u[2] << sh;
+        }
+
+        res
+    }
+
+    // returns (lo, hi)
+    #[inline]
+    const fn split_u128_to_u128(a: u128) -> (u128, u128) {
+        (a & 0xFFFFFFFFFFFFFFFF, a >> N_UDWORD_BITS / 2)
+    }
+
+    // returns (lo, hi)
+    #[inline]
+    const fn fullmul_u128(a: u128, b: u128) -> (u128, u128) {
+        let (a0, a1) = split_u128_to_u128(a);
+        let (b0, b1) = split_u128_to_u128(b);
+
+        let mut t = a0 * b0;
+        let mut k: u128;
+        let w3: u128;
+        (w3, k) = split_u128_to_u128(t);
+
+        t = a1 * b0 + k;
+        let (w1, w2) = split_u128_to_u128(t);
+        t = a0 * b1 + w1;
+        k = t >> 64;
+
+        let w_hi = a1 * b1 + w2 + k;
+        let w_lo = (t << 64) + w3;
+
+        (w_lo, w_hi)
+    }
+
+    #[inline]
+    fn fullmul_u256_u128(a: &U256, b: u128) -> [u128; 3] {
+        let mut acc = [0_u128; 3];
+        let mut lo: u128;
+        let mut carry: u128;
+        let c: bool;
+        if b != 0 {
+            (lo, carry) = fullmul_u128(*a.low(), b);
+            acc[0] = lo;
+            acc[1] = carry;
+            (lo, carry) = fullmul_u128(*a.high(), b);
+            (acc[1], c) = acc[1].overflowing_add(lo);
+            acc[2] = carry + c as u128;
+        }
+
+        acc
+    }
+
+    #[inline]
+    const fn add_carry(a: u128, b: u128, c: bool) -> (u128, bool) {
+        let (res1, overflow1) = b.overflowing_add(c as u128);
+        let (res2, overflow2) = u128::overflowing_add(a, res1);
+
+        (res2, overflow1 || overflow2)
+    }
+
+    #[inline]
+    const fn sub_carry(a: u128, b: u128, c: bool) -> (u128, bool) {
+        let (res1, overflow1) = b.overflowing_add(c as u128);
+        let (res2, overflow2) = u128::overflowing_sub(a, res1);
+
+        (res2, overflow1 || overflow2)
+    }
+
+    // D1.
+    // Make sure 128th bit in v's highest word is set.
+    // If we shift both u and v, it won't affect the quotient
+    // and the remainder will only need to be shifted back.
+    let shift = v.high().leading_zeros();
+    debug_assert!(shift < N_UDWORD_BITS);
+    let v = v << shift;
+    debug_assert!(v.high() >> N_UDWORD_BITS - 1 == 1);
+    // u will store the remainder (shifted)
+    let mut u = full_shl(u, shift);
+
+    // quotient
+    let mut q = U256::ZERO;
+    let v_n_1 = *v.high();
+    let v_n_2 = *v.low();
+
+    // D2. D7. - unrolled loop j == 0, n == 2, m == 0 (only one possible iteration)
+    let mut r_hat: u128 = 0;
+    let u_jn = u[2];
+
+    // D3.
+    // q_hat is our guess for the j-th quotient digit
+    // q_hat = min(b - 1, (u_{j+n} * b + u_{j+n-1}) / v_{n-1})
+    // b = 1 << WORD_BITS
+    // Theorem B: q_hat >= q_j >= q_hat - 2
+    let mut q_hat = if u_jn < v_n_1 {
+        //let (mut q_hat, mut r_hat) = _div_mod_u128(u_jn, u[j + n - 1], v_n_1);
+        let mut q_hat = udiv256_by_128_to_128(u_jn, u[1], v_n_1, &mut r_hat);
+        let mut overflow: bool;
+        // this loop takes at most 2 iterations
+        loop {
+            let another_iteration = {
+                // check if q_hat * v_{n-2} > b * r_hat + u_{j+n-2}
+                let (lo, hi) = fullmul_u128(q_hat, v_n_2);
+                hi > r_hat || (hi == r_hat && lo > u[0])
+            };
+            if !another_iteration {
+                break;
+            }
+            q_hat -= 1;
+            (r_hat, overflow) = r_hat.overflowing_add(v_n_1);
+            // if r_hat overflowed, we're done
+            if overflow {
+                break;
+            }
+        }
+        q_hat
+    } else {
+        // here q_hat >= q_j >= q_hat - 1
+        u128::MAX
+    };
+
+    // ex. 20:
+    // since q_hat * v_{n-2} <= b * r_hat + u_{j+n-2},
+    // either q_hat == q_j, or q_hat == q_j + 1
+
+    // D4.
+    // let's assume optimistically q_hat == q_j
+    // subtract (q_hat * v) from u[j..]
+    let q_hat_v = fullmul_u256_u128(&v, q_hat);
+    // u[j..] -= q_hat_v;
+    let mut c = false;
+    (u[0], c) = sub_carry(u[0], q_hat_v[0], c);
+    (u[1], c) = sub_carry(u[1], q_hat_v[1], c);
+    (u[2], c) = sub_carry(u[2], q_hat_v[2], c);
+
+    // D6.
+    // actually, q_hat == q_j + 1 and u[j..] has overflowed
+    // highly unlikely ~ (1 / 2^127)
+    if c {
+        q_hat -= 1;
+        // add v to u[j..]
+        c = false;
+        (u[0], c) = add_carry(u[0], *v.low(), c);
+        (u[1], c) = add_carry(u[1], *v.high(), c);
+        u[2] = u[2].wrapping_add(c as u128);
+    }
+
+    // D5.
+    *q.low_mut() = q_hat;
+
+    // D8.
+    let remainder = full_shr(&u, shift);
+
+    (q, remainder)
+}
+
 #[inline]
 pub fn udiv2(r: &mut U256, a: &U256) {
     let (a, b) = (*r, a);