From 32e154e729935d7fd608f3da1b1479db001f8e1d Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Tue, 17 Sep 2024 14:48:18 +0200 Subject: [PATCH 1/4] generic Toom-3 multiplication for gr_poly --- doc/source/gr_poly.rst | 12 ++ src/gr_poly.h | 3 +- src/gr_poly/mul_toom33.c | 203 ++++++++++++++++++++++++++++++++ src/gr_poly/test/main.c | 2 + src/gr_poly/test/t-mul_toom33.c | 106 +++++++++++++++++ 5 files changed, 325 insertions(+), 1 deletion(-) create mode 100644 src/gr_poly/mul_toom33.c create mode 100644 src/gr_poly/test/t-mul_toom33.c diff --git a/doc/source/gr_poly.rst b/doc/source/gr_poly.rst index aa820dab16..f9f791b467 100644 --- a/doc/source/gr_poly.rst +++ b/doc/source/gr_poly.rst @@ -170,6 +170,18 @@ Arithmetic algorithm with `O(n^{1.6})` complexity, the ring must overload :func:`_gr_poly_mul` to dispatch to :func:`_gr_poly_mul_karatsuba` above some cutoff. +.. function:: int _gr_poly_mul_toom33(gr_ptr res, gr_srcptr poly1, slong len1, gr_srcptr poly2, slong len2, gr_ctx_t ctx); + int gr_poly_mul_toom33(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, gr_ctx_t ctx); + + Balanced Toom-3 multiplication with interpolation in five points, + using the Bodrato evaluation scheme. Assumes commutativity and division by 3. + Not optimized for squaring. + The underscore method requires positive lengths and does not support aliasing. + This function calls :func:`_gr_poly_mul` recursively rather than itself, so to get a recursive + algorithm with `O(n^{1.5})` complexity, the ring must overload :func:`_gr_poly_mul` to dispatch + to :func:`_gr_poly_mul_toom33` above some cutoff. 
+ + Powering -------------------------------------------------------------------------------- diff --git a/src/gr_poly.h b/src/gr_poly.h index 6ba945a21c..9ce700b3fd 100644 --- a/src/gr_poly.h +++ b/src/gr_poly.h @@ -124,7 +124,8 @@ WARN_UNUSED_RESULT int gr_poly_mul_scalar(gr_poly_t res, const gr_poly_t poly, g WARN_UNUSED_RESULT int _gr_poly_mul_karatsuba(gr_ptr res, gr_srcptr poly1, slong len1, gr_srcptr poly2, slong len2, gr_ctx_t ctx); WARN_UNUSED_RESULT int gr_poly_mul_karatsuba(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, gr_ctx_t ctx); - +WARN_UNUSED_RESULT int _gr_poly_mul_toom33(gr_ptr res, gr_srcptr poly1, slong len1, gr_srcptr poly2, slong len2, gr_ctx_t ctx); +WARN_UNUSED_RESULT int gr_poly_mul_toom33(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, gr_ctx_t ctx); /* powering */ diff --git a/src/gr_poly/mul_toom33.c b/src/gr_poly/mul_toom33.c new file mode 100644 index 0000000000..dfcb8e2f2f --- /dev/null +++ b/src/gr_poly/mul_toom33.c @@ -0,0 +1,203 @@ +/* + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "gr_vec.h" +#include "gr_poly.h" + +/* + Toom33 (interpolation in 5 points) using Bodrato scheme + http://marco.bodrato.it/papers/Bodrato2007-OptimalToomCookMultiplicationForBinaryFieldAndIntegers.pdf + + Assumes commutativity, division by 3. + Todo: squaring version. + Todo: skip unnecessary zero-extensions of vectors and tighten + allocations. 
+*/ +int +_gr_poly_mul_toom33(gr_ptr res, gr_srcptr f, slong flen, gr_srcptr g, slong glen, gr_ctx_t ctx) +{ + gr_srcptr U0, U1, U2, V0, V1, V2; + gr_ptr tmp, W0, W1, W2, W3, W4; + slong m, U2len, V2len, U1len, V1len, U0len, V0len, rlen, len; + slong sz = ctx->sizeof_elem; + slong alloc; + int status = GR_SUCCESS; + + /* TODO: should explicitly call basecase mul. */ + if (flen <= 1 || glen <= 1) + return _gr_poly_mullow_generic(res, f, flen, g, glen, flen + glen - 1, ctx); + + /* U = U2*x^(2m) + U1*x^m + U0 */ + /* V = V2*x^(2m) + V1*x^m + V0 */ + /* Each block has length m */ + m = FLINT_MAX(flen, glen); + m = (m + 3 - 1) / 3; + U0 = f; + U1 = GR_ENTRY(f, m, sz); + U2 = GR_ENTRY(f, 2 * m, sz); + V0 = g; + V1 = GR_ENTRY(g, m, sz); + V2 = GR_ENTRY(g, 2 * m, sz); + + U2len = FLINT_MAX(flen - 2 * m, 0); + V2len = FLINT_MAX(glen - 2 * m, 0); + U1len = FLINT_MIN(FLINT_MAX(flen - m, 0), m); + V1len = FLINT_MIN(FLINT_MAX(glen - m, 0), m); + U0len = FLINT_MIN(flen, m); + V0len = FLINT_MIN(glen, m); + + alloc = 10 * m; + GR_TMP_INIT_VEC(tmp, alloc, ctx); + W0 = tmp; + W1 = GR_ENTRY(W0, 2 * m, sz); + W2 = GR_ENTRY(W1, 2 * m, sz); + W3 = GR_ENTRY(W2, 2 * m, sz); + W4 = GR_ENTRY(W3, 2 * m, sz); + + /* Evaluation: 5*2 add, 2 shift; 5mul */ + /* W0 = U2 + U0 */ + /* if max(U2len,U0len) < m, assumes top coefficients are already zeroed from the initialization */ + status |= _gr_poly_add(W0, U2, U2len, U0, U0len, ctx); + /* W4 = V2 + V0 */ + /* if max(V2len,V0len) < m, assumes top coefficients are already zeroed from the initialization */ + status |= _gr_poly_add(W4, V2, V2len, V0, V0len, ctx); + /* W2 = W0 - U1 */ + status |= _gr_poly_sub(W2, W0, m, U1, U1len, ctx); + /* W1 = W4 - V1 */ + status |= _gr_poly_sub(W1, W4, m, V1, V1len, ctx); + /* W0 = W0 + U1 */ + status |= _gr_poly_add(W0, W0, m, U1, U1len, ctx); + /* W4 = W4 + V1 */ + status |= _gr_poly_add(W4, W4, m, V1, V1len, ctx); + /* W3 = W2 * W1 */ + status |= _gr_poly_mul(W3, W2, m, W1, m, ctx); + /* W1 = W0 * W4 */ + 
status |= _gr_poly_mul(W1, W0, m, W4, m, ctx); + /* W0 = ((W0 + U2) << 1) - U0 */ + status |= _gr_poly_add(W0, W0, m, U2, U2len, ctx); + status |= _gr_vec_mul_scalar_2exp_si(W0, W0, m, 1, ctx); + status |= _gr_poly_sub(W0, W0, m, U0, U0len, ctx); + /* W4 = ((W4 + V2) << 1) - V0 */ + status |= _gr_poly_add(W4, W4, m, V2, V2len, ctx); + status |= _gr_vec_mul_scalar_2exp_si(W4, W4, m, 1, ctx); + status |= _gr_poly_sub(W4, W4, m, V0, V0len, ctx); + /* W2 = W0 * W4 */ + status |= _gr_poly_mul(W2, W0, m, W4, m, ctx); + /* W0 = U0 * V0 */ + if (U0len > 0 && V0len > 0) + { + status |= _gr_poly_mul(W0, U0, U0len, V0, V0len, ctx); + status |= _gr_vec_zero(GR_ENTRY(W0, U0len + V0len - 1, sz), 2 * m - (U0len + V0len - 1), ctx); + } + else + status |= _gr_vec_zero(W0, 2 * m, ctx); + /* W4 = U2 * V2 */ + if (U2len > 0 && V2len > 0) + { + status |= _gr_poly_mul(W4, U2, U2len, V2, V2len, ctx); + status |= _gr_vec_zero(GR_ENTRY(W4, U2len + V2len - 1, sz), 2 * m - (U2len + V2len - 1), ctx); + } + else + status |= _gr_vec_zero(W4, 2 * m, ctx); + + + /* toom42 variant */ + /* U = U3*x^(3m) + U2*x^(2m) + U1*x^m + U0 */ + /* V = V1*x^m + V0 */ + /* Evaluation: 7+3 add, 3 shift; 5mul */ + /* + W0 = U1 + U3; + W4 = U0 + U2; + W3 = W4 + W0; + W4 = W4 - W0; + W0 = V0 + V1; + W2 = V0 - V1; + W1 = W3 * W0; + W3 = W4 * W2; + W4 = (((((U3<<1) + U2) << 1) + U1) << 1) + U0; + W0 = W0 + V1; + W2 = W4 * W0; + W0 = U0 * V0; + W4 = U3 * V1; + */ + + /* Interpolation: 8 add, 3 shift, 1 Sdiv */ + len = 2 * m - 1; + /* W2 = (W2 - W3) / 3 */ + status |= _gr_vec_sub(W2, W2, W3, len, ctx); + status |= _gr_vec_divexact_scalar_ui(W2, W2, len, 3, ctx); + /* W3 = (W1 - W3) >> 1 */ + status |= _gr_vec_sub(W3, W1, W3, len, ctx); + status |= _gr_vec_mul_scalar_2exp_si(W3, W3, len, -1, ctx); + /* W1 = W1 - W0 */ + status |= _gr_vec_sub(W1, W1, W0, len, ctx); + /* W2 = ((W2 - W1) >> 1) - (W4 << 1) */ + status |= _gr_vec_sub(W2, W2, W1, len, ctx); + status |= _gr_vec_mul_scalar_2exp_si(W2, W2, len, -1, ctx); + 
status |= _gr_vec_mul_scalar_2exp_si(res, W4, len, 1, ctx); + status |= _gr_vec_sub(W2, W2, res, len, ctx); + /* W1 = W1 - W3 - W4 */ + status |= _gr_vec_sub(W1, W1, W3, len, ctx); + status |= _gr_vec_sub(W1, W1, W4, len, ctx); + /* W3 = W3 - W2 */ + status |= _gr_vec_sub(W3, W3, W2, len, ctx); + + /* Recomposition: */ + /* W = W4 * x^(4m) + W2*x^(3m) + W1*x^(2m) + W3*x^m + W0 */ + + rlen = flen + glen - 1; + len = FLINT_MIN(rlen, m); + status |= _gr_vec_set(res, W0, FLINT_MIN(rlen, m), ctx); + len = FLINT_MIN(rlen - m, m); + status |= _gr_vec_add(GR_ENTRY(res, m, sz), W3, GR_ENTRY(W0, m, sz), len, ctx); + len = FLINT_MIN(rlen - 2 * m, m); + status |= _gr_vec_add(GR_ENTRY(res, 2 * m, sz), W1, GR_ENTRY(W3, m, sz), len, ctx); + len = FLINT_MIN(rlen - 3 * m, m); + status |= _gr_vec_add(GR_ENTRY(res, 3 * m, sz), W2, GR_ENTRY(W1, m, sz), len, ctx); + len = FLINT_MIN(rlen - 4 * m, m); + status |= _gr_vec_add(GR_ENTRY(res, 4 * m, sz), W4, GR_ENTRY(W2, m, sz), len, ctx); + len = FLINT_MIN(rlen - 5 * m, m); + status |= _gr_vec_set(GR_ENTRY(res, 5 * m, sz), GR_ENTRY(W4, m, sz), len, ctx); + + GR_TMP_CLEAR_VEC(tmp, alloc, ctx); + + return status; +} + +int +gr_poly_mul_toom33(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, gr_ctx_t ctx) +{ + slong len_out; + int status; + + if (poly1->length == 0 || poly2->length == 0) + return gr_poly_zero(res, ctx); + + len_out = poly1->length + poly2->length - 1; + + if (res == poly1 || res == poly2) + { + gr_poly_t t; + gr_poly_init2(t, len_out, ctx); + status = _gr_poly_mul_toom33(t->coeffs, poly1->coeffs, poly1->length, poly2->coeffs, poly2->length, ctx); + gr_poly_swap(res, t, ctx); + gr_poly_clear(t, ctx); + } + else + { + gr_poly_fit_length(res, len_out, ctx); + status = _gr_poly_mul_toom33(res->coeffs, poly1->coeffs, poly1->length, poly2->coeffs, poly2->length, ctx); + } + + _gr_poly_set_length(res, len_out, ctx); + _gr_poly_normalise(res, ctx); + return status; +} diff --git a/src/gr_poly/test/main.c 
b/src/gr_poly/test/main.c index 62016ec560..04af841384 100644 --- a/src/gr_poly/test/main.c +++ b/src/gr_poly/test/main.c @@ -44,6 +44,7 @@ #include "t-log_series.c" #include "t-make_monic.c" #include "t-mul_karatsuba.c" +#include "t-mul_toom33.c" #include "t-nth_derivative.c" #include "t-pow_series_fmpq.c" #include "t-pow_series_ui.c" @@ -106,6 +107,7 @@ test_struct tests[] = TEST_FUNCTION(gr_poly_log_series), TEST_FUNCTION(gr_poly_make_monic), TEST_FUNCTION(gr_poly_mul_karatsuba), + TEST_FUNCTION(gr_poly_mul_toom33), TEST_FUNCTION(gr_poly_nth_derivative), TEST_FUNCTION(gr_poly_pow_series_fmpq), TEST_FUNCTION(gr_poly_pow_series_ui), diff --git a/src/gr_poly/test/t-mul_toom33.c b/src/gr_poly/test/t-mul_toom33.c new file mode 100644 index 0000000000..16439ac6b7 --- /dev/null +++ b/src/gr_poly/test/t-mul_toom33.c @@ -0,0 +1,106 @@ +/* + Copyright (C) 2023 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "test_helpers.h" +#include "ulong_extras.h" +#include "gr_poly.h" + +FLINT_DLL extern gr_static_method_table _ca_methods; + +int +test_mul1(flint_rand_t state, int which) +{ + gr_ctx_t ctx; + slong n; + gr_poly_t A, B, C, D; + int status = GR_SUCCESS; + + gr_ctx_init_random(ctx, state); + + gr_poly_init(A, ctx); + gr_poly_init(B, ctx); + gr_poly_init(C, ctx); + gr_poly_init(D, ctx); + + if (ctx->methods == _ca_methods) + n = 2; + else if (gr_ctx_is_finite(ctx) == T_TRUE) + n = 30; + else + n = 10; + + GR_MUST_SUCCEED(gr_poly_randtest(A, state, 1 + n_randint(state, n), ctx)); + GR_MUST_SUCCEED(gr_poly_randtest(B, state, 1 + n_randint(state, n), ctx)); + GR_MUST_SUCCEED(gr_poly_randtest(C, state, 1 + n_randint(state, n), ctx)); + + switch (which) + { + case 0: + status |= gr_poly_mul_toom33(C, A, B, ctx); + break; + case 1: + status |= gr_poly_set(C, A, ctx); + status |= gr_poly_mul_toom33(C, C, B, ctx); + break; + case 2: + status |= gr_poly_set(C, B, ctx); + status |= gr_poly_mul_toom33(C, A, C, ctx); + break; + case 3: + status |= gr_poly_set(B, A, ctx); + status |= gr_poly_mul_toom33(C, A, A, ctx); + break; + case 4: + status |= gr_poly_set(B, A, ctx); + status |= gr_poly_set(C, A, ctx); + status |= gr_poly_mul_toom33(C, C, C, ctx); + break; + + default: + flint_abort(); + } + + /* todo: should explicitly call basecase mul */ + status |= gr_poly_mullow(D, A, B, FLINT_MAX(0, A->length + B->length - 1), ctx); + + if (status == GR_SUCCESS && gr_poly_equal(C, D, ctx) == T_FALSE) + { + flint_printf("FAIL\n\n"); + flint_printf("which = %d, n = %wd\n\n", which, n); + gr_ctx_println(ctx); + flint_printf("A = "); gr_poly_print(A, ctx); flint_printf("\n\n"); + flint_printf("B = "); gr_poly_print(B, ctx); flint_printf("\n\n"); + flint_printf("C = "); gr_poly_print(C, ctx); flint_printf("\n\n"); + flint_printf("D = "); gr_poly_print(D, ctx); flint_printf("\n\n"); + flint_abort(); + } + + gr_poly_clear(A, ctx); + gr_poly_clear(B, ctx); + gr_poly_clear(C, ctx); 
+ gr_poly_clear(D, ctx); + + gr_ctx_clear(ctx); + + return status; +} + +TEST_FUNCTION_START(gr_poly_mul_toom33, state) +{ + slong iter; + + for (iter = 0; iter < 1000; iter++) + { + test_mul1(state, n_randint(state, 5)); + } + + TEST_FUNCTION_END(state); +} From eaceff20fa910629c1a67c21c02eade055f8bb49 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Tue, 17 Sep 2024 15:02:30 +0200 Subject: [PATCH 2/4] credit Bodrato for original pseudocode --- src/gr_poly/mul_toom33.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gr_poly/mul_toom33.c b/src/gr_poly/mul_toom33.c index dfcb8e2f2f..c443815442 100644 --- a/src/gr_poly/mul_toom33.c +++ b/src/gr_poly/mul_toom33.c @@ -1,4 +1,5 @@ /* + Copyright (C) 2007 Marco Bodrato Copyright (C) 2024 Fredrik Johansson This file is part of FLINT. From 83a1703313e20684c07b32365222298af909456b Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Tue, 17 Sep 2024 16:31:29 +0200 Subject: [PATCH 3/4] reduce Toom zero-padding --- src/gr_poly/mul_toom33.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/gr_poly/mul_toom33.c b/src/gr_poly/mul_toom33.c index c443815442..3be7c20ffb 100644 --- a/src/gr_poly/mul_toom33.c +++ b/src/gr_poly/mul_toom33.c @@ -28,6 +28,7 @@ _gr_poly_mul_toom33(gr_ptr res, gr_srcptr f, slong flen, gr_srcptr g, slong glen gr_srcptr U0, U1, U2, V0, V1, V2; gr_ptr tmp, W0, W1, W2, W3, W4; slong m, U2len, V2len, U1len, V1len, U0len, V0len, rlen, len; + slong W4len; slong sz = ctx->sizeof_elem; slong alloc; int status = GR_SUCCESS; @@ -101,14 +102,16 @@ _gr_poly_mul_toom33(gr_ptr res, gr_srcptr f, slong flen, gr_srcptr g, slong glen else status |= _gr_vec_zero(W0, 2 * m, ctx); /* W4 = U2 * V2 */ + /* We compute this length accurately instead of zero-extending. 
*/ if (U2len > 0 && V2len > 0) { + W4len = U2len + V2len - 1; status |= _gr_poly_mul(W4, U2, U2len, V2, V2len, ctx); - status |= _gr_vec_zero(GR_ENTRY(W4, U2len + V2len - 1, sz), 2 * m - (U2len + V2len - 1), ctx); } else - status |= _gr_vec_zero(W4, 2 * m, ctx); - + { + W4len = 0; + } /* toom42 variant */ /* U = U3*x^(3m) + U2*x^(2m) + U1*x^m + U0 */ @@ -143,16 +146,16 @@ _gr_poly_mul_toom33(gr_ptr res, gr_srcptr f, slong flen, gr_srcptr g, slong glen /* W2 = ((W2 - W1) >> 1) - (W4 << 1) */ status |= _gr_vec_sub(W2, W2, W1, len, ctx); status |= _gr_vec_mul_scalar_2exp_si(W2, W2, len, -1, ctx); - status |= _gr_vec_mul_scalar_2exp_si(res, W4, len, 1, ctx); - status |= _gr_vec_sub(W2, W2, res, len, ctx); + status |= _gr_vec_mul_scalar_2exp_si(res, W4, W4len, 1, ctx); + status |= _gr_vec_sub(W2, W2, res, W4len, ctx); /* W1 = W1 - W3 - W4 */ status |= _gr_vec_sub(W1, W1, W3, len, ctx); - status |= _gr_vec_sub(W1, W1, W4, len, ctx); + status |= _gr_poly_sub(W1, W1, len, W4, W4len, ctx); /* W3 = W3 - W2 */ status |= _gr_vec_sub(W3, W3, W2, len, ctx); /* Recomposition: */ - /* W = W4 * x^(4m) + W2*x^(3m) + W1*x^(2m) + W3*x^m + W0 */ + /* W = W4 * x^(4m) + W2*x^(3m) + W1*x^(2m) + W3*x^m + W0 */ rlen = flen + glen - 1; len = FLINT_MIN(rlen, m); @@ -164,7 +167,7 @@ _gr_poly_mul_toom33(gr_ptr res, gr_srcptr f, slong flen, gr_srcptr g, slong glen len = FLINT_MIN(rlen - 3 * m, m); status |= _gr_vec_add(GR_ENTRY(res, 3 * m, sz), W2, GR_ENTRY(W1, m, sz), len, ctx); len = FLINT_MIN(rlen - 4 * m, m); - status |= _gr_vec_add(GR_ENTRY(res, 4 * m, sz), W4, GR_ENTRY(W2, m, sz), len, ctx); + status |= _gr_poly_add(GR_ENTRY(res, 4 * m, sz), W4, FLINT_MIN(W4len, len), GR_ENTRY(W2, m, sz), len, ctx); len = FLINT_MIN(rlen - 5 * m, m); status |= _gr_vec_set(GR_ENTRY(res, 5 * m, sz), GR_ENTRY(W4, m, sz), len, ctx); From 6724256d82a254d6d3907df88be6d994f10c4df7 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Tue, 17 Sep 2024 16:39:11 +0200 Subject: [PATCH 4/4] clarify that exact division 
by 2 and 3 is required --- doc/source/gr_poly.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/gr_poly.rst b/doc/source/gr_poly.rst index f9f791b467..25f3b0cccd 100644 --- a/doc/source/gr_poly.rst +++ b/doc/source/gr_poly.rst @@ -174,7 +174,8 @@ Arithmetic int gr_poly_mul_toom33(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, gr_ctx_t ctx); Balanced Toom-3 multiplication with interpolation in five points, - using the Bodrato evaluation scheme. Assumes commutativity and division by 3. + using the Bodrato evaluation scheme. Assumes commutativity and that the ring + supports exact division by 2 and 3. Not optimized for squaring. The underscore method requires positive lengths and does not support aliasing. This function calls :func:`_gr_poly_mul` recursively rather than itself, so to get a recursive