Merge pull request #2055 from vneiger/mulmod_shoup

mulmod_shoup and nmod_vec_scalar_mul_nmod
flintlib · Sep 6, 2024 · 06c1720 · 06c1720
2 parents decd504 + 8a2bfbb
commit 06c1720
Show file tree

Hide file tree

Showing 40 changed files with 2,285 additions and 240 deletions.
diff --git a/configure.ac b/configure.ac
@@ -673,13 +673,13 @@ Please report at <https://github.com/flintlib/flint/issues/>])
         param_path="x86_64/zen3"
         ;;
       zen4)
-        gcc_cflags_arch="-march=znver4 -march=znver3"
-        param_path="x86_64/zen3"
+        gcc_cflags_arch="-march=znver4"
+        param_path="x86_64/zen4"
         have_avx512="yes"
         ;;
       coreibwl | broadwell)
         gcc_cflags_arch="-march=broadwell"
-        param_path="x86_64/skylake"
+        param_path="x86_64/broadwell"
         ;;
       skylake)
         gcc_cflags_arch="-march=skylake"
@@ -701,12 +701,12 @@ Please report at <https://github.com/flintlib/flint/issues/>])
         ;;
       icelake)
         gcc_cflags_arch="-march=icelake-client"
-        param_path="x86_64/skylake"
+        param_path="x86_64/icelake"
         have_avx512="yes"
         ;;
       icelake_server)
         gcc_cflags_arch="-march=icelake-server"
-        param_path="x86_64/skylake"
+        param_path="x86_64/icelake"
         have_avx512="yes"
         ;;
       rocketlake)
@@ -737,7 +737,7 @@ Please report at <https://github.com/flintlib/flint/issues/>])
         have_avx512="yes"
         ;;
       cometlake)
-        gcc_cflags_arch="-march=kabylake"
+        gcc_cflags_arch="-march=skylake"
         param_path="x86_64/skylake"
         ;;
       *)

diff --git a/doc/source/nmod_mat.rst b/doc/source/nmod_mat.rst
@@ -76,6 +76,12 @@ Memory management
     Swaps two matrices by swapping the individual entries rather than swapping
     the contents of the structs.
 
+.. function:: void nmod_mat_set_mod(nmod_mat_t mat, ulong n);
+
+    Sets the modulus of an already initialized matrix ``mat`` to be `n`. Row
+    and column dimensions are unchanged, and allocated memory is unaffected.
+    Caution: this does not reduce the entries of ``mat`` modulo `n`.
+
 
 Basic properties and manipulation
 --------------------------------------------------------------------------------
@@ -142,7 +148,7 @@ Concatenate
 
 .. function:: void nmod_mat_concat_vertical(nmod_mat_t res, const nmod_mat_t mat1, const nmod_mat_t mat2)
 
-    Sets ``res`` to vertical concatenation of (`mat1`, ``mat2``) in that order. Matrix dimensions : ``mat1`` : `m \times n`, ``mat2`` : `k \times n`, ``res`` : `(m + k) \times n`.
+    Sets ``res`` to vertical concatenation of (``mat1``, ``mat2``) in that order. Matrix dimensions : ``mat1`` : `m \times n`, ``mat2`` : `k \times n`, ``res`` : `(m + k) \times n`.
 
 .. function:: void nmod_mat_concat_horizontal(nmod_mat_t res, const nmod_mat_t mat1, const nmod_mat_t mat2)
 
@@ -309,16 +315,16 @@ Matrix-scalar arithmetic
 
 .. function:: void nmod_mat_scalar_mul(nmod_mat_t B, const nmod_mat_t A, ulong c)
 
-    Sets `B = cA`, where the scalar `c` is assumed to be reduced
-    modulo the modulus. Dimensions of `A` and `B` must be identical.
+    Sets `B = cA`, where the scalar `c` is assumed to be reduced modulo the
+    modulus. Dimensions of `A` and `B` must be identical.
 
-.. function:: void nmod_mat_scalar_addmul_ui(nmod_mat_t dest, const nmod_mat_t X, const nmod_mat_t Y, const ulong b)
+.. function:: void nmod_mat_scalar_addmul_ui(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B, const ulong c)
 
-    Sets `dest = X + bY`, where the scalar `b` is assumed to be reduced
-    modulo the modulus. Dimensions of dest, X and Y must be identical.
-    dest can be aliased with X or Y.
+    Sets `C = A + cB`, where the scalar `c` is assumed to be reduced modulo the
+    modulus. Dimensions of `C`, `A` and `B` must be identical.  `C` can be
+    aliased with `A` or `B`.
 
-.. function:: void nmod_mat_scalar_mul_fmpz(nmod_mat_t res, const nmod_mat_t M, const fmpz_t c)
+.. function:: void nmod_mat_scalar_mul_fmpz(nmod_mat_t B, const nmod_mat_t A, const fmpz_t c)
 
     Sets `B = cA`, where the scalar `c` is of type ``fmpz_t``. Dimensions of `A`
     and `B` must be identical.

diff --git a/doc/source/nmod_poly.rst b/doc/source/nmod_poly.rst
@@ -504,23 +504,17 @@ Scalar multiplication and division
     Adds ``poly`` multiplied by `c` to ``res``. The element `c` is assumed
     to be less than the modulus of ``poly``.
 
-.. function:: void _nmod_poly_make_monic(nn_ptr output, nn_srcptr input, slong len, nmod_t mod)
+.. function:: void _nmod_poly_make_monic(nn_ptr res, nn_srcptr poly, slong len, nmod_t mod)
 
-    Sets ``output`` to be the scalar multiple of ``input`` of
-    length ``len > 0`` that has leading coefficient one, if such a
-    polynomial exists. If the leading coefficient of ``input`` is not
-    invertible, ``output`` is set to the multiple of ``input`` whose
-    leading coefficient is the greatest common divisor of the leading
-    coefficient and the modulus of ``input``.
+    Requires that ``res`` and ``poly`` have length at least ``len``, with ``len
+    > 0``, and that ``poly[len-1]`` is invertible modulo ``mod.n``. Sets
+    ``res[i]`` to the modular product of `c` and ``poly[i]`` for `i` from `0`
+    to ``len-1``, where `c` is the inverse of ``poly[len-1]``.
 
-.. function:: void nmod_poly_make_monic(nmod_poly_t output, const nmod_poly_t input)
+.. function:: void nmod_poly_make_monic(nmod_poly_t res, const nmod_poly_t poly)
 
-    Sets ``output`` to be the scalar multiple of ``input`` with leading
-    coefficient one, if such a polynomial exists. If ``input`` is zero
-    an exception is raised. If the leading coefficient of ``input`` is not
-    invertible, ``output`` is set to the multiple of ``input`` whose
-    leading coefficient is the greatest common divisor of the leading
-    coefficient and the modulus of ``input``.
+    Sets ``res`` to be the scalar multiple of ``poly`` with leading coefficient
+    one. If ``poly`` is zero, an exception is raised.
 
 
 Bit packing and unpacking
@@ -1240,17 +1234,42 @@ Evaluation
 --------------------------------------------------------------------------------
 
 
+.. function:: ulong _nmod_poly_evaluate_nmod_precomp(nn_srcptr poly, slong len, ulong c, ulong c_precomp, nmod_t mod)
+
+    Evaluates ``poly`` at the value ``c`` and reduces modulo the given modulus
+    of ``poly``. The value ``c`` should be reduced modulo the modulus, and the
+    modulus must be less than `2^{\mathtt{FLINT\_BITS} - 1}`. The algorithm
+    used is Horner's method, with multiplications done via
+    :func:`n_mulmod_shoup` using the precomputed ``c_precomp`` obtained via
+    :func:`n_mulmod_precomp_shoup`.
+
+.. function:: ulong _nmod_poly_evaluate_nmod_precomp_lazy(nn_srcptr poly, slong len, ulong c, ulong c_precomp, nmod_t mod)
+
+    Evaluates ``poly`` at the value ``c`` and reduces modulo the given modulus
+    of ``poly``. The value ``c`` should be reduced modulo the modulus, and the
+    modulus `n` must satisfy `3n-2 < 2^{\mathtt{FLINT\_BITS}}` (this is `n \le
+    6148914691236517205` for 64 bits, and `n \le 1431655765` for 32 bits). The
+    algorithm used is Horner's method, with multiplications done as in
+    :func:`n_mulmod_shoup` using the precomputed ``c_precomp`` obtained via
+    :func:`n_mulmod_precomp_shoup`. Reductions modulo the modulus are delayed
+    to the very end of the computation.
+
 .. function:: ulong _nmod_poly_evaluate_nmod(nn_srcptr poly, slong len, ulong c, nmod_t mod)
 
-    Evaluates ``poly`` at the value ``c`` and reduces modulo the
-    given modulus of ``poly``. The value ``c`` should be reduced
-    modulo the modulus. The algorithm used is Horner's method.
+    Evaluates ``poly`` at the value ``c`` and reduces modulo the given modulus
+    of ``poly``. The value ``c`` should be reduced modulo the modulus. The
+    algorithm used is Horner's method, with multiplications done via
+    :func:`nmod_mul`.
 
 .. function:: ulong nmod_poly_evaluate_nmod(const nmod_poly_t poly, ulong c)
 
-    Evaluates ``poly`` at the value ``c`` and reduces modulo the
-    modulus of ``poly``. The value ``c`` should be reduced modulo
-    the modulus. The algorithm used is Horner's method.
+    Evaluates ``poly`` at the value ``c`` and reduces modulo the modulus of
+    ``poly``. The value ``c`` should be reduced modulo the modulus. The
+    algorithm used is Horner's method, with multiplications and additions done
+    differently depending on the modulus ``poly->mod`` and on the degree (calls
+    one of :func:`_nmod_poly_evaluate_nmod`,
+    :func:`_nmod_poly_evaluate_nmod_precomp`,
+    :func:`_nmod_poly_evaluate_nmod_precomp_lazy`).
 
 .. function:: void nmod_poly_evaluate_mat_horner(nmod_mat_t dest, const nmod_poly_t poly, const nmod_mat_t c)
 

diff --git a/doc/source/nmod_vec.rst b/doc/source/nmod_vec.rst
@@ -112,8 +112,9 @@ Arithmetic operations
 .. function:: void _nmod_vec_scalar_mul_nmod_shoup(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod)
 
     Sets ``(res, len)`` to ``(vec, len)`` multiplied by `c` using
-    :func:`n_mulmod_shoup`. `mod.n` should be less than `2^{\mathtt{FLINT\_BITS} - 1}`. `c`
-    and all elements of ``vec`` should be less than ``mod.n``.
+    :func:`n_mulmod_shoup`. ``mod.n`` should be less than
+    `2^{\mathtt{FLINT\_BITS} - 1}`, and `c` should be less than ``mod.n``.
+    There is no constraint on elements of ``vec``.
 
 .. function:: void _nmod_vec_scalar_addmul_nmod(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod)
 

diff --git a/doc/source/ulong_extras.rst b/doc/source/ulong_extras.rst
@@ -596,19 +596,163 @@ Modular Arithmetic
     ``m`` then 0 is returned by the function and the location ``sqrt``
     points to is set to NULL.
 
-.. function:: ulong n_mulmod_shoup(ulong w, ulong t, ulong w_precomp, ulong p)
 
-    Returns `w t \bmod{p}` given a precomputed scaled approximation of `w / p`
-    computed by :func:`n_mulmod_precomp_shoup`. The value of `p` should be
-    less than `2^{\mathtt{FLINT\_BITS} - 1}`. `w` and `t` should be less than `p`.
-    Works faster than :func:`n_mulmod2_preinv` if `w` fixed and `t` from array
-    (for example, scalar multiplication of vector).
-
-.. function:: ulong n_mulmod_precomp_shoup(ulong w, ulong p)
-
-    Returns `w'`, scaled approximation of `w / p`. `w'`  is equal to the integer
-    part of `w \cdot 2^{\mathtt{FLINT\_BITS}} / p`.
+Modular Arithmetic with Fixed Operand
+--------------------------------------------------------------------------------
 
+This is about computing several modular multiplications where one operand and
+the modulus are fixed, that is, computing `a b_i \bmod n` for several `b_i`'s
+and fixed `a` and `n`. Most functions below require `a < n <
+2^{\mathtt{FLINT\_BITS} - 1}` but have no constraint on `b_i` (it can be `\ge
+n`).
+
+Some explanations about the method can be found after the functions
+descriptions. This has been introduced in NTL and is attributed to Victor
+Shoup. For references, see
+
+- NTL code (in particular file FFT.cpp, consulted in NTL v11.5.1);
+
+- David Harvey, Faster arithmetic for number-theoretic transforms, 2014 J.Symbolic Computation (http://dx.doi.org/10.1016/j.jsc.2013.09.002);
+
+- Victor Shoup, "Arithmetic Software Libraries", 2021 (https://doi.org/10.1017/9781108854207.012) (chapter 9 of the book https://doi.org/10.1017/9781108854207).
+
+.. function:: ulong n_mulmod_precomp_shoup(ulong a, ulong n)
+
+    Returns ``a_precomp``, a scaled approximation of `a / n`. This requires `a
+    < n`. Precisely, ``a_precomp``  is the integer `\lfloor a \cdot
+    2^{\mathtt{FLINT\_BITS}} / n \rfloor`, and is intended to be used as the
+    precomputed data for :func:`n_mulmod_shoup`, which requires `n <
+    2^{\mathtt{FLINT\_BITS} - 1}`.
+
+.. function:: void n_mulmod_precomp_shoup_quo_rem(ulong * a_pr_quo, ulong * a_pr_rem, ulong a, ulong n)
+
+     Sets ``a_pr_quo`` to the above scaled approximation of `\lfloor a \cdot
+     2^{\mathtt{FLINT\_BITS}} / n \rfloor`, which is the quotient in the integer
+     division of `a \cdot 2^{\mathtt{FLINT\_BITS}}` by `n`, and also sets
+     ``a_pr_rem`` to the remainder in this division. This requires `a < n` and
+     is intended to be used as the precomputed data for
+     :func:`n_mulmod_and_precomp_shoup`, which requires `n <
+     2^{\mathtt{FLINT\_BITS} - 1}`.
+
+.. function:: ulong n_mulmod_precomp_shoup_rem_from_quo(ulong a_pr_quo, ulong n)
+
+    Returns the remainder in the integer division of `a \cdot
+    2^{\mathtt{FLINT\_BITS}}` by `n`. This requires `a < n` and is intended to
+    be used as the precomputed data for :func:`n_mulmod_and_precomp_shoup`,
+    which requires `n < 2^{\mathtt{FLINT\_BITS} - 1}`. This is faster than
+    :func:`n_mulmod_precomp_shoup_quo_rem` when ``a_pr_quo`` is already known.
+
+.. function:: ulong n_mulmod_shoup(ulong a, ulong b, ulong a_precomp, ulong n)
+
+    Returns `a b \bmod n` given ``a_precomp``, a precomputed scaled
+    approximation of `a / n` equal to `\lfloor a \cdot 2^{\mathtt{FLINT\_BITS}}
+    / n \rfloor`. This requires `n < 2^{\mathtt{FLINT\_BITS} - 1}` and `a < n`,
+    there is no restriction on `b`. Works faster than other ``mulmod``
+    routines (e.g. :func:`n_mulmod2_preinv`) in situations where one repeats
+    modular multiplications with fixed `a` and `n` and several values of `b`'s,
+    which amortizes the time for precomputing ``a_precomp``. Examples are
+    scalar multiplication of vectors such as :func:`_nmod_vec_scalar_mul_nmod`
+    or of matrices such as :func:`_nmod_vec_scalar_mul_nmod`.
+
+.. function:: void n_mulmod_and_precomp_shoup(ulong * ab, ulong * ab_precomp, \
+                             ulong a, ulong b,                                \
+                             ulong a_pr_quo, ulong a_pr_rem, ulong b_precomp, \
+                             ulong n)
+
+    Sets ``ab`` to `a b \bmod n` and sets ``ab_precomp`` to the precomputed
+    scaled approximation for ``ab / n``, that is, `\lfloor (ab \bmod n) \cdot
+    2^{\mathtt{FLINT\_BITS}} / n \rfloor`. This requires `n <
+    2^{\mathtt{FLINT\_BITS} - 1}`, `a < n`, and `b < n`. The input ``a_pr_quo``
+    and ``a_pr_rem`` is as in the description of
+    :func:`n_mulmod_precomp_shoup_quo_rem`, and ``b_precomp`` is as in the
+    description of :func:`n_mulmod_precomp_shoup`. This can be used for example
+    when seeking a list of powers `[a \bmod n, a^2 \bmod n, a^3 \bmod n,
+    \ldots]` along with associated precomputed data to speed up repeated
+    modular multiplications by these fixed powers.
+
+Here are some explanations. Let `B = \mathtt{FLINT\_BITS}`, and `W = 2^B`. We have as input
+`a, b, n`, and the goal is mainly to output `ab \bmod n`. Constraints are: `a <
+n`, and `n` has `< B` bits, i.e. `0 \le a < n < 2^{B-1}`. There is no
+restriction on b.
+
+This is intended for repeated multiplications with fixed `a` and `n`, and
+varying `b`; hence the initial step is seen as a precomputation (it
+depends on `a` and `n` only).
+
+**Precomputation step:**  The main function is ``n_mulmod_precomp_shoup``,
+which computes `\mathtt{a\_precomp} = \lfloor a W / n \rfloor`. The requirement
+`a < n` ensures ``a_precomp`` fits in a word (i.e. `\mathtt{a\_precomp} < W`).
+The variant ending in ``_quo_rem`` computes the same quotient
+`\mathtt{a\_pr\_quo} = \mathtt{a\_precomp}` but also stores the remainder
+`\mathtt{a\_pr\_rem}` in the division of `a W` by `n`; the variant ending in
+``_rem_from_quo`` deduces the remainder from the quotient if the latter is
+already known.
+
+**Modular multiplication:** The main function is ``n_mulmod_shoup``, which
+takes `a, b, \hat{a}, n` as input and returns `ab \bmod n`, where `\hat{a} =
+\mathtt{a\_precomp}`. The steps are:
+
+1. find `\mathtt{p_hi}, \mathtt{p_lo}` such that `\hat{a} b = \mathtt{p_hi} W + \mathtt{p_lo}`     (high part of a double-word multiplication, `\mathtt{p_lo}` will not be used)
+2. compute `c = ab - \mathtt{p_hi} n`                  (two single-word multiplications)
+3. if `c \ge n`, return `c-n`, else return `c`
+
+*Step 1.* One has `\mathtt{p_hi} = \lfloor ab/n \rfloor` or `\mathtt{p_hi} = \lfloor ab/n \rfloor - 1`.
+
+Proof: Write `aW = \hat{a} n + r`, with `0 \le r < n`.  Thus `\frac{\hat{a}
+b}{W} = \frac{ab}{n} - \frac{rb}{nW}`.  So `\mathtt{p_hi} = \lfloor
+\frac{\hat{a} b}{W} \rfloor = \lfloor \frac{ab}{n} - \frac{rb}{nW} \rfloor`.
+Clearly `\mathtt{p_hi} \le \lfloor \frac{ab}{n} \rfloor`. And `rb < nW` (since
+`r < n` and `b < W`) so `-\frac{rb}{nW} > -1`, hence `\mathtt{p_hi} \ge
+\lfloor \frac{ab}{n} \rfloor - 1`.
+
+*Step 2.* It follows that either `c = ab \bmod n` or `c = (ab \bmod n) + n`.
+
+This is where the restriction on `n` comes into play: allowing `n` to have `B`
+bits prevents us from detecting which case we are in (a comparison such as `c
+\ge n` would not tell). Since `n` has `< B` bits, `c \ge n` if and only if `c =
+(ab \bmod n) + n`.
+
+*Step 3.* we use this to detect which case we are in and correct the possible
+excess.
+
+**Combining precomputation and multiplication:** The main function is
+``n_mulmod_and_precomp_shoup``. The goal is to compute not only `ab \bmod n`,
+but also the corresponding quotient `\mathtt{ab\_precomp} = \lfloor (ab \bmod
+n) \cdot W / n \rfloor`. For this we require as input both the precomputed
+quotient and remainder for `a` (that is, ``a_pr_quo`` and ``a_pr_rem``), as
+well as the precomputed quotient for ``b`` (``b_precomp``). This adds the
+constraint that `b < n`.
+
+To simplify notation, write the integer division `aW = \hat{a} n + \check{a}`
+so that `\hat{a} = \mathtt{a\_pr\_quo}` and `\check{a} = \mathtt{a\_pr\_rem}`,
+and similarly for `bW = \hat{b} n + \check{b}` with `\hat{b} =
+\mathtt{b\_precomp}` (we will not use `\check{b}`).
+
+The first three steps follow the above-described approach to compute `ab \bmod
+n` using `\hat{a}` (but this time, the lower part of the double-word
+multiplication is kept):
+
+1. find `d, \hat{c}` such that `\hat{a} b = d W + \hat{c}` (double-word multiplication)
+2. compute `c = a b - d n` (two single-word multiplications)
+3. if `c \ge n` then `c \gets c - n`
+
+At this stage, `c = ab \bmod n`, and `\hat{c} = \hat{a} b \bmod W`. Now suppose
+we have the quotient `q = \lfloor \frac{\check{a} b}{n} \rfloor` in the
+division of `\check{a}b` by `n`. The claim is that `\hat{c} + q` is the sought
+precomputation quotient for `ab \bmod n` up to a multiple of `W`, that is,
+`\hat{c} + q = \lfloor \frac{c W}{n} \rfloor \bmod W`. Indeed, one has `\lfloor
+\frac{c W}{n} \rfloor = \lfloor \frac{ab W}{n} \rfloor \bmod W`, and it remains
+to observe that `\lfloor \frac{ab W}{n} \rfloor = \lfloor
+\frac{(\hat{a}n+\check{a}) b}{n} \rfloor = \hat{a} b + \lfloor \frac{\check{a}
+b}{n} \rfloor = \hat{c} + q \bmod W`.
+
+Therefore the remaining steps consist in computing the quotient `q`, which can
+be done via the modular multiplication of `\check{a}` by `b` using the
+precomputed quotient for `b`:
+
+4. find `q,e` such that `\hat{b} \check{a} = q W + e` (high-part of double word multiplication, `e` will not be used)
+5. compute `h = b \check{a} - q n` (two single-word multiplications)
+6. if `h \ge n` then `q \gets q+1`
 
 Divisibility testing
 --------------------------------------------------------------------------------

diff --git a/src/fft_small/mulmod_statisfies_bounds.c → src/fft_small/mulmod_satisfies_bounds.c b/src/fft_small/mulmod_statisfies_bounds.c → src/fft_small/mulmod_satisfies_bounds.c
diff --git a/src/fq_poly_templates/test/t-set_nmod_poly.c b/src/fq_poly_templates/test/t-set_nmod_poly.c
@@ -41,7 +41,7 @@ TEST_TEMPLATE_FUNCTION_START(T, poly_set_nmod_poly, state)
 #endif
 
         nmod_poly_randtest(b, state, len);
-        p = n_randint(state, 10);
+        p = n_randint(state, b->mod.n);
 
         TEMPLATE(T, poly_set_nmod_poly)(a, b, ctx);
         TEMPLATE(T, set_ui)(r, p, ctx);

diff --git a/src/mpn_extras/arm64/applem1/flint-mparam.h b/src/mpn_extras/arm64/applem1/flint-mparam.h
@@ -27,4 +27,11 @@
 #define FFT_N_NUM                                 19
 #define FFT_MULMOD_2EXPP1_CUTOFF                 128
 
+/* warning: set by default, likely not optimal         */
+/* if you have the relevant architecture, you can help */
+/* determine this by running profiling files:          */
+/*    nmod_vec's p-scalar_mul, p-scalar_addmul         */
+/*    nmod_mat's p-nmod_vec_mul                        */
+#define FLINT_MULMOD_SHOUP_THRESHOLD 10
+
 #endif
diff --git a/src/mpn_extras/arm64/flint-mparam.h b/src/mpn_extras/arm64/flint-mparam.h
@@ -29,4 +29,6 @@
 #define FFT_N_NUM                                 19
 #define FFT_MULMOD_2EXPP1_CUTOFF                 128
 
+#define FLINT_MULMOD_SHOUP_THRESHOLD 10
+
 #endif
diff --git a/src/mpn_extras/generic/flint-mparam.h b/src/mpn_extras/generic/flint-mparam.h
@@ -29,4 +29,6 @@
 #define FFT_N_NUM                                 19
 #define FFT_MULMOD_2EXPP1_CUTOFF                 128
 
+#define FLINT_MULMOD_SHOUP_THRESHOLD 10
+
 #endif