Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add {checked_,overflowing_,wrapping_,}div_rem #11

Open
nlordell opened this issue Feb 21, 2022 · 5 comments · May be fixed by #32
Open

Add {checked_,overflowing_,wrapping_,}div_rem #11

nlordell opened this issue Feb 21, 2022 · 5 comments · May be fixed by #32
Labels
good first issue Good for newcomers

Comments

@nlordell
Copy link
Owner

Since, internally, the udivmod intrinsic computes both the quotient and remainder, it makes sense to expose a member method on {I,U}256 so callers can take advantage of this.

@nlordell nlordell added the good first issue Good for newcomers label Feb 22, 2022
@NCGThompson
Copy link
Contributor

Currently std doesn't have a div_rem function and relies on LLVM to merge / and %. Therefor, we don't have an exact mold to fill.

The most obvious and probably the best way to implement it is to add the methods to *int/api.rs. The only catch is that if std finally implements div_rem at a later date, it may have slightly different function signatures. Another (not mutex) option is to add a num-integer compatibility feature, but this is probably overkill.

Here is an example of a function signature:

pub fn checked_div_rem(&self, rhs: &Self) -> Option<(Self, Self)>

corresponding to the dividend, the divisor, the quotient, and the remainder respectively.

@nlordell
Copy link
Owner Author

I've kept non-std APIs in uint.rs and int.rs, so I would put it there. the api.rs was really meant to mimic the STD API as much as possible (non-standard functions are likely there by accident and not on purpose).

@nlordell
Copy link
Owner Author

nlordell commented Oct 26, 2023

We can also make a new extra module (or some other name) for this kind of function if you think it is worth it, but at this point, I'd probably just stick it in int.rs and uint.rs.

@NCGThompson
Copy link
Contributor

NCGThompson commented Oct 27, 2023

Signed div_euclid has both a / and a % of the same operands.

ethnum-rs/src/int/api.rs

Lines 1894 to 1900 in 1cd9388

pub fn div_euclid(self, rhs: Self) -> Self {
let q = self / rhs;
if self % rhs < 0 {
return if rhs > 0 { q - 1 } else { q + 1 };
}
q
}

I wrapped the function in an #[inline(never)] function foo, and did a release build to x86_64 with out LLVM intrinsics. Apparently the compiler is unable to optimize them together to one divrem call. It also looks like it has two different possibilities for panicking, but I'm unsure what they are.. You can find the function calls by skimming for assembly lines that are longer than the others.

This is an example of the usefulness of a divrem function.

Assembly output
ethnum::int::I256::foo:
 push    rbp
 push    r15
 push    r14
 push    r13
 push    r12
 push    rbx
 sub     rsp, 248
 mov     qword, ptr, [rsp, +, 88], rdi
 mov     rdi, qword, ptr, [rsi]
 mov     r8, qword, ptr, [rsi, +, 8]
 mov     r13, qword, ptr, [rsi, +, 24]
 mov     r9, qword, ptr, [rsi, +, 16]
 xorps   xmm0, xmm0
 movups  xmmword, ptr, [rsp, +, 8], xmm0
 xorps   xmm3, xmm3
 movups  xmmword, ptr, [rsp, +, 24], xmm0
 movdqu  xmm0, xmmword, ptr, [rsp, +, 8]
 movdqu  xmm1, xmmword, ptr, [rdx]
 pcmpeqb xmm1, xmm0
 movdqu  xmm0, xmmword, ptr, [rdx, +, 16]
 movdqu  xmm2, xmmword, ptr, [rsp, +, 24]
 pcmpeqb xmm2, xmm0
 pand    xmm2, xmm1
 pmovmskb eax, xmm2
 cmp     eax, 65535
 je      .LBB10_1
 mov     rsi, r13
 shr     rsi, 63
 mov     r12, r13
 sar     r12, 63
 mov     rbp, qword, ptr, [rdx, +, 24]
 mov     rax, rbp
 shr     rax, 63
 mov     qword, ptr, [rsp], rdx
 mov     r14, rbp
 sar     r14, 63
 mov     rcx, qword, ptr, [rsp]
 mov     r15, qword, ptr, [rcx, +, 16]
 xor     r15, r14
 xor     r8, r12
 xor     rdi, r12
 xor     r13, r12
 xor     r9, r12
 mov     qword, ptr, [rsp, +, 176], rsi
 add     r9, rsi
 adc     r13, 0
 sub     rdi, r12
 sbb     r8, r12
 sbb     r9, 0
 sbb     r13, 0
 mov     qword, ptr, [rsp, +, 168], rdi
 mov     qword, ptr, [rsp, +, 96], rdi
 mov     qword, ptr, [rsp, +, 160], r8
 mov     qword, ptr, [rsp, +, 104], r8
 mov     qword, ptr, [rsp, +, 152], r9
 mov     qword, ptr, [rsp, +, 112], r9
 mov     rcx, qword, ptr, [rsp]
 mov     rdx, qword, ptr, [rcx, +, 8]
 xor     rdx, r14
 mov     rcx, qword, ptr, [rsp]
 mov     rbx, qword, ptr, [rcx]
 xor     rbx, r14
 xor     rbp, r14
 add     r15, rax
 adc     rbp, 0
 sub     rbx, r14
 sbb     rdx, r14
 sbb     r15, 0
 mov     qword, ptr, [rsp, +, 120], r13
 sbb     rbp, 0
 mov     qword, ptr, [rsp, +, 8], rbx
 mov     qword, ptr, [rsp, +, 144], rdx
 mov     qword, ptr, [rsp, +, 16], rdx
 mov     qword, ptr, [rsp, +, 24], r15
 mov     qword, ptr, [rsp, +, 32], rbp
 xor     r14, r12
 lea     rdi, [rsp, +, 184]
 lea     rsi, [rsp, +, 96]
 lea     rdx, [rsp, +, 8]
 xor     ecx, ecx
 call    qword, ptr, [rip, +, _ZN6ethnum10intrinsics6native6divmod8udivmod417h2baa86b59d534686E@GOTPCREL]
 mov     rsi, qword, ptr, [rsp, +, 184]
 mov     rcx, qword, ptr, [rsp, +, 192]
 mov     qword, ptr, [rsp, +, 128], rcx
 xor     rcx, r14
 mov     qword, ptr, [rsp, +, 136], rsi
 xor     rsi, r14
 mov     rax, qword, ptr, [rsp, +, 208]
 xor     rax, r14
 mov     rdx, qword, ptr, [rsp, +, 200]
 xor     rdx, r14
 sub     rdx, r14
 sbb     rax, r14
 sub     rsi, r14
 mov     qword, ptr, [rsp, +, 56], rsi
 sbb     rcx, r14
 mov     qword, ptr, [rsp, +, 72], rcx
 mov     r14, qword, ptr, [rsp]
 sbb     rdx, 0
 mov     qword, ptr, [rsp, +, 64], rdx
 sbb     rax, 0
 mov     qword, ptr, [rsp, +, 80], rax
 pxor    xmm0, xmm0
 movdqu  xmmword, ptr, [rsp, +, 8], xmm0
 movdqu  xmmword, ptr, [rsp, +, 24], xmm0
 movdqu  xmm0, xmmword, ptr, [rsp, +, 8]
 movdqu  xmm1, xmmword, ptr, [r14]
 pcmpeqb xmm1, xmm0
 movdqu  xmm0, xmmword, ptr, [r14, +, 16]
 movdqu  xmm2, xmmword, ptr, [rsp, +, 24]
 pcmpeqb xmm2, xmm0
 pand    xmm2, xmm1
 pmovmskb eax, xmm2
 cmp     eax, 65535
 je      .LBB10_4
 mov     rax, qword, ptr, [rsp, +, 152]
 mov     qword, ptr, [rsp, +, 112], rax
 mov     rax, qword, ptr, [rsp, +, 160]
 mov     qword, ptr, [rsp, +, 104], rax
 mov     rax, qword, ptr, [rsp, +, 168]
 mov     qword, ptr, [rsp, +, 96], rax
 mov     qword, ptr, [rsp, +, 120], r13
 mov     qword, ptr, [rsp, +, 32], rbp
 mov     qword, ptr, [rsp, +, 24], r15
 mov     rax, qword, ptr, [rsp, +, 144]
 mov     qword, ptr, [rsp, +, 16], rax
 mov     qword, ptr, [rsp, +, 8], rbx
 lea     rdi, [rsp, +, 184]
 lea     rsi, [rsp, +, 96]
 lea     rdx, [rsp, +, 8]
 lea     rcx, [rsp, +, 216]
 call    qword, ptr, [rip, +, _ZN6ethnum10intrinsics6native6divmod8udivmod417h2baa86b59d534686E@GOTPCREL]
 mov     rax, qword, ptr, [rsp, +, 224]
 xor     rax, r12
 mov     rcx, qword, ptr, [rsp, +, 216]
 xor     rcx, r12
 mov     rdx, qword, ptr, [rsp, +, 240]
 xor     rdx, r12
 mov     rsi, qword, ptr, [rsp, +, 232]
 xor     rsi, r12
 add     rsi, qword, ptr, [rsp, +, 176]
 adc     rdx, 0
 cmp     rcx, r12
 sbb     rax, r12
 sbb     rsi, 0
 sbb     rdx, 0
 js      .LBB10_7
 mov     rax, qword, ptr, [rsp, +, 88]
 mov     rcx, qword, ptr, [rsp, +, 56]
 mov     qword, ptr, [rax], rcx
 mov     rcx, qword, ptr, [rsp, +, 72]
 mov     qword, ptr, [rax, +, 8], rcx
 mov     rcx, qword, ptr, [rsp, +, 64]
 mov     qword, ptr, [rax, +, 16], rcx
 mov     rcx, qword, ptr, [rsp, +, 80]
 jmp     .LBB10_11
.LBB10_7:
 movdqu  xmm0, xmmword, ptr, [r14]
 movdqu  xmm1, xmmword, ptr, [r14, +, 16]
 pshufd  xmm2, xmm1, 238
 movq    rax, xmm2
 pxor    xmm2, xmm2
 pcmpeqd xmm1, xmm2
 movmskps ecx, xmm1
 xor     ecx, 15
 pcmpeqd xmm0, xmm2
 movmskps edx, xmm0
 xor     esi, esi
 xor     edx, 15
 setne   sil
 xor     edx, edx
 cmp     rdx, qword, ptr, [r14, +, 16]
 sbb     rdx, rax
 setl    al
 test    ecx, ecx
 movzx   eax, al
 cmovne  esi, eax
 test    sil, sil
 mov     rax, qword, ptr, [rsp, +, 88]
 je      .LBB10_8
 mov     rdi, qword, ptr, [rsp, +, 56]
 add     rdi, -1
 mov     rdx, qword, ptr, [rsp, +, 72]
 adc     rdx, -1
 xor     ecx, ecx
 mov     rsi, qword, ptr, [rsp, +, 128]
 or      qword, ptr, [rsp, +, 136], rsi
 sete    cl
 mov     rsi, qword, ptr, [rsp, +, 64]
 sub     rsi, rcx
 mov     rcx, qword, ptr, [rsp, +, 80]
 sbb     rcx, 0
 jmp     .LBB10_10
.LBB10_8:
 mov     rdi, qword, ptr, [rsp, +, 56]
 add     rdi, 1
 mov     rdx, qword, ptr, [rsp, +, 72]
 adc     rdx, 0
 mov     rsi, qword, ptr, [rsp, +, 64]
 adc     rsi, 0
 mov     rcx, qword, ptr, [rsp, +, 80]
 adc     rcx, 0
.LBB10_10:
 mov     qword, ptr, [rax], rdi
 mov     qword, ptr, [rax, +, 8], rdx
 mov     qword, ptr, [rax, +, 16], rsi
.LBB10_11:
 mov     qword, ptr, [rax, +, 24], rcx
 add     rsp, 248
 pop     rbx
 pop     r12
 pop     r13
 pop     r14
 pop     r15
 pop     rbp
 ret
.LBB10_1:
 lea     rax, [rip, +, .L__unnamed_22]
 mov     qword, ptr, [rsp, +, 8], rax
 mov     qword, ptr, [rsp, +, 16], 1
 lea     rax, [rip, +, .L__unnamed_7]
 mov     qword, ptr, [rsp, +, 24], rax
 movups  xmmword, ptr, [rsp, +, 32], xmm3
 lea     rsi, [rip, +, .L__unnamed_23]
 lea     rdi, [rsp, +, 8]
 call    qword, ptr, [rip, +, _ZN4core9panicking9panic_fmt17hd1e971d8d7c78e0eE@GOTPCREL]
 ud2
.LBB10_4:
 lea     rax, [rip, +, .L__unnamed_6]
 mov     qword, ptr, [rsp, +, 8], rax
 mov     qword, ptr, [rsp, +, 16], 1
 lea     rax, [rip, +, .L__unnamed_7]
 mov     qword, ptr, [rsp, +, 24], rax
 pxor    xmm0, xmm0
 movdqu  xmmword, ptr, [rsp, +, 32], xmm0
 lea     rsi, [rip, +, .L__unnamed_23]
 lea     rdi, [rsp, +, 8]
 call    qword, ptr, [rip, +, _ZN4core9panicking9panic_fmt17hd1e971d8d7c78e0eE@GOTPCREL]
 ud2

@NCGThompson NCGThompson linked a pull request Oct 28, 2023 that will close this issue
@nlordell
Copy link
Owner Author

It also looks like it has two different possibilities for panicking, but I'm unsure what they are

I imagine it is for rhs = 0 (the two possibilities being the / and % which aren't being optimised into a single call as you observed).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
good first issue Good for newcomers
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants