Upgrade rust version (#16)
* Update rust-toolchain version.
* Replace llvm_asm with stabilized asm macro.
* Update asm comments
* Remove unused helper.
* Bump package versions.
remoun authored Mar 11, 2022
1 parent 456bf36 commit 7d59741
Showing 12 changed files with 129 additions and 153 deletions.
10 changes: 5 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion aligned-cmov/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "aligned-cmov"
version = "2.0.0"
version = "2.1.0"
description = "Fast constant-time conditional moves of aligned bytes"
authors = ["MobileCoin"]
license = "GPL-3.0"
222 changes: 104 additions & 118 deletions aligned-cmov/src/cmov_impl_asm.rs
@@ -1,4 +1,4 @@
// Copyright (c) 2018-2021 The MobileCoin Foundation
// Copyright (c) 2018-2022 The MobileCoin Foundation

//! Implementation of cmov on x86-64 using inline assembly.
//!
@@ -19,45 +19,41 @@
//! For now it seems simplest to use inline assembly for all of it.
use super::{A64Bytes, A8Bytes, ArrayLength};
use core::arch::asm;

// CMov for u32 values
#[inline]
pub fn cmov_u32(condition: bool, src: &u32, dest: &mut u32) {
// Create a temporary to be assigned to a register for cmov
let mut temp: u32 = *dest;
// Test condition ($1)
// cmovnz from src into temp ($2 to $0)
unsafe {
llvm_asm!("test $1, $1
cmovnz $0, $2"
:"+&r"(temp)
: "r"(condition), "rm"(*src)
: "cc"
: "volatile", "intel");
asm!(
// Set ZF if cond==0
"test {0}, {0}",
// Conditionally move src into temp (based on ZF)
// `:e` formats to the 32-bit register.
"cmovnz {2:e}, {1:e}",
in(reg_byte) condition as u8,
in(reg) *src,
// inout since we might not write, so we need the existing value.
inout(reg) *dest,
);
}
// "cc" is because we are setting flags in test
// "volatile" is meant to discourage the optimizer from inspecting this
*dest = temp;
}
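
For callers, the observable behaviour is unchanged by the port to asm!: cmov_u32 is a branch-free conditional assignment. A minimal usage sketch (illustrative only, not part of this diff):

// Sketch of the intended semantics: equivalent to `if condition { *dest = *src }`,
// but compiled to test + cmovnz rather than a data-dependent branch.
let src: u32 = 7;
let mut dest: u32 = 3;
cmov_u32(false, &src, &mut dest);
debug_assert_eq!(dest, 3); // condition false: dest keeps its value
cmov_u32(true, &src, &mut dest);
debug_assert_eq!(dest, 7); // condition true: dest takes src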

// CMov for u64 values
#[inline]
pub fn cmov_u64(condition: bool, src: &u64, dest: &mut u64) {
// Create a temporary to be assigned to a register for cmov
let mut temp: u64 = *dest;
// Test condition ($1)
// cmovnz from src into temp ($2 to $0)
unsafe {
llvm_asm!("test $1, $1
cmovnz $0, $2"
:"+&r"(temp)
: "r"(condition), "rm"(*src)
: "cc"
: "volatile", "intel");
asm!(
// Set ZF if cond==0
"test {0}, {0}",
// Conditionally move src into dest (based on ZF)
"cmovnz {2}, {1}",
in(reg_byte) condition as u8,
in(reg) *src,
// inout since we might not write, so we need the existing value.
inout(reg) *dest,
);
}
// "cc" is because we are setting flags in test
// "volatile" is meant to discourage the optimizer from inspecting this
*dest = temp;
}

// CMov for i32 values
@@ -124,7 +120,6 @@ pub fn cmov_a64_bytes<N: ArrayLength<u8>>(
// If we have avx2 then we can use cmov_byte_slice_a64 implementation
#[cfg(target_feature = "avx2")]
#[inline]
#[allow(unused)]
pub fn cmov_a64_bytes<N: ArrayLength<u8>>(
condition: bool,
src: &A64Bytes<N>,
@@ -151,13 +146,8 @@ pub fn cmov_a64_bytes<N: ArrayLength<u8>>(
// if condition { memcpy(dest, src, count * 8) }
// for pointers aligned to 8 byte boundary. Assumes count > 0
#[inline]
#[allow(unused)]
unsafe fn cmov_byte_slice_a8(condition: bool, src: *const u64, dest: *mut u64, mut count: usize) {
// Create a temporary to be assigned to a register for cmov
// We also use it to pass the condition in, which is only needed in a register
// before entering the loop.
let mut temp: u64 = condition as u64;

unsafe fn cmov_byte_slice_a8(condition: bool, src: *const u64, dest: *mut u64, count: usize) {
debug_assert!(count > 0, "count cannot be 0");
// The idea here is to test once before we enter the loop and reuse the test
// result for every cmov.
// The loop is a dec, jnz loop.
@@ -167,109 +157,105 @@ unsafe fn cmov_byte_slice_a8(condition: bool, src: *const u64, dest: *mut u64, m
// indexing. Because dec will clobber ZF, we can't use cmovnz. We store
// the result of outer test in CF which is not clobbered by dec.
// cmovc is contingent on CF
// negb is used to set CF to 1 iff the condition value was 1.
// neg is used to set CF to 1 iff the condition value was 1.
// Pity, we cannot cmov directly to memory
//
// temp is a "register output" because this is the way to obtain a scratch
// register when we don't care what actual register is used and we want the
// compiler to pick.
//
// count is modified by the assembly -- for this reason it has to be labelled
// as an input and an output, otherwise it's illegal to modify it.
//
// The condition is passed in through temp, then neg moves the value to CF.
// After that we don't need condition in a register, so temp register can be
// The condition is passed in through cond, then neg moves the value to CF.
// After that we don't need condition in a register, so cond register can be
// reused.
llvm_asm!("neg $0
loop_body_${:uid}:
mov $0, [$3 + 8*$1 - 8]
cmovc $0, [$2 + 8*$1 - 8]
mov [$3 + 8*$1 - 8], $0
dec $1
jnz loop_body_${:uid}"
: "+&r"(temp), "+&r"(count)
: "r"(src), "r"(dest)
: "cc", "memory"
: "volatile", "intel");
// cc is because we are setting flags in test
// memory is because we are dereferencing a bunch of pointers in asm
// volatile is because the output variable "temp" is not the true output
// of this block, the memory side-effects are.
asm!(
// Sets CF=0 iff cond==0, 1 otherwise.
"neg {0}",
// Must use numeric local labels, since this block may be inlined multiple times.
// https://doc.rust-lang.org/nightly/rust-by-example/unsafe/asm.html#labels
"42:",
// Copy into temp register from dest (for NOP default).
"mov {0}, [{3} + 8*{1} - 8]",
// Conditionally copy into temp register from src (based on CF).
"cmovc {0}, [{2} + 8*{1} - 8]",
// Copy from temp register into dest.
"mov [{3} + 8*{1} - 8], {0}",
// Decrement count. Sets ZF=1 when we reach 0.
"dec {1}",
// If ZF is not set, loop.
"jnz 42b",
// Discard outputs; the memory side effects are what's desired.
inout(reg) (condition as u64) => _,
inout(reg) count => _,
in(reg) src,
in(reg) dest,
);
}
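
As a reading aid (not from this diff), the loop above has the same observable result as the branching version below; the point of the asm is that every iteration performs the same loads and stores and only cmovc depends on the condition, via CF:

// Hypothetical reference version: functionally equivalent but NOT constant
// time, since it branches on `condition`. Shown only to document intent.
unsafe fn cmov_byte_slice_a8_reference(
    condition: bool,
    src: *const u64,
    dest: *mut u64,
    count: usize,
) {
    debug_assert!(count > 0, "count cannot be 0");
    if condition {
        core::ptr::copy_nonoverlapping(src, dest, count);
    }
}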

// Should be a constant time function equivalent to:
// if condition { memcpy(dest, src, num_bytes) }
// for pointers aligned to *64* byte boundary. Will fault if that is not the
// case. Assumes num_bytes > 0, and num_bytes divisible by 64!
// case. Requires num_bytes > 0, and num_bytes divisible by 64.
// This version uses AVX2 256-bit moves
#[cfg(target_feature = "avx2")]
#[inline]
#[allow(unused)]
unsafe fn cmov_byte_slice_a64(condition: bool, src: *const u64, dest: *mut u64, num_bytes: usize) {
debug_assert!(
num_bytes > 0,
"num_bytes cannot be 0, caller must handle that"
);
debug_assert!(
num_bytes % 64 == 0,
"num_bytes must be divisible by 64, caller must handle that"
);
// Similarly as before, we want to test once and use the test result for the
// whole loop.
//
// Before we enter the loop, we want to set ymm1 to all 0s or all 1s, depending
// on condition. We use neg to make it all 0s or all 1s 64bit, then vmovq to
// move that to xmm2, then vbroadcastsd to fill ymm1 with 1s or zeros.
//
// This time the cmov mechanism is:
// - VMOVDQA to load the source into a ymm register,
// - VPMASKMOVQ to move that to memory, after masking it with ymm1.
// An alternative approach which I didn't test is, MASKMOV to another ymm
// register, then move that register to memory.
//
debug_assert!(num_bytes > 0, "num_bytes cannot be 0");
debug_assert!(num_bytes % 64 == 0, "num_bytes must be divisible by 64");

// Notes:
// temp = $0 is the scratch register
// We aren't allowed to modify condition or num_bytes = $3 unless
// we annotate them appropriately as outputs.
// So, num_bytes is an in/out register, and we pass condition in via
// the scratch register, instead of a dedicated register.
// temp = {0} is the scratch register
//
// Once we have the mask in ymm1 we don't need the condition in a register
// anymore. Then, $0 is the loop variable, counting down from num_bytes in
// steps of 64
//
// The values of $0 in the loop are num_bytes, num_bytes - 64, ... 64,
// The values of {0} in the loop are num_bytes, num_bytes - 64, ... 64,
// rather than num_bytes - 64 ... 0. This is because the semantics of the loop
// are subtract 64, then test for 0, rather than test for 0, then subtract.
// So when we index using $0, we subtract 64 to compensate.
//
// We unroll the loop once because we can assume 64 byte alignment, but ymm
// register holds 32 bytes. So one round of vmovdqa, vpmaskmovq moves 32 bytes.
// So we do this twice in one pass through the loop.
//
// TODO: In AVX512 zmm registers we could move 64 bytes at once...
// So when we index using {0}, we subtract 64 to compensate.
// TODO: Does unrolling the loop more help?
let mut temp: u64 = condition as u64;

llvm_asm!("neg $0
vmovq xmm2, $0
vbroadcastsd ymm1, xmm2
mov $0, $3
loop_body2_${:uid}:
vmovdqa ymm2, [$1 + $0 - 64]
vpmaskmovq [$2 + $0 - 64], ymm1, ymm2
vmovdqa ymm3, [$1 + $0 - 32]
vpmaskmovq [$2 + $0 - 32], ymm1, ymm3
sub $0, 64
jnz loop_body2_${:uid}"
: "+&r"(temp)
: "r"(src), "r"(dest), "rmi"(num_bytes)
: "cc", "memory", "ymm1", "ymm2", "ymm3"
: "volatile", "intel");
// cc is because we are setting flags
// memory is because we are dereferencing a bunch of pointers in asm
// volatile is because the output variable "temp" is not the true output
// of this block, the memory side-effects are.
asm!(
// Similarly to cmov_byte_slice_a8, we want to test once and use the
// result for the whole loop.
// Before we enter the loop, we want to set ymm1 to all 0s or all 1s,
// depending on condition. We use neg to make all 64 bits 0s or all 1s
// (since neg(0) = 0 and neg(1) = -1 = 11111111b in two's complement),
// then vmovq to move that to xmm2, then vbroadcastsd to fill ymm1
// with ones or zeros.
"neg {0}",
"vmovq xmm2, {0}",
"vbroadcastsd ymm1, xmm2",
// Once we have the mask in ymm1 we don't need the condition in
// a register anymore. Then, {0} is the loop variable, counting down
// from num_bytes in steps of 64
"mov {0}, {3}",
// Must use numeric local labels, since this block may be inlined multiple times.
// https://doc.rust-lang.org/nightly/rust-by-example/unsafe/asm.html#labels
"42:",
// This time the cmov mechanism is:
// - VMOVDQA to load the source into a ymm register,
// - VPMASKMOVQ to move that to memory, after masking it with ymm1.
// An alternative approach, which I didn't test, is MASKMOV to
// another ymm register, then move that register to memory.
"vmovdqa ymm2, [{1} + {0} - 64]",
"vpmaskmovq [{2} + {0} - 64], ymm1, ymm2",
// We unroll the loop once because we can assume 64 byte alignment,
// but ymm register holds 32 bytes. So one round of
// vmovdqa+vpmaskmovq moves 32 bytes.
// So we do this twice in one pass through the loop.
//
// TODO: In AVX512 zmm registers we could move 64 bytes at once...
"vmovdqa ymm3, [{1} + {0} - 32]",
"vpmaskmovq [{2} + {0} - 32], ymm1, ymm3",
// Decrement num_bytes. Sets ZF=1 when we reach 0.
"sub {0}, 64",
// If ZF is not set, loop.
"jnz 42b",
// Discard output; the memory side effects are what's desired.
inout(reg) condition as u64 => _,
in(reg) src,
in(reg) dest,
in(reg) num_bytes,
// Scratch/Temp registers.
out("ymm1") _,
out("ymm2") _,
out("ymm3") _,
);
}
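
The neg trick used by both asm blocks is worth spelling out: in two's complement, negating a 0/1 value yields an all-zeros or all-ones word, which the scalar path consumes through CF and which this path broadcasts into ymm1 as the vpmaskmovq mask. A plain-Rust illustration (not from this diff):

// Illustrative only: wrapping negation turns a 0/1 condition into a full-width mask.
let as_mask = |cond: bool| (cond as u64).wrapping_neg();
assert_eq!(as_mask(true), u64::MAX); // all ones: every lane is stored
assert_eq!(as_mask(false), 0); // all zeros: the masked stores are suppressed

Note that the #[cfg(target_feature = "avx2")] gate only selects this path when the crate is compiled with AVX2 enabled, e.g. with -C target-feature=+avx2 in RUSTFLAGS.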

// TODO: In avx512 there is vmovdqa which takes a mask register, and kmovq to
1 change: 0 additions & 1 deletion aligned-cmov/src/lib.rs
@@ -1,7 +1,6 @@
// Copyright (c) 2018-2021 The MobileCoin Foundation

#![no_std]
#![feature(llvm_asm)]

pub use aligned_array::{subtle, Aligned, AsAlignedChunks, AsNeSlice, A64, A8};
pub use generic_array::{arr, typenum, ArrayLength, GenericArray};
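
With asm! stable since Rust 1.59, the #![feature(llvm_asm)] gate can simply be dropped. For readers unfamiliar with the new macro, the general shape of the migration looks roughly like this (a generic x86-64 sketch, not code from this repository):

use core::arch::asm;

// Old nightly-only form (the kind this commit removes), for comparison:
//     llvm_asm!("mov $0, $1" : "=r"(out) : "r"(input) : : "intel", "volatile");
// New stabilized form: positional {} operands with explicit in/out specifiers;
// Intel syntax is the default on x86.
fn copy_via_asm(input: u64) -> u64 {
    let out: u64;
    unsafe {
        asm!("mov {0}, {1}", out(reg) out, in(reg) input);
    }
    out
}
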
4 changes: 2 additions & 2 deletions balanced-tree-index/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "balanced-tree-index"
version = "2.0.0"
version = "2.1.0"
description = "Utilities for constant-time manipulation of a complete binary tree with a flat in-memory representation."
authors = ["MobileCoin"]
license = "GPL-3.0"
@@ -11,7 +11,7 @@ keywords = ["binary-tree", "constant-time", "utilities", "no_std"]
categories = ["cryptography", "data-structures"]

[dependencies]
aligned-cmov = { path = "../aligned-cmov", version = "2" }
aligned-cmov = { path = "../aligned-cmov", version = "2.1" }

rand_core = { version = "0.6", default-features = false }

4 changes: 2 additions & 2 deletions mc-oblivious-map/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "mc-oblivious-map"
version = "2.0.0"
version = "2.1.0"
description = "Implementation of Oblivious Hash Map data structures on top of Oblivious RAM"
authors = ["MobileCoin"]
license = "GPL-3.0"
@@ -15,7 +15,7 @@ no_asm_insecure = ["aligned-cmov/no_asm_insecure"]

[dependencies]
aligned-array = { version = "1", features = ["subtle"] }
aligned-cmov = { path = "../aligned-cmov", version = "2" }
aligned-cmov = { path = "../aligned-cmov", version = "2.1" }
mc-oblivious-traits = { path = "../mc-oblivious-traits", version = "2" }

generic-array = { version = "0.14", default-features = false }
4 changes: 2 additions & 2 deletions mc-oblivious-ram/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "mc-oblivious-ram"
version = "2.0.0"
version = "2.1.0"
description = "Implementations of Oblivious RAM data structures"
authors = ["MobileCoin"]
license = "GPL-3.0"
@@ -14,7 +14,7 @@ categories = ["cryptography", "data-structures", "no-std"]
no_asm_insecure = ["aligned-cmov/no_asm_insecure"]

[dependencies]
aligned-cmov = { path = "../aligned-cmov", version = "2" }
aligned-cmov = { path = "../aligned-cmov", version = "2.1" }
balanced-tree-index = { path = "../balanced-tree-index", version = "2" }
mc-oblivious-traits = { path = "../mc-oblivious-traits", version = "2" }

(Remaining file diffs not loaded.)
