Skip to content

Commit

Permalink
chore(gpu): refactor comparisons to track noise/degree
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Feb 27, 2025
1 parent 13a1b44 commit 18941c4
Show file tree
Hide file tree
Showing 17 changed files with 1,235 additions and 798 deletions.
20 changes: 12 additions & 8 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ void cuda_negate_integer_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus);
uint32_t carry_modulus, uint32_t num_radix_blocks);

void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
Expand Down Expand Up @@ -218,15 +218,17 @@ void scratch_cuda_integer_radix_comparison_kb_64(

void cuda_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count);
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks);
uint32_t num_scalar_blocks);

void cleanup_cuda_integer_comparison(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down Expand Up @@ -474,7 +476,8 @@ void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(

void cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);

void cleanup_cuda_integer_are_all_comparisons_block_true(
Expand All @@ -492,7 +495,8 @@ void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(

void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);

void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
Expand Down
130 changes: 79 additions & 51 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -3160,8 +3160,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
COMPARISON_TYPE op;
int_radix_params params;

Torus *tmp_out;
Torus *tmp_block_accumulated;
CudaRadixCiphertextFFI *tmp_out;
CudaRadixCiphertextFFI *tmp_block_accumulated;

// This map stores LUTs that check the equality between some input and values
// of interest in are_all_block_true(), as with max_value (the maximum message
Expand All @@ -3181,12 +3181,15 @@ template <typename Torus> struct int_are_all_block_true_buffer {
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_block_accumulated = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_out = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_out, num_radix_blocks,
params.big_lwe_dimension);
tmp_block_accumulated = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_block_accumulated, max_chunks,
params.big_lwe_dimension);

is_max_value =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
max_chunks, allocate_gpu_memory);
Expand All @@ -3209,8 +3212,10 @@ template <typename Torus> struct int_are_all_block_true_buffer {
is_max_value->release(streams, gpu_indexes, gpu_count);
delete (is_max_value);

cuda_drop_async(tmp_block_accumulated, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_out, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_out);
delete tmp_out;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_block_accumulated);
delete tmp_block_accumulated;
}
};

Expand Down Expand Up @@ -3323,8 +3328,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {

int_radix_lut<Torus> *tree_last_leaf_scalar_lut;

Torus *tmp_x;
Torus *tmp_y;
CudaRadixCiphertextFFI *tmp_x;
CudaRadixCiphertextFFI *tmp_y;

int_tree_sign_reduction_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
Expand All @@ -3345,10 +3350,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
};

if (allocate_gpu_memory) {
tmp_x = (Torus *)cuda_malloc_async(big_size * num_radix_blocks,
streams[0], gpu_indexes[0]);
tmp_y = (Torus *)cuda_malloc_async(big_size * num_radix_blocks,
streams[0], gpu_indexes[0]);
tmp_x = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_x, num_radix_blocks,
params.big_lwe_dimension);
tmp_y = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_y, num_radix_blocks,
params.big_lwe_dimension);
// LUTs
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
Expand Down Expand Up @@ -3379,23 +3388,25 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_last_leaf_scalar_lut->release(streams, gpu_indexes, gpu_count);
delete tree_last_leaf_scalar_lut;

cuda_drop_async(tmp_x, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_y, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_x);
delete tmp_x;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_y);
delete tmp_y;
}
};

template <typename Torus> struct int_comparison_diff_buffer {
int_radix_params params;
COMPARISON_TYPE op;

Torus *tmp_packed;
CudaRadixCiphertextFFI *tmp_packed;

std::function<Torus(Torus)> operator_f;

int_tree_sign_reduction_buffer<Torus> *tree_buffer;

Torus *tmp_signs_a;
Torus *tmp_signs_b;
CudaRadixCiphertextFFI *tmp_signs_a;
CudaRadixCiphertextFFI *tmp_signs_b;
int_radix_lut<Torus> *reduce_signs_lut;

int_comparison_diff_buffer(cudaStream_t const *streams,
Expand Down Expand Up @@ -3425,16 +3436,22 @@ template <typename Torus> struct int_comparison_diff_buffer {

Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);

tmp_packed = (Torus *)cuda_malloc_async(big_size * num_radix_blocks,
streams[0], gpu_indexes[0]);
tmp_packed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_packed, num_radix_blocks,
params.big_lwe_dimension);

tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
streams, gpu_indexes, gpu_count, operator_f, params, num_radix_blocks,
allocate_gpu_memory);
tmp_signs_a = (Torus *)cuda_malloc_async(big_size * num_radix_blocks,
streams[0], gpu_indexes[0]);
tmp_signs_b = (Torus *)cuda_malloc_async(big_size * num_radix_blocks,
streams[0], gpu_indexes[0]);
tmp_signs_a = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_signs_a, num_radix_blocks,
params.big_lwe_dimension);
tmp_signs_b = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_signs_b, num_radix_blocks,
params.big_lwe_dimension);
// LUTs
reduce_signs_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
Expand All @@ -3449,9 +3466,12 @@ template <typename Torus> struct int_comparison_diff_buffer {
reduce_signs_lut->release(streams, gpu_indexes, gpu_count);
delete reduce_signs_lut;

cuda_drop_async(tmp_packed, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_signs_a, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_signs_b, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_packed);
delete tmp_packed;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_signs_a);
delete tmp_signs_a;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_signs_b);
delete tmp_signs_b;
}
};

Expand All @@ -3469,12 +3489,12 @@ template <typename Torus> struct int_comparison_buffer {
int_comparison_eq_buffer<Torus> *eq_buffer;
int_comparison_diff_buffer<Torus> *diff_buffer;

Torus *tmp_block_comparisons;
Torus *tmp_lwe_array_out;
Torus *tmp_trivial_sign_block;
CudaRadixCiphertextFFI *tmp_block_comparisons;
CudaRadixCiphertextFFI *tmp_lwe_array_out;
CudaRadixCiphertextFFI *tmp_trivial_sign_block;

// Scalar EQ / NE
Torus *tmp_packed_input;
CudaRadixCiphertextFFI *tmp_packed_input;

// Max Min
int_cmux_buffer<Torus> *cmux_buffer;
Expand Down Expand Up @@ -3502,8 +3522,6 @@ template <typename Torus> struct int_comparison_buffer {

identity_lut_f = [](Torus x) -> Torus { return x; };

auto big_lwe_size = params.big_lwe_dimension + 1;

if (allocate_gpu_memory) {
lsb_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
Expand All @@ -3515,18 +3533,21 @@ template <typename Torus> struct int_comparison_buffer {
}

// +1 to have space for signed comparison
tmp_lwe_array_out = (Torus *)cuda_malloc_async(
big_lwe_size * (num_radix_blocks + 1) * sizeof(Torus), streams[0],
gpu_indexes[0]);
tmp_lwe_array_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_lwe_array_out, num_radix_blocks + 1,
params.big_lwe_dimension);

tmp_packed_input = (Torus *)cuda_malloc_async(
big_lwe_size * 2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
tmp_packed_input = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_packed_input, 2 * num_radix_blocks,
params.big_lwe_dimension);

// Block comparisons
tmp_block_comparisons = (Torus *)cuda_malloc_async(
big_lwe_size * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
tmp_block_comparisons = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_block_comparisons, num_radix_blocks,
params.big_lwe_dimension);

// Cleaning LUT
identity_lut =
Expand Down Expand Up @@ -3589,8 +3610,10 @@ template <typename Torus> struct int_comparison_buffer {

if (is_signed) {

tmp_trivial_sign_block = (Torus *)cuda_malloc_async(
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
tmp_trivial_sign_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
tmp_trivial_sign_block, 1,
params.big_lwe_dimension);

signed_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
Expand Down Expand Up @@ -3665,12 +3688,17 @@ template <typename Torus> struct int_comparison_buffer {
delete identity_lut;
is_zero_lut->release(streams, gpu_indexes, gpu_count);
delete is_zero_lut;
cuda_drop_async(tmp_lwe_array_out, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_block_comparisons, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_packed_input, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_lwe_array_out);
delete tmp_lwe_array_out;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_block_comparisons);
delete tmp_block_comparisons;
release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_packed_input);
delete tmp_packed_input;

if (is_signed) {
cuda_drop_async(tmp_trivial_sign_block, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0],
tmp_trivial_sign_block);
delete tmp_trivial_sign_block;
signed_lut->release(streams, gpu_indexes, gpu_count);
delete (signed_lut);
signed_msb_lut->release(streams, gpu_indexes, gpu_count);
Expand Down
Loading

0 comments on commit 18941c4

Please sign in to comment.