diff --git a/Makefile b/Makefile index 4354809dc2..431803d7ba 100644 --- a/Makefile +++ b/Makefile @@ -281,14 +281,14 @@ check_typos: install_typos_checker .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled clippy_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: check_gpu # Run check on tfhe with "gpu" enabled check_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) @@ -393,10 +393,10 @@ clippy_trivium: install_rs_check_toolchain .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.) 
clippy_all_targets: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \ -p $(TFHE_SPEC) -- --no-deps -D warnings RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng @@ -1040,35 +1040,35 @@ bench_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer # Run benchmarks for signed integer bench_signed_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend bench_integer_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p 
$(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression bench_integer_compression: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression_gpu bench_integer_compression_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,gpu -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters bench_integer_multi_bit: install_rs_check_toolchain @@ -1076,7 +1076,7 @@ bench_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters bench_signed_integer_multi_bit: install_rs_check_toolchain @@ -1084,7 +1084,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p 
$(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters bench_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1092,7 +1092,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1100,14 +1100,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs bench_integer_zk: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench zk-pke-bench \ - --features=integer,internal-keycache,zk-pok,nightly-avx512 \ + --features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \ -p $(TFHE_SPEC) -- .PHONY: bench_shortint # Run benchmarks for shortint diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index d6ead085b7..4d4af899cc 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -143,15 +143,25 @@ fn bench_server_key_binary_function_clean_inputs( }); } BenchmarkType::Throughput 
=> { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let mut ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, &mut ct_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -294,15 +304,23 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + + reset_pbs_count(); + unary_fn(&sks, &mut ct_0); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -451,15 +469,24 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -567,15 +594,28 @@ fn if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, 
IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let true_ct = cks.encrypt_radix(clear_0, num_block); + + let clear_1 = gen_random_u256(&mut rng); + let false_ct = cks.encrypt_radix(clear_1, num_block); + + let condition = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + + reset_pbs_count(); + sks.if_then_else_parallelized(&condition, &true_ct, &false_ct); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -663,20 +703,34 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); + let cks = RadixClientKey::from((cks, nb_ctxt)); + + let clears = (0..len) + .map(|_| gen_random_u256(&mut rng) & max_for_bit_size) + .collect::>(); + let ctxts = clears + .iter() + .copied() + .map(|clear| cks.encrypt(clear)) + .collect::>(); + + reset_pbs_count(); + sks.sum_ciphertexts_parallelized(&ctxts); + let pbs_count = get_pbs_count(); + bench_id = format!( "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits" ); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - - let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); - let cks = RadixClientKey::from((cks, nb_ctxt)); - let cts = (0..elements) .map(|_| { let clears = (0..len) @@ -1358,17 +1412,24 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + let ct = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let mut d_ctxt = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams); + + reset_pbs_count(); + unary_op(&gpu_sks, &mut d_ctxt, &streams); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, 
IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1457,17 +1518,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. + let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let mut d_ctxt_1 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &streams); + let mut d_ctxt_2 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams); + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_1, &mut d_ctxt_2, &streams); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1564,19 +1636,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. 
+ let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let mut d_ctxt_1 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_1, clear_1, &streams); + let pbs_count = get_pbs_count(); + bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1667,11 +1748,29 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + let clear_cond = rng.gen::(); + let ct_then = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let ct_else = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let ct_cond = cks.encrypt_bool(clear_cond); + + let d_ct_cond = CudaBooleanBlock::from_boolean_block(&ct_cond, &stream); + let d_ct_then = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_then, &stream); + let d_ct_else = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_else, &stream); + + reset_pbs_count(); + gpu_sks.if_then_else(&d_ct_cond, &d_ct_then, &d_ct_else, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = 
throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { let (cks, _cpu_sks) = @@ -2516,6 +2615,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; criterion_group!( smart_ops, @@ -2617,6 +2717,7 @@ criterion_group!( criterion_group!( default_dedup_ops, + bitand_parallelized, add_parallelized, mul_parallelized, div_rem_parallelized, diff --git a/tfhe/benches/integer/glwe_packing_compression.rs b/tfhe/benches/integer/glwe_packing_compression.rs index cb30a63570..1e89a7e2fe 100644 --- a/tfhe/benches/integer/glwe_packing_compression.rs +++ b/tfhe/benches/integer/glwe_packing_compression.rs @@ -77,9 +77,19 @@ fn cpu_glwe_packing(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let ct = cks.encrypt_radix(0_u32, num_blocks); + let mut builder = CompressedCiphertextListBuilder::new(); + builder.push(ct); + let compressed = builder.build(&compression_key); + + reset_pbs_count(); + let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap(); + let pbs_count = get_pbs_count(); + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); // FIXME thread usage seemed to be somewhat more "efficient". 
// For example, with bit_size = 2, my laptop is only using around 2/3 of the // available threads Thread usage increases with bit_size = 8 but @@ -185,27 +195,26 @@ mod cuda { let bench_id_pack; let bench_id_unpack; + // Generate private compression key + let cks = ClientKey::new(param); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( + radix_cks.parameters().glwe_dimension(), + radix_cks.parameters().polynomial_size(), + radix_cks.parameters().message_modulus(), + radix_cks.parameters().carry_modulus(), + radix_cks.parameters().ciphertext_modulus(), + &stream, + ); + match BENCH_TYPE.get().unwrap() { BenchmarkType::Latency => { - // Generate private compression key - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - // Generate and convert compression keys - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = 
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -239,28 +248,25 @@ mod cuda { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let ct = cks.encrypt_radix(0_u32, num_blocks); + let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); + let mut builder = CudaCompressedCiphertextListBuilder::new(); + builder.push(d_ct, &stream); + let compressed = builder.build(&cuda_compression_key, &stream); + + reset_pbs_count(); + let _: CudaUnsignedRadixCiphertext = compressed + .get(0, &cuda_decompression_key, &stream) + .unwrap() + .unwrap(); + let pbs_count = get_pbs_count(); + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)) .ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -344,6 +350,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing); #[cfg(feature = "gpu")] use cuda::gpu_glwe_packing2; +use tfhe::{get_pbs_count, reset_pbs_count}; fn main() { 
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap()); diff --git a/tfhe/benches/integer/oprf.rs b/tfhe/benches/integer/oprf.rs index 8bdc1e9407..f664ed8406 100644 --- a/tfhe/benches/integer/oprf.rs +++ b/tfhe/benches/integer/oprf.rs @@ -7,6 +7,7 @@ use rayon::prelude::*; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::IntegerKeyKind; use tfhe::keycache::NamedParam; +use tfhe::{get_pbs_count, reset_pbs_count}; use tfhe_csprng::seeders::Seed; pub fn unsigned_oprf(c: &mut Criterion) { @@ -40,12 +41,21 @@ pub fn unsigned_oprf(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + reset_pbs_count(); + sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( + Seed(0), + bit_size as u64, + num_block as u64, + ); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - b.iter(|| { (0..elements).into_par_iter().for_each(|_| { sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 9c1cf0ed1b..2cf57ed94b 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -66,12 +66,20 @@ fn bench_server_key_signed_binary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -151,12 +159,21 @@ fn bench_server_key_signed_shift_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_1 = rng.gen_range(0u128..bit_size as u128); + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -233,12 +250,19 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + unary_fn(&sks, &ct_0); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -307,12 +331,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + sks.if_then_else_parallelized(&cond, &ct_then, &ct_else); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -830,12 +863,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let clear_1 = rng_func(&mut rng, bit_size); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -1401,14 +1442,32 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_0 = cks.encrypt_signed_radix(clear_0, num_block); + let mut d_ctxt_0 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream); + + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_1 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_1 = cks.encrypt_signed_radix(clear_1, num_block); + let mut d_ctxt_1 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_1, &stream); + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_0, &mut d_ctxt_1, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - 
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1527,14 +1586,25 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct = cks.encrypt_signed_radix(clear, num_block); + let mut d_ctxt = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &stream); + + reset_pbs_count(); + unary_op(&gpu_sks, &mut d_ctxt, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1650,16 +1720,29 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_0 = cks.encrypt_signed_radix(clear_0, num_block); + let mut d_ctxt_0 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream); + + let clear_0 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_0, clear_0, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1786,14 +1869,32 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_0 = cks.encrypt_signed_radix(clear_0, num_block); + let mut d_ctxt_0 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream); + + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh)); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + let mut d_ctxt_1 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &stream); + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_0, &mut d_ctxt_1, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1916,14 +2017,29 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let clear_cond = rng.gen::(); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_cond = cks.encrypt_bool(clear_cond); + + let d_ct_cond = CudaBooleanBlock::from_boolean_block(&ct_cond, &stream); + let d_ct_then = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_then, &stream); + let d_ct_else = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_else, &stream); + + reset_pbs_count(); + gpu_sks.if_then_else(&d_ct_cond, &d_ct_then, &d_ct_else, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let cts_cond = (0..elements) .map(|_| { let ct_cond = cks.encrypt_bool(rng.gen::()); @@ -2697,6 +2813,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups(val: &str) { diff --git a/tfhe/benches/integer/zk_pke.rs b/tfhe/benches/integer/zk_pke.rs index 8d789caf1f..6c8e89d344 100644 --- a/tfhe/benches/integer/zk_pke.rs +++ b/tfhe/benches/integer/zk_pke.rs @@ -18,6 +18,7 @@ use tfhe::shortint::parameters::compact_public_key_only::p_fail_2_minus_64::ks_p use tfhe::shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::V0_11_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use tfhe::shortint::parameters::PBSParameters; use tfhe::zk::{CompactPkeCrs, ZkComputeLoad}; +use tfhe::{get_pbs_count, reset_pbs_count}; use 
utilities::{write_to_json, OperatorType}; fn write_result(file: &mut File, name: &str, value: usize) { @@ -96,7 +97,17 @@ fn pke_zk_proof(c: &mut Criterion) { }); } BenchmarkType::Throughput => { - let elements = throughput_num_threads(num_block); + // Execute the operation once to know its cost. + let input_msg = rng.gen::(); + let messages = vec![input_msg; fhe_uint_count]; + + reset_pbs_count(); + let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk) + .extend(messages.iter().copied()) + .build_with_proof_packed(&crs, &metadata, compute_load); + let pbs_count = get_pbs_count(); + + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_id = format!( @@ -304,7 +315,27 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) { } BenchmarkType::Throughput => { // In throughput mode object sizes are not recorded. - let elements = throughput_num_threads(num_block); + + // Execute the operation once to know its cost. + let input_msg = rng.gen::(); + let messages = vec![input_msg; fhe_uint_count]; + let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk) + .extend(messages.iter().copied()) + .build_with_proof_packed(&crs, &metadata, compute_load) + .unwrap(); + + reset_pbs_count(); + let _ = ct1.verify_and_expand( + &crs, + &pk, + &metadata, + IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary( + casting_key.as_view(), + ), + ); + let pbs_count = get_pbs_count(); + + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_id_verify = format!( diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs index 5348010941..e30fefff17 100644 --- a/tfhe/benches/utilities.rs +++ b/tfhe/benches/utilities.rs @@ -392,7 +392,7 @@ pub mod integer_utils { /// Generate a number of threads to use to saturate current machine for throughput measurements. 
#[allow(dead_code)]
-    pub fn throughput_num_threads(num_block: usize) -> u64 {
+    pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
         let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
         let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil();
@@ -401,13 +401,27 @@ pub mod integer_utils {
             // This value is for Nvidia H100 GPU
             let streaming_multiprocessors = 132;
             let num_gpus = unsafe { cuda_get_number_of_gpus() };
-            ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
+            let total_num_sm = streaming_multiprocessors * num_gpus;
+            // Some operations with a high count of PBS (e.g. division) would yield an operation
+            // loading value so low that the number of elements in the end wouldn't be meaningful.
+            let minimum_loading = 0.2;
+            // Clamp `op_pbs_count` to 1: a zero count would make this integer division panic.
+            // NOTE(review): integer division truncates here, unlike the CPU path; confirm intent.
+            let operation_loading =
+                ((total_num_sm as u64 / op_pbs_count.max(1)) as f64).max(minimum_loading);
+            (total_num_sm as f64 * block_multiplicator * operation_loading) as u64
         }
         #[cfg(not(feature = "gpu"))]
         {
             let num_threads = rayon::current_num_threads() as f64;
+            // Some operations with a high count of PBS (e.g. division) would yield an operation
+            // loading value so low that the number of elements in the end wouldn't be meaningful.
+            let minimum_loading = 0.2;
+            // Clamp `op_pbs_count` to 1: a zero count would make the ratio infinite, and the
+            // final cast would then saturate to `u64::MAX`.
+            let operation_loading =
+                (num_threads / (op_pbs_count.max(1) as f64)).max(minimum_loading);
             // Add 20% more to maximum threads available.
-            ((num_threads + (num_threads * 0.2)) * block_multiplicator) as u64
+            ((num_threads + (num_threads * 0.2)) * block_multiplicator * operation_loading) as u64
         }
     }