diff --git a/Cargo.lock b/Cargo.lock
index 84c58a6d..3fed4ab3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -198,6 +198,12 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "nanorand"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
+
 [[package]]
 name = "once_cell"
 version = "1.19.0"
@@ -342,6 +348,7 @@ version = "0.2.0"
 dependencies = [
  "arbitrary",
  "libfuzzer-sys",
+ "nanorand",
  "rcl",
  "serde_json",
  "toml",
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index e8ce32a0..e3236218 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -10,6 +10,7 @@ cargo-fuzz = true
 [dependencies]
 arbitrary = "1.3.0"
 libfuzzer-sys = "0.4"
+nanorand = "0.7.0"
 rcl = { path = ".." }
 serde_json = "1.0.114"
 tree-sitter = "0.20.10"
diff --git a/fuzz/fuzz_targets/fuzz_smith.rs b/fuzz/fuzz_targets/fuzz_smith.rs
index 47e7073c..312eec3b 100644
--- a/fuzz/fuzz_targets/fuzz_smith.rs
+++ b/fuzz/fuzz_targets/fuzz_smith.rs
@@ -20,7 +20,8 @@
 
 #![no_main]
 
-use libfuzzer_sys::fuzz_target;
+use libfuzzer_sys::{fuzz_mutator, fuzz_target};
+use nanorand::{Rng, WyRand};
 use rcl::eval::Evaluator;
 use rcl::loader::{Loader, VoidFilesystem};
 use rcl::tracer::VoidTracer;
@@ -41,3 +42,214 @@ fuzz_target!(|input: SynthesizedProgram| {
     let mut value_env = rcl::runtime::prelude();
     let _ = evaluator.eval_doc(&mut type_env, &mut value_env, doc);
 });
+
+/// Mutator for the raw input bytes of this fuzz target.
+///
+/// The input starts with 2-byte instructions (an opcode, then an argument)
+/// and ends with auxiliary data. The mutations below respect that layout, so
+/// one step is a small edit to the program rather than a shift that changes
+/// the meaning of every instruction behind it.
+struct Mutator<'a> {
+    /// Buffer that is mutated in place; it can be longer than `size`.
+    data: &'a mut [u8],
+    /// Number of bytes at the start of `data` that hold the current input.
+    size: usize,
+    /// Upper bound for `size` after a mutation.
+    max_size: usize,
+    /// Source of randomness for all choices the mutator makes.
+    rng: WyRand,
+}
+
+impl<'a> Mutator<'a> {
+    /// Return how many bytes at the start of the buffer mutations may touch.
+    fn usable_len(&self) -> usize {
+        self.size.min(self.max_size).min(self.data.len())
+    }
+
+    /// Return whether the input can grow by `n` more bytes.
+    fn has_room(&self, n: usize) -> bool {
+        let capacity = self.max_size.min(self.data.len());
+        capacity.saturating_sub(self.size) >= n
+    }
+
+    /// Return the byte offset of an arbitrary instruction in the buffer, or
+    /// `None` when the buffer does not hold a single full instruction.
+    fn gen_instruction_index(&mut self) -> Option<usize> {
+        // Integer division rounds down, so a trailing 1-byte leftover is not
+        // counted as an instruction, while the last full one still is.
+        let n_instructions = self.usable_len() / 2;
+        if n_instructions == 0 {
+            return None;
+        }
+        Some(self.rng.generate_range(0..n_instructions) * 2)
+    }
+
+    /// Return an arbitrary index into the data buffer, or `None` when there
+    /// is no data to index into.
+    fn gen_data_index(&mut self) -> Option<usize> {
+        // Bias indices towards the end of the data; the instructions are at the
+        // start and auxiliary data is at the end. Instructions are 2 bytes, so
+        // if we delete one byte in the middle then the part after it becomes
+        // meaningless (they might still be valid instructions, but it's not a
+        // small mutation). We should have more luck deleting in e.g. a string
+        // at the end.
+        let n = self.usable_len();
+        if n == 0 {
+            return None;
+        }
+        let i = match self.rng.generate_range(0..3) {
+            0 => n - 1,
+            1 => self.rng.generate_range((n / 2)..n),
+            2 => self.rng.generate_range(0..n),
+            _ => unreachable!(),
+        };
+        Some(i)
+    }
+
+    /// Generate a random valid opcode.
+    fn gen_opcode(&mut self) -> u8 {
+        // Rejection sampling: draw bytes until one parses as an opcode.
+        loop {
+            let opcode: u8 = self.rng.generate();
+            if rcl_fuzz::smith::parse_opcode(opcode).is_some() {
+                return opcode;
+            }
+        }
+    }
+
+    /// Generate an instruction argument.
+    fn gen_argument(&mut self) -> u8 {
+        // We bias the argument towards smaller numbers, because often they are
+        // lengths or indexes into the stack, and those are all small.
+        match self.rng.generate_range(0..4) {
+            0 => 0,
+            1 => 1,
+            2 => self.rng.generate_range(0..10),
+            3 => self.rng.generate(),
+            _ => unreachable!(),
+        }
+    }
+
+    /// Apply one randomly chosen mutation. A mutation that does not apply to
+    /// the current input (nothing to remove, no room to grow) changes nothing.
+    fn mutate(&mut self) {
+        match self.rng.generate_range(0..9) {
+            0 => self.insert_instruction(),
+            1 => self.remove_instruction(),
+            2 => self.replace_instruction(),
+            3 => self.swap_instructions(),
+            4 => self.increment_argument(),
+            5 => self.decrement_argument(),
+            6 => self.replace_argument(),
+            7 => self.append_byte(),
+            8 => self.remove_byte(),
+            _ => unreachable!(),
+        }
+    }
+
+    /// Insert a new random instruction at an instruction boundary.
+    fn insert_instruction(&mut self) {
+        if !self.has_room(2) {
+            return;
+        }
+        // The position right behind the last full instruction is a candidate
+        // too, so an input without instructions can gain its first one.
+        let n_instructions = self.usable_len() / 2;
+        let i = self.rng.generate_range(0..n_instructions + 1) * 2;
+
+        // Move everything behind the insertion place one instruction ahead.
+        self.data.copy_within(i..self.data.len() - 2, i + 2);
+
+        // Then insert the new instruction.
+        self.data[i] = self.gen_opcode();
+        self.data[i + 1] = self.gen_argument();
+        self.size += 2;
+    }
+
+    /// Remove an arbitrary instruction.
+    fn remove_instruction(&mut self) {
+        if let Some(i) = self.gen_instruction_index() {
+            // Move everything back one place.
+            self.data.copy_within(i + 2.., i);
+            self.size -= 2;
+        }
+    }
+
+    /// Overwrite an arbitrary instruction with a new random one.
+    fn replace_instruction(&mut self) {
+        if let Some(i) = self.gen_instruction_index() {
+            self.data[i] = self.gen_opcode();
+            self.data[i + 1] = self.gen_argument();
+        }
+    }
+
+    /// Exchange two arbitrary instructions (possibly the same one).
+    fn swap_instructions(&mut self) {
+        let i = self.gen_instruction_index();
+        let j = self.gen_instruction_index();
+        if let (Some(i), Some(j)) = (i, j) {
+            self.data.swap(i, j);
+            self.data.swap(i + 1, j + 1);
+        }
+    }
+
+    /// Add one to the argument of an arbitrary instruction, up to 255.
+    fn increment_argument(&mut self) {
+        if let Some(i) = self.gen_instruction_index() {
+            self.data[i + 1] = self.data[i + 1].saturating_add(1);
+        }
+    }
+
+    /// Subtract one from the argument of an arbitrary instruction, down to 0.
+    fn decrement_argument(&mut self) {
+        if let Some(i) = self.gen_instruction_index() {
+            self.data[i + 1] = self.data[i + 1].saturating_sub(1);
+        }
+    }
+
+    /// Overwrite the argument of an arbitrary instruction.
+    fn replace_argument(&mut self) {
+        if let Some(i) = self.gen_instruction_index() {
+            self.data[i + 1] = self.gen_argument();
+        }
+    }
+
+    /// Append one byte to the auxiliary data at the end of the input.
+    fn append_byte(&mut self) {
+        if !self.has_room(1) {
+            return;
+        }
+        // Bias values towards 0 or printable ASCII, the auxiliary data at the
+        // end is often used for indices or strings.
+        let b = match self.rng.generate_range(0..3) {
+            0 => 0,
+            1 => self.rng.generate_range(0x20..0x7f),
+            2 => self.rng.generate(),
+            _ => unreachable!(),
+        };
+        self.data[self.size] = b;
+        self.size += 1;
+    }
+
+    /// Remove one byte, preferably near the end of the input.
+    fn remove_byte(&mut self) {
+        if let Some(i) = self.gen_data_index() {
+            self.data.copy_within(i + 1.., i);
+            self.size -= 1;
+        }
+    }
+}
+
+// Let libFuzzer mutate inputs with the structure-aware mutator above. The
+// value of the closure is the size of the input after the mutation.
+fuzz_mutator!(|data: &mut [u8], size: usize, max_size: usize, seed: u32| {
+    let rng = WyRand::new_seed(u64::from(seed));
+    let mut mutator = Mutator {
+        data,
+        size,
+        max_size,
+        rng,
+    };
+    mutator.mutate();
+    mutator.size
+});
diff --git a/fuzz/src/smith.rs b/fuzz/src/smith.rs
index 252241c3..4d8e5a20 100644
--- a/fuzz/src/smith.rs
+++ b/fuzz/src/smith.rs
@@ -89,7 +89,7 @@ macro_rules! define_ops {
             $( #[doc = $doc] $name = $opcode ),+
         }
 
-        fn parse_op(opcode: u8) -> Option<Op> {
+        pub fn parse_opcode(opcode: u8) -> Option<Op> {
             match opcode {
                 $( $opcode => Some(Op::$name), )+
                 _ => None,
@@ -264,7 +264,7 @@ impl<'a> ProgramBuilder<'a> {
         let n = self.input[self.head + 1];
         self.head += 2;
 
-        let op = match parse_op(op_byte) {
+        let op = match parse_opcode(op_byte) {
             None => return true,
             Some(op) => op,
         };