From 28128768b024f26bb0eac47da165fcd72b0db02d Mon Sep 17 00:00:00 2001
From: Paul Schoenfelder
Date: Sat, 16 Sep 2023 15:09:39 -0400
Subject: [PATCH] feat: implement support for wasm data segments

Wasm has the concept of "data segments", essentially initializers for
specific regions of memory, identified by offset from the start of
linear memory, and a vector of bytes to be written starting at that
offset. This is used to implement things like the `rodata` section
you'd find in a typical ELF executable/library. Globals can then be
exported with an address pointing into that segment, which can be
useful for sharing the same data across many read-only globals, and for
packing global data more efficiently.

Our implementation of this is very similar, except we also allow data
segments to have a size larger than the initializer (byte vector), which
has the same semantics as padding the initializer with zeroes out to
that size.
---
 hir/src/constants.rs |   5 ++
 hir/src/globals.rs   |  38 +++++++---
 hir/src/lib.rs       |   6 ++
 hir/src/module.rs    |  38 ++++++++++
 hir/src/program.rs   |  14 ++++
 hir/src/segments.rs  | 172 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 264 insertions(+), 9 deletions(-)
 create mode 100644 hir/src/segments.rs

diff --git a/hir/src/constants.rs b/hir/src/constants.rs
index 319768fd1..80beb9a2a 100644
--- a/hir/src/constants.rs
+++ b/hir/src/constants.rs
@@ -125,6 +125,11 @@ impl ConstantPool {
         &self.constants[&id]
     }
 
+    /// Returns true if this pool's cache has an entry keyed by the given constant data
+    pub fn contains(&self, data: &ConstantData) -> bool {
+        self.cache.contains_key(data)
+    }
+
     /// Insert constant data into the pool, returning a handle for later referencing; when constant
     /// data is inserted that is a duplicate of previous constant data, the existing handle will be
     /// returned.
diff --git a/hir/src/globals.rs b/hir/src/globals.rs
index df70a5876..00b4b997b 100644
--- a/hir/src/globals.rs
+++ b/hir/src/globals.rs
@@ -102,7 +102,6 @@ pub enum ConflictResolutionStrategy {
 
 /// This table is used to lay out and link together global variables for a [Program].
 ///
-///
 /// See the docs for [Linkage], [GlobalVariableData], and [GlobalVariableTable::declare] for more details.
 pub struct GlobalVariableTable {
     layout: LinkedList,
@@ -129,6 +128,11 @@ impl GlobalVariableTable {
         }
     }
 
+    /// Returns true if the layout of this table contains no global variables
+    pub fn is_empty(&self) -> bool {
+        self.layout.is_empty()
+    }
+
     /// Get a double-ended iterator over the current table layout
     pub fn iter<'a, 'b: 'a>(
         &'b self,
@@ -175,18 +179,19 @@ impl GlobalVariableTable {
     }
 
     /// Computes the offset, in bytes, of the given [GlobalVariable] from the
-    /// start of the linear memory heap, assuming that the layout of the global
-    /// variable table up to and including `id` remains unchanged.
+    /// start of the segment in which globals are allocated, assuming that the
+    /// layout of the global variable table up to and including `id` remains
+    /// unchanged.
     ///
     /// # SAFETY
     ///
-    /// This should only be used once all global variables have been declared, and
-    /// the layout of the table has been decided. It is technically safe to use
-    /// offsets obtained before all global variables are declared, _IF_ the layout
-    /// up to and including those global variables remains unchanged after that
-    /// point.
+    /// This should only be used once all data segments and global variables have
+    /// been declared, and the layout of the table has been decided. It is technically
+    /// safe to use offsets obtained before all global variables are declared, _IF_ the
+    /// data segments and global variable layout up to and including those global variables
+    /// remain unchanged after that point.
     ///
+    /// If the offset for a given global variable is obtained, and the heap layout is
     /// subsequently changed in such a way that the original offset is no longer
     /// accurate, bad things will happen.
     pub unsafe fn offset_of(&self, id: GlobalVariable) -> usize {
@@ -232,6 +237,21 @@ impl GlobalVariableTable {
         self.try_insert(name, ty, linkage)
     }
 
+    /// Returns a reference to the constant data stored under the handle `id`
+    pub fn get_constant(&self, id: Constant) -> &ConstantData {
+        self.data.get(id)
+    }
+
+    /// Inserts the given constant data into this table without allocating a global, returning its handle
+    pub fn insert_constant(&mut self, data: ConstantData) -> Constant {
+        self.data.insert(data)
+    }
+
+    /// Returns true if the given constant data is already in this table's constant pool
+    pub fn contains_constant(&self, data: &ConstantData) -> bool {
+        self.data.contains(data)
+    }
+
     /// This sets the initializer for the given [GlobalVariable] to `init`.
     ///
     /// This function will return `Err` if any of the following occur:
diff --git a/hir/src/lib.rs b/hir/src/lib.rs
index 2c9dc690d..d10eb0e45 100644
--- a/hir/src/lib.rs
+++ b/hir/src/lib.rs
@@ -5,8 +5,12 @@ pub use miden_diagnostics::SourceSpan;
 pub use miden_hir_symbol::{symbols, Symbol};
 pub use miden_hir_type::{FunctionType, Type, TypeRepr};
 
+/// Represents a field element in Miden
 pub type Felt = winter_math::fields::f64::BaseElement;
 
+/// Represents an offset, in bytes, from the base of linear memory in Miden
+pub type Offset = u32;
+
 macro_rules! assert_matches {
     ($left:expr, $(|)? $( $pattern:pat_param )|+ $( if $guard: expr )? $(,)?)
=> {
         match $left {
@@ -63,6 +67,7 @@ mod layout;
 mod locals;
 mod module;
 mod program;
+mod segments;
 #[cfg(test)]
 mod tests;
 mod value;
@@ -86,6 +91,7 @@ pub use self::layout::{ArenaMap, LayoutAdapter, LayoutNode, OrderedArenaMap};
 pub use self::locals::{Local, LocalId};
 pub use self::module::*;
 pub use self::program::{Linker, LinkerError, Program};
+pub use self::segments::{DataSegment, DataSegmentAdapter, DataSegmentError, DataSegmentTable};
 pub use self::value::{Value, ValueData, ValueList, ValueListPool};
 pub use self::write::{write_external_function, write_function};
 
diff --git a/hir/src/module.rs b/hir/src/module.rs
index 6aa44096b..e3102a717 100644
--- a/hir/src/module.rs
+++ b/hir/src/module.rs
@@ -42,6 +42,8 @@ pub struct Module {
     /// Documentation attached to this module, to be passed through to
     /// Miden Assembly during code generation.
     pub docs: Option,
+    /// The set of data segments allocated in this module
+    segments: DataSegmentTable,
     /// The set of global variables declared in this module
     globals: GlobalVariableTable,
     /// The set of functions which belong to this module, in the order
@@ -124,6 +126,7 @@ impl Module {
             link: Default::default(),
             name,
             docs: None,
+            segments: Default::default(),
             globals: GlobalVariableTable::new(ConflictResolutionStrategy::None),
             functions: Default::default(),
             is_kernel: false,
@@ -153,6 +156,41 @@ impl Module {
         !self.link.is_linked()
     }
 
+    /// Return an iterator over the data segments allocated in this module
+    ///
+    /// The iterator is double-ended, so can be used to traverse the segments in either direction.
+    ///
+    /// Data segments are ordered by the address at which they are allocated, in ascending order.
+    pub fn segments<'a, 'b: 'a>(
+        &'b self,
+    ) -> intrusive_collections::linked_list::Iter<'a, DataSegmentAdapter> {
+        self.segments.iter()
+    }
+
+    /// Declare a new [DataSegment] in this module, with the given offset, size, and data.
+    ///
+    /// The effective size is the larger of `size` and the length of `init`. Returns `Err` if the
+    /// segment conflicts with an existing segment; panics if the length of `init` exceeds `u32`.
+    ///
+    /// Data segments are ordered by the address at which they are allocated. At link-time, all
+    /// segments from all modules are linked together, and they must either be disjoint or exactly
+    /// identical - partially overlapping segments with different views of memory are not permitted.
+    pub fn declare_data_segment(
+        &mut self,
+        offset: Offset,
+        size: u32,
+        init: ConstantData,
+        readonly: bool,
+    ) -> Result<(), DataSegmentError> {
+        let init_size = init
+            .len()
+            .try_into()
+            .expect("invalid constant data: must be smaller than 2^32 bytes");
+        let size = core::cmp::max(size, init_size);
+        let init = self.globals.insert_constant(init);
+        self.segments.insert(offset, size, init, readonly)
+    }
+
     /// Return an iterator over the global variables declared in this module
     ///
     /// The iterator is double-ended, so can be used to traverse the globals table in either direction
diff --git a/hir/src/program.rs b/hir/src/program.rs
index a33921620..9fff80057 100644
--- a/hir/src/program.rs
+++ b/hir/src/program.rs
@@ -28,6 +28,8 @@ pub struct Program {
     /// a program or just a collection of modules; and in the case of the former, what code
     /// to emit in the root code block.
     entrypoint: Option,
+    /// The data segments gathered from all modules in the program, and laid out in address order.
+    segments: DataSegmentTable,
     /// The global variable table produced by linking the global variable tables of all
     /// modules in this program. The layout of this table corresponds to the layout of
     /// global variables in the linear memory heap at runtime.
@@ -60,6 +62,7 @@ impl Program {
         Self {
             modules: Default::default(),
             entrypoint: None,
+            segments: Default::default(),
             globals: Default::default(),
         }
     }
@@ -97,6 +100,17 @@ impl Program {
         entry
     }
 
+    /// Return an iterator over the data segments allocated in this program
+    ///
+    /// The iterator is double-ended, so can be used to traverse the segments in either direction.
+    ///
+    /// Data segments are ordered by the address at which they are allocated, in ascending order.
+    pub fn segments<'a, 'b: 'a>(
+        &'b self,
+    ) -> intrusive_collections::linked_list::Iter<'a, DataSegmentAdapter> {
+        self.segments.iter()
+    }
+
     /// Get a reference to the global variable table for this program
     pub fn globals(&self) -> &GlobalVariableTable {
         &self.globals
diff --git a/hir/src/segments.rs b/hir/src/segments.rs
new file mode 100644
index 000000000..c266a272c
--- /dev/null
+++ b/hir/src/segments.rs
@@ -0,0 +1,172 @@
+use std::hash::{Hash, Hasher};
+
+use intrusive_collections::{intrusive_adapter, LinkedList, LinkedListLink, UnsafeRef};
+
+intrusive_adapter!(pub DataSegmentAdapter = UnsafeRef<DataSegment>: DataSegment { link: LinkedListLink });
+
+use super::{Constant, Offset};
+
+/// This error is raised when attempting to declare a [DataSegment]
+/// that in some way conflicts with previously declared data segments.
+#[derive(Debug, thiserror::Error)]
+pub enum DataSegmentError {
+    /// The current segment overlaps with a previously allocated segment
+    #[error("invalid data segment: segment of {size1} bytes at {offset1:#x} overlaps with segment of {size2} bytes at {offset2:#x}")]
+    OverlappingSegments {
+        offset1: Offset,
+        size1: u32,
+        offset2: Offset,
+        size2: u32,
+    },
+    /// The current segment and a previous definition of that segment do
+    /// not agree on the data or read/write properties of the memory they
+    /// represent.
+    #[error("invalid data segment: segment at {0:#x} conflicts with a previous segment declaration at this address")]
+    Mismatch(Offset),
+    /// The current segment and size do not fall in the boundaries of the heap
+    /// which is allocatable to globals and other heap allocations.
+    ///
+    /// For example, Miden reserves some amount of memory for procedure locals
+    /// at a predetermined address, and we do not permit segments to be allocated
+    /// past that point.
+    #[error("invalid data segment: segment of {size} bytes at {offset:#x} would extend beyond the end of the usable heap")]
+    OutOfBounds { offset: Offset, size: u32 },
+}
+
+/// Similar to [GlobalVariableTable], this structure is used to track data segments in a module or program.
+#[derive(Default)]
+pub struct DataSegmentTable {
+    segments: LinkedList<DataSegmentAdapter>,
+}
+impl DataSegmentTable {
+    /// Returns true if the table has no segments defined
+    pub fn is_empty(&self) -> bool {
+        self.segments.is_empty()
+    }
+
+    /// Try to insert a new [DataSegment] in the table, with the given offset, size, and data.
+    ///
+    /// Returns `Err` if `offset + size` overflows, or if the segment overlaps or conflicts with an
+    /// existing segment; re-declaring an identical segment is a no-op. The table is kept ordered
+    /// by the address at which segments are allocated.
+    pub fn insert(
+        &mut self,
+        offset: Offset,
+        size: u32,
+        init: Constant,
+        readonly: bool,
+    ) -> Result<(), DataSegmentError> {
+        // Compute the end of the new segment, rejecting it if that overflows `Offset`
+        let end = offset
+            .checked_add(size)
+            .ok_or(DataSegmentError::OutOfBounds { offset, size })?;
+        let mut cursor = self.segments.front_mut();
+        while let Some(segment) = cursor.get() {
+            let segment_end = segment.offset + segment.size;
+            // If this segment starts after the segment we're declaring,
+            // we do not need to continue searching for conflicts, and
+            // can go ahead and perform the insert
+            if segment.offset >= end {
+                let segment = Box::new(DataSegment::new(offset, size, init, readonly));
+                cursor.insert_before(UnsafeRef::from_box(segment));
+                return Ok(());
+            }
+            // If this segment starts at the same place as the one we're
+            // declaring that's a guaranteed conflict
+            if segment.offset == offset {
+                // If the two segments have the same size and offset, then
+                // if they match in all other respects, we're done. If they
+                // don't match, then we raise a mismatch error.
+                if segment.size == size && segment.init == init && segment.readonly == readonly {
+                    return Ok(());
+                }
+                return Err(DataSegmentError::Mismatch(offset));
+            }
+            // This segment starts before the end of the segment we're declaring,
+            // so it must end at or before the point where our segment starts
+            if segment_end > offset {
+                return Err(DataSegmentError::OverlappingSegments {
+                    offset1: offset,
+                    size1: size,
+                    offset2: segment.offset,
+                    size2: segment.size,
+                });
+            }
+            // This segment lies entirely before ours; advance, or the loop never terminates
+            cursor.move_next();
+        }
+
+        // If we reach here, we didn't find any conflicts, and all segments
+        // that were previously declared occur before the offset at which this
+        // segment is allocated
+        let segment = Box::new(DataSegment::new(offset, size, init, readonly));
+        self.segments.push_back(UnsafeRef::from_box(segment));
+        Ok(())
+    }
+
+    /// Traverse the data segments in the table in ascending order by offset
+    pub fn iter<'a, 'b: 'a>(
+        &'b self,
+    ) -> intrusive_collections::linked_list::Iter<'a, DataSegmentAdapter> {
+        self.segments.iter()
+    }
+}
+
+/// A [DataSegment] represents a region of linear memory that should be initialized
+/// with a given vector of bytes.
+///
+/// This is distinct from [GlobalVariableData], which can be referenced by name,
+/// and participates in linkage. Furthermore, [GlobalVariableData] is only as large
+/// as its type/initializer and alignment require, they cannot be arbitrarily sized.
+///
+/// A data segment has an offset from the start of linear memory, i.e. address 0x0,
+/// and a fixed size, which must be at least as large as the initializer data for
+/// the segment. If the size is larger than the initializer data, then it is implied
+/// that the remaining bytes will be zeroed.
+///
+/// A read-only data segment is used to determine whether a given operation is permitted
+/// on addresses falling in that segment - e.g. loads are allowed, stores are not. Miden
+/// currently does not have any form of memory protection, so this validation is best
+/// effort.
+#[derive(Debug, Clone)]
+pub struct DataSegment {
+    link: LinkedListLink,
+    /// The offset from the start of linear memory where this segment starts
+    pub offset: Offset,
+    /// The size, in bytes, of this data segment; by default this will be the same size as
+    /// `init`, unless explicitly given.
+    pub size: u32,
+    /// The data to initialize this segment with, may not be larger than `size`
+    pub init: Constant,
+    /// Whether or not this segment is intended to be read-only data
+    pub readonly: bool,
+}
+impl DataSegment {
+    /// Creates a [DataSegment] from the given properties, with a default-initialized link
+    pub fn new(offset: Offset, size: u32, init: Constant, readonly: bool) -> Self {
+        Self {
+            link: Default::default(),
+            offset,
+            size,
+            init,
+            readonly,
+        }
+    }
+}
+impl Eq for DataSegment {}
+impl PartialEq for DataSegment {
+    fn eq(&self, other: &Self) -> bool {
+        self.offset == other.offset
+            && self.size == other.size
+            && self.init == other.init
+            && self.readonly == other.readonly
+    }
+}
+impl Hash for DataSegment {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.offset.hash(state);
+        self.size.hash(state);
+        self.init.hash(state);
+        self.readonly.hash(state);
+    }
+}