diff --git a/docs/img/miden/vm/design/chiplets/bitwise/bitwise_execution_trace.png b/docs/img/miden/vm/design/chiplets/bitwise/bitwise_execution_trace.png
new file mode 100644
index 000000000..a554cff6c
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/bitwise/bitwise_execution_trace.png differ
diff --git a/docs/img/miden/vm/design/chiplets/chiplets.png b/docs/img/miden/vm/design/chiplets/chiplets.png
new file mode 100644
index 000000000..e61ac4468
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/chiplets.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher.png b/docs/img/miden/vm/design/chiplets/hasher.png
new file mode 100644
index 000000000..32f7e73be
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher/hash_1_permutation_trace.png b/docs/img/miden/vm/design/chiplets/hasher/hash_1_permutation_trace.png
new file mode 100644
index 000000000..6e1f9fae3
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher/hash_1_permutation_trace.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher/hash_2_to_1_hash.png b/docs/img/miden/vm/design/chiplets/hasher/hash_2_to_1_hash.png
new file mode 100644
index 000000000..524485387
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher/hash_2_to_1_hash.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher/hash_execution_trace.png b/docs/img/miden/vm/design/chiplets/hasher/hash_execution_trace.png
new file mode 100644
index 000000000..1f217c08b
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher/hash_execution_trace.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher/hash_linear_hash_n.png b/docs/img/miden/vm/design/chiplets/hasher/hash_linear_hash_n.png
new file mode 100644
index 000000000..9fae15dc4
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher/hash_linear_hash_n.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher/hash_merkle_tree.png b/docs/img/miden/vm/design/chiplets/hasher/hash_merkle_tree.png
new file mode 100644
index 000000000..996c13f56
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher/hash_merkle_tree.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher/hash_merkle_tree_trace.png b/docs/img/miden/vm/design/chiplets/hasher/hash_merkle_tree_trace.png
new file mode 100644
index 000000000..3cccacd66
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher/hash_merkle_tree_trace.png differ
diff --git a/docs/img/miden/vm/design/chiplets/hasher_bitwise.png b/docs/img/miden/vm/design/chiplets/hasher_bitwise.png
new file mode 100644
index 000000000..9bcaa289d
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/hasher_bitwise.png differ
diff --git a/docs/img/miden/vm/design/chiplets/kernel_rom/kernel_rom_execution_trace.png b/docs/img/miden/vm/design/chiplets/kernel_rom/kernel_rom_execution_trace.png
new file mode 100644
index 000000000..1cde46feb
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/kernel_rom/kernel_rom_execution_trace.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_alternative_design.png b/docs/img/miden/vm/design/chiplets/memory/memory_alternative_design.png
new file mode 100644
index 000000000..2c24b0dbc
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_alternative_design.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_context_separation.png b/docs/img/miden/vm/design/chiplets/memory/memory_context_separation.png
new file mode 100644
index 000000000..f6fe123f8
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_context_separation.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_limitation_diagram.png b/docs/img/miden/vm/design/chiplets/memory/memory_limitation_diagram.png
new file mode 100644
index 000000000..acc1c907e
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_limitation_diagram.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_miden_vm_layout.png b/docs/img/miden/vm/design/chiplets/memory/memory_miden_vm_layout.png
new file mode 100644
index 000000000..f92db1004
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_miden_vm_layout.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_non_contiguous_memory.png b/docs/img/miden/vm/design/chiplets/memory/memory_non_contiguous_memory.png
new file mode 100644
index 000000000..c17bf0dd7
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_non_contiguous_memory.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_read_write.png b/docs/img/miden/vm/design/chiplets/memory/memory_read_write.png
new file mode 100644
index 000000000..828065a5a
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_read_write.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_reading_memory.png b/docs/img/miden/vm/design/chiplets/memory/memory_reading_memory.png
new file mode 100644
index 000000000..6dc10574d
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_reading_memory.png differ
diff --git a/docs/img/miden/vm/design/chiplets/memory/memory_writing_to_memory.png b/docs/img/miden/vm/design/chiplets/memory/memory_writing_to_memory.png
new file mode 100644
index 000000000..3ef5bf10f
Binary files /dev/null and b/docs/img/miden/vm/design/chiplets/memory/memory_writing_to_memory.png differ
diff --git a/docs/img/miden/vm/design/decoder/block_hash_table.png b/docs/img/miden/vm/design/decoder/block_hash_table.png
new file mode 100644
index 000000000..91713e759
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/block_hash_table.png differ
diff --git a/docs/img/miden/vm/design/decoder/constraints/air_decoder_columns.png b/docs/img/miden/vm/design/decoder/constraints/air_decoder_columns.png
new file mode 100644
index 000000000..3db2751ee
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/constraints/air_decoder_columns.png differ
diff --git a/docs/img/miden/vm/design/decoder/constraints/air_decoder_in_spans_column_constraint.png b/docs/img/miden/vm/design/decoder/constraints/air_decoder_in_spans_column_constraint.png
new file mode 100644
index 000000000..e675de266
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/constraints/air_decoder_in_spans_column_constraint.png differ
diff --git a/docs/img/miden/vm/design/decoder/constraints/air_decoder_left_right_child.png b/docs/img/miden/vm/design/decoder/constraints/air_decoder_left_right_child.png
new file mode 100644
index 000000000..81cbb8d99
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/constraints/air_decoder_left_right_child.png differ
diff --git a/docs/img/miden/vm/design/decoder/constraints/air_decoder_op_group_constraint.png b/docs/img/miden/vm/design/decoder/constraints/air_decoder_op_group_constraint.png
new file mode 100644
index 000000000..e52d55d27
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/constraints/air_decoder_op_group_constraint.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_OPERATION_batch_flags.png b/docs/img/miden/vm/design/decoder/decoder_OPERATION_batch_flags.png
new file mode 100644
index 000000000..20cb5f9b3
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_OPERATION_batch_flags.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_block_stack_table.png b/docs/img/miden/vm/design/decoder/decoder_block_stack_table.png
new file mode 100644
index 000000000..d56dae6c4
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_block_stack_table.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_decoding_span_block_with_push.png b/docs/img/miden/vm/design/decoder/decoder_decoding_span_block_with_push.png
new file mode 100644
index 000000000..e90eed184
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_decoding_span_block_with_push.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_dyn_block_decoding.png b/docs/img/miden/vm/design/decoder/decoder_dyn_block_decoding.png
new file mode 100644
index 000000000..955d88bdb
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_dyn_block_decoding.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_dyn_operation.png b/docs/img/miden/vm/design/decoder/decoder_dyn_operation.png
new file mode 100644
index 000000000..93426e0d3
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_dyn_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_end_operation.png b/docs/img/miden/vm/design/decoder/decoder_end_operation.png
new file mode 100644
index 000000000..7b3638357
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_end_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_halt_operation.png b/docs/img/miden/vm/design/decoder/decoder_halt_operation.png
new file mode 100644
index 000000000..3bb9d5639
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_halt_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_imm_vale_op_group_table.png b/docs/img/miden/vm/design/decoder/decoder_imm_vale_op_group_table.png
new file mode 100644
index 000000000..eb659bf65
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_imm_vale_op_group_table.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_join_block_decoding.png b/docs/img/miden/vm/design/decoder/decoder_join_block_decoding.png
new file mode 100644
index 000000000..2e3a4370e
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_join_block_decoding.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_join_operation.png b/docs/img/miden/vm/design/decoder/decoder_join_operation.png
new file mode 100644
index 000000000..a2372e413
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_join_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_loop_execution.png b/docs/img/miden/vm/design/decoder/decoder_loop_execution.png
new file mode 100644
index 000000000..23e010e7f
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_loop_execution.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_loop_operation.png b/docs/img/miden/vm/design/decoder/decoder_loop_operation.png
new file mode 100644
index 000000000..fc9dcb465
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_loop_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_loop_skipping.png b/docs/img/miden/vm/design/decoder/decoder_loop_skipping.png
new file mode 100644
index 000000000..4ae5bdcfd
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_loop_skipping.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_multi_batch_span.png b/docs/img/miden/vm/design/decoder/decoder_multi_batch_span.png
new file mode 100644
index 000000000..311102323
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_multi_batch_span.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_op_group_table.png b/docs/img/miden/vm/design/decoder/decoder_op_group_table.png
new file mode 100644
index 000000000..b814ec2b0
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_op_group_table.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_op_group_table_after_span_op.png b/docs/img/miden/vm/design/decoder/decoder_op_group_table_after_span_op.png
new file mode 100644
index 000000000..4efed18de
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_op_group_table_after_span_op.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_op_group_table_multi_span.png b/docs/img/miden/vm/design/decoder/decoder_op_group_table_multi_span.png
new file mode 100644
index 000000000..1fdf289b2
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_op_group_table_multi_span.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_op_group_table_post_respan.png b/docs/img/miden/vm/design/decoder/decoder_op_group_table_post_respan.png
new file mode 100644
index 000000000..d3d0d862b
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_op_group_table_post_respan.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_operation_group_decoding.png b/docs/img/miden/vm/design/decoder/decoder_operation_group_decoding.png
new file mode 100644
index 000000000..e206f588a
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_operation_group_decoding.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_repeat_operation.png b/docs/img/miden/vm/design/decoder/decoder_repeat_operation.png
new file mode 100644
index 000000000..30194a0e6
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_repeat_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_respan_operation.png b/docs/img/miden/vm/design/decoder/decoder_respan_operation.png
new file mode 100644
index 000000000..899fec543
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_respan_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_single_batch_span.png b/docs/img/miden/vm/design/decoder/decoder_single_batch_span.png
new file mode 100644
index 000000000..20c3fe0f2
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_single_batch_span.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_span_block.png b/docs/img/miden/vm/design/decoder/decoder_span_block.png
new file mode 100644
index 000000000..c36f719b3
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_span_block.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_split_block_decoding.png b/docs/img/miden/vm/design/decoder/decoder_split_block_decoding.png
new file mode 100644
index 000000000..8503fe9e0
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_split_block_decoding.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_split_operation.png b/docs/img/miden/vm/design/decoder/decoder_split_operation.png
new file mode 100644
index 000000000..265d8226f
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_split_operation.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_state_block_hash_2.png b/docs/img/miden/vm/design/decoder/decoder_state_block_hash_2.png
new file mode 100644
index 000000000..eb5290a9d
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_state_block_hash_2.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_state_block_hash_4.png b/docs/img/miden/vm/design/decoder/decoder_state_block_hash_4.png
new file mode 100644
index 000000000..1a0a8b15d
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_state_block_hash_4.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_state_block_hash_6.png b/docs/img/miden/vm/design/decoder/decoder_state_block_hash_6.png
new file mode 100644
index 000000000..8da273fd3
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_state_block_hash_6.png differ
diff --git a/docs/img/miden/vm/design/decoder/decoder_trace.png b/docs/img/miden/vm/design/decoder/decoder_trace.png
new file mode 100644
index 000000000..3db2751ee
Binary files /dev/null and b/docs/img/miden/vm/design/decoder/decoder_trace.png differ
diff --git a/docs/img/miden/vm/design/lookups/logup_component.png b/docs/img/miden/vm/design/lookups/logup_component.png
new file mode 100644
index 000000000..364e7cbd6
Binary files /dev/null and b/docs/img/miden/vm/design/lookups/logup_component.png differ
diff --git a/docs/img/miden/vm/design/lookups/logup_table.png b/docs/img/miden/vm/design/lookups/logup_table.png
new file mode 100644
index 000000000..ef8f14228
Binary files /dev/null and b/docs/img/miden/vm/design/lookups/logup_table.png differ
diff --git a/docs/img/miden/vm/design/programs/call_block.png b/docs/img/miden/vm/design/programs/call_block.png
new file mode 100644
index 000000000..2f65c5026
Binary files /dev/null and b/docs/img/miden/vm/design/programs/call_block.png differ
diff --git a/docs/img/miden/vm/design/programs/dyn_block.png b/docs/img/miden/vm/design/programs/dyn_block.png
new file mode 100644
index 000000000..ff2058161
Binary files /dev/null and b/docs/img/miden/vm/design/programs/dyn_block.png differ
diff --git a/docs/img/miden/vm/design/programs/join_block.png b/docs/img/miden/vm/design/programs/join_block.png
new file mode 100644
index 000000000..9be9bae1d
Binary files /dev/null and b/docs/img/miden/vm/design/programs/join_block.png differ
diff --git a/docs/img/miden/vm/design/programs/loop_block.png b/docs/img/miden/vm/design/programs/loop_block.png
new file mode 100644
index 000000000..87e09781e
Binary files /dev/null and b/docs/img/miden/vm/design/programs/loop_block.png differ
diff --git a/docs/img/miden/vm/design/programs/mast_of_program.png b/docs/img/miden/vm/design/programs/mast_of_program.png
new file mode 100644
index 000000000..9940676e0
Binary files /dev/null and b/docs/img/miden/vm/design/programs/mast_of_program.png differ
diff --git a/docs/img/miden/vm/design/programs/span_block_creation.png b/docs/img/miden/vm/design/programs/span_block_creation.png
new file mode 100644
index 000000000..acf04c3ea
Binary files /dev/null and b/docs/img/miden/vm/design/programs/span_block_creation.png differ
diff --git a/docs/img/miden/vm/design/programs/split_block.png b/docs/img/miden/vm/design/programs/split_block.png
new file mode 100644
index 000000000..33f1da257
Binary files /dev/null and b/docs/img/miden/vm/design/programs/split_block.png differ
diff --git a/docs/img/miden/vm/design/programs/syscall_block.png b/docs/img/miden/vm/design/programs/syscall_block.png
new file mode 100644
index 000000000..abd1b3314
Binary files /dev/null and b/docs/img/miden/vm/design/programs/syscall_block.png differ
diff --git a/docs/img/miden/vm/design/range/rc_16_bit_logup.png b/docs/img/miden/vm/design/range/rc_16_bit_logup.png
new file mode 100644
index 000000000..79dadaf65
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_16_bit_logup.png differ
diff --git a/docs/img/miden/vm/design/range/rc_16_bit_range_check.png b/docs/img/miden/vm/design/range/rc_16_bit_range_check.png
new file mode 100644
index 000000000..9e705ba59
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_16_bit_range_check.png differ
diff --git a/docs/img/miden/vm/design/range/rc_8_bit_logup.png b/docs/img/miden/vm/design/range/rc_8_bit_logup.png
new file mode 100644
index 000000000..b7cc4dbc8
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_8_bit_logup.png differ
diff --git a/docs/img/miden/vm/design/range/rc_8_bit_range_check.png b/docs/img/miden/vm/design/range/rc_8_bit_range_check.png
new file mode 100644
index 000000000..bcedb9c3f
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_8_bit_range_check.png differ
diff --git a/docs/img/miden/vm/design/range/rc_better_construction.png b/docs/img/miden/vm/design/range/rc_better_construction.png
new file mode 100644
index 000000000..3f0e66beb
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_better_construction.png differ
diff --git a/docs/img/miden/vm/design/range/rc_table_post_8_bit_range_check.png b/docs/img/miden/vm/design/range/rc_table_post_8_bit_range_check.png
new file mode 100644
index 000000000..2032e106c
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_table_post_8_bit_range_check.png differ
diff --git a/docs/img/miden/vm/design/range/rc_with_bridge_rows.png b/docs/img/miden/vm/design/range/rc_with_bridge_rows.png
new file mode 100644
index 000000000..df95f5f75
Binary files /dev/null and b/docs/img/miden/vm/design/range/rc_with_bridge_rows.png differ
diff --git a/docs/img/miden/vm/design/stack/crypto-ops/FRIE2F4.png b/docs/img/miden/vm/design/stack/crypto-ops/FRIE2F4.png
new file mode 100644
index 000000000..018888040
Binary files /dev/null and b/docs/img/miden/vm/design/stack/crypto-ops/FRIE2F4.png differ
diff --git a/docs/img/miden/vm/design/stack/crypto-ops/HPERM.png b/docs/img/miden/vm/design/stack/crypto-ops/HPERM.png
new file mode 100644
index 000000000..f6aabc838
Binary files /dev/null and b/docs/img/miden/vm/design/stack/crypto-ops/HPERM.png differ
diff --git a/docs/img/miden/vm/design/stack/crypto-ops/MPVERIFY.png b/docs/img/miden/vm/design/stack/crypto-ops/MPVERIFY.png
new file mode 100644
index 000000000..5a642d125
Binary files /dev/null and b/docs/img/miden/vm/design/stack/crypto-ops/MPVERIFY.png differ
diff --git a/docs/img/miden/vm/design/stack/crypto-ops/MRUPDATE.png b/docs/img/miden/vm/design/stack/crypto-ops/MRUPDATE.png
new file mode 100644
index 000000000..19451d2b2
Binary files /dev/null and b/docs/img/miden/vm/design/stack/crypto-ops/MRUPDATE.png differ
diff --git a/docs/img/miden/vm/design/stack/decorator-operations/DIVRESULTU64.png b/docs/img/miden/vm/design/stack/decorator-operations/DIVRESULTU64.png
new file mode 100644
index 000000000..408e5740b
Binary files /dev/null and b/docs/img/miden/vm/design/stack/decorator-operations/DIVRESULTU64.png differ
diff --git a/docs/img/miden/vm/design/stack/decorator-operations/MERKLENODE.png b/docs/img/miden/vm/design/stack/decorator-operations/MERKLENODE.png
new file mode 100644
index 000000000..68b83399c
Binary files /dev/null and b/docs/img/miden/vm/design/stack/decorator-operations/MERKLENODE.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/ADD.png b/docs/img/miden/vm/design/stack/field-operations/ADD.png
new file mode 100644
index 000000000..1cb8fbb91
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/ADD.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/AND.png b/docs/img/miden/vm/design/stack/field-operations/AND.png
new file mode 100644
index 000000000..c4e4ff69f
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/AND.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/EQ.png b/docs/img/miden/vm/design/stack/field-operations/EQ.png
new file mode 100644
index 000000000..066b8949b
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/EQ.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/EQZ.png b/docs/img/miden/vm/design/stack/field-operations/EQZ.png
new file mode 100644
index 000000000..34055ddc2
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/EQZ.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/EXPACC.png b/docs/img/miden/vm/design/stack/field-operations/EXPACC.png
new file mode 100644
index 000000000..1ed11035a
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/EXPACC.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/EXT2MUL.png b/docs/img/miden/vm/design/stack/field-operations/EXT2MUL.png
new file mode 100644
index 000000000..51a369ed2
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/EXT2MUL.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/INCR.png b/docs/img/miden/vm/design/stack/field-operations/INCR.png
new file mode 100644
index 000000000..579e325c8
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/INCR.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/INV.png b/docs/img/miden/vm/design/stack/field-operations/INV.png
new file mode 100644
index 000000000..66085bc12
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/INV.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/MUL.png b/docs/img/miden/vm/design/stack/field-operations/MUL.png
new file mode 100644
index 000000000..5811b1e98
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/MUL.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/NEG.png b/docs/img/miden/vm/design/stack/field-operations/NEG.png
new file mode 100644
index 000000000..afcd31b31
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/NEG.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/NOT.png b/docs/img/miden/vm/design/stack/field-operations/NOT.png
new file mode 100644
index 000000000..9258b81ee
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/NOT.png differ
diff --git a/docs/img/miden/vm/design/stack/field-operations/OR.png b/docs/img/miden/vm/design/stack/field-operations/OR.png
new file mode 100644
index 000000000..95625324c
Binary files /dev/null and b/docs/img/miden/vm/design/stack/field-operations/OR.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/ADVPOP.png b/docs/img/miden/vm/design/stack/io-ops/ADVPOP.png
new file mode 100644
index 000000000..bbbf4de67
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/ADVPOP.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/ADVPOPW.png b/docs/img/miden/vm/design/stack/io-ops/ADVPOPW.png
new file mode 100644
index 000000000..fe1d63238
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/ADVPOPW.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/MLOAD.png b/docs/img/miden/vm/design/stack/io-ops/MLOAD.png
new file mode 100644
index 000000000..4a1c0a76a
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/MLOAD.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/MLOADW.png b/docs/img/miden/vm/design/stack/io-ops/MLOADW.png
new file mode 100644
index 000000000..9354925e4
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/MLOADW.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/MSTORE.png b/docs/img/miden/vm/design/stack/io-ops/MSTORE.png
new file mode 100644
index 000000000..691b8a110
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/MSTORE.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/MSTOREW.png b/docs/img/miden/vm/design/stack/io-ops/MSTOREW.png
new file mode 100644
index 000000000..085c6cc70
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/MSTOREW.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/MSTREAM.png b/docs/img/miden/vm/design/stack/io-ops/MSTREAM.png
new file mode 100644
index 000000000..b88c2b239
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/MSTREAM.png differ
diff --git a/docs/img/miden/vm/design/stack/io-ops/SDEPTH.png b/docs/img/miden/vm/design/stack/io-ops/SDEPTH.png
new file mode 100644
index 000000000..ce5ca6084
Binary files /dev/null and b/docs/img/miden/vm/design/stack/io-ops/SDEPTH.png differ
diff --git a/docs/img/miden/vm/design/stack/overflow_table_layout.png b/docs/img/miden/vm/design/stack/overflow_table_layout.png
new file mode 100644
index 000000000..bcecf65eb
Binary files /dev/null and b/docs/img/miden/vm/design/stack/overflow_table_layout.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/CSWAP.png b/docs/img/miden/vm/design/stack/stack-ops/CSWAP.png
new file mode 100644
index 000000000..55f3104bd
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/CSWAP.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/CSWAPW.png b/docs/img/miden/vm/design/stack/stack-ops/CSWAPW.png
new file mode 100644
index 000000000..88dbbe3ea
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/CSWAPW.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/DROP.png b/docs/img/miden/vm/design/stack/stack-ops/DROP.png
new file mode 100644
index 000000000..49887e30a
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/DROP.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/DUP(n).png b/docs/img/miden/vm/design/stack/stack-ops/DUP(n).png
new file mode 100644
index 000000000..fedf5667f
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/DUP(n).png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/MOVDN(n).png b/docs/img/miden/vm/design/stack/stack-ops/MOVDN(n).png
new file mode 100644
index 000000000..52b032a3a
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/MOVDN(n).png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/MOVUP(n).png b/docs/img/miden/vm/design/stack/stack-ops/MOVUP(n).png
new file mode 100644
index 000000000..5d0870cb6
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/MOVUP(n).png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/PAD.png b/docs/img/miden/vm/design/stack/stack-ops/PAD.png
new file mode 100644
index 000000000..a94ff693f
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/PAD.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/SWAP.png b/docs/img/miden/vm/design/stack/stack-ops/SWAP.png
new file mode 100644
index 000000000..5f1dc9132
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/SWAP.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/SWAPDW.png b/docs/img/miden/vm/design/stack/stack-ops/SWAPDW.png
new file mode 100644
index 000000000..18fa8680d
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/SWAPDW.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/SWAPW.png b/docs/img/miden/vm/design/stack/stack-ops/SWAPW.png
new file mode 100644
index 000000000..8f6772694
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/SWAPW.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/SWAPW2.png b/docs/img/miden/vm/design/stack/stack-ops/SWAPW2.png
new file mode 100644
index 000000000..d2d23a202
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/SWAPW2.png differ
diff --git a/docs/img/miden/vm/design/stack/stack-ops/SWAPW3.png b/docs/img/miden/vm/design/stack/stack-ops/SWAPW3.png
new file mode 100644
index 000000000..03f85162d
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack-ops/SWAPW3.png differ
diff --git a/docs/img/miden/vm/design/stack/stack_1st_left_shift.png b/docs/img/miden/vm/design/stack/stack_1st_left_shift.png
new file mode 100644
index 000000000..2061cc605
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack_1st_left_shift.png differ
diff --git a/docs/img/miden/vm/design/stack/stack_overflow_push_2nd_item.png b/docs/img/miden/vm/design/stack/stack_overflow_push_2nd_item.png
new file mode 100644
index 000000000..c0d7ff120
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack_overflow_push_2nd_item.png differ
diff --git a/docs/img/miden/vm/design/stack/stack_overflow_table_post_1_right_shift.png b/docs/img/miden/vm/design/stack/stack_overflow_table_post_1_right_shift.png
new file mode 100644
index 000000000..de5b8e16a
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack_overflow_table_post_1_right_shift.png differ
diff --git a/docs/img/miden/vm/design/stack/stack_overflow_table_post_2_right_shift.png b/docs/img/miden/vm/design/stack/stack_overflow_table_post_2_right_shift.png
new file mode 100644
index 000000000..4fe5824ac
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack_overflow_table_post_2_right_shift.png differ
diff --git a/docs/img/miden/vm/design/stack/stack_right_shift.png b/docs/img/miden/vm/design/stack/stack_right_shift.png
new file mode 100644
index 000000000..3b0060bce
Binary files /dev/null and b/docs/img/miden/vm/design/stack/stack_right_shift.png differ
diff --git a/docs/img/miden/vm/design/stack/system-ops/ASSERT.png b/docs/img/miden/vm/design/stack/system-ops/ASSERT.png
new file mode 100644
index 000000000..407a755c3
Binary files /dev/null and b/docs/img/miden/vm/design/stack/system-ops/ASSERT.png differ
diff --git a/docs/img/miden/vm/design/stack/system-ops/CLK.png b/docs/img/miden/vm/design/stack/system-ops/CLK.png
new file mode 100644
index 000000000..ad009037e
Binary files /dev/null and b/docs/img/miden/vm/design/stack/system-ops/CLK.png differ
diff --git a/docs/img/miden/vm/design/stack/system-ops/FMPADD.png b/docs/img/miden/vm/design/stack/system-ops/FMPADD.png
new file mode 100644
index 000000000..88e5e61e4
Binary files /dev/null and b/docs/img/miden/vm/design/stack/system-ops/FMPADD.png differ
diff --git a/docs/img/miden/vm/design/stack/system-ops/FMPUPDATE.png b/docs/img/miden/vm/design/stack/system-ops/FMPUPDATE.png
new file mode 100644
index 000000000..eed660c31
Binary files /dev/null and b/docs/img/miden/vm/design/stack/system-ops/FMPUPDATE.png differ
diff --git a/docs/img/miden/vm/design/stack/trace_layout.png b/docs/img/miden/vm/design/stack/trace_layout.png
new file mode 100644
index 000000000..d12d8fe4e
Binary files /dev/null and b/docs/img/miden/vm/design/stack/trace_layout.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32ADD.png b/docs/img/miden/vm/design/stack/u32-operations/U32ADD.png
new file mode 100644
index 000000000..30bbf8912
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32ADD.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32ADD3.png b/docs/img/miden/vm/design/stack/u32-operations/U32ADD3.png
new file mode 100644
index 000000000..9d1e0ac28
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32ADD3.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32AND.png b/docs/img/miden/vm/design/stack/u32-operations/U32AND.png
new file mode 100644
index 000000000..3d916e2eb
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32AND.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32ASSERT2.png b/docs/img/miden/vm/design/stack/u32-operations/U32ASSERT2.png
new file mode 100644
index 000000000..c32a18b35
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32ASSERT2.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32DIV.png b/docs/img/miden/vm/design/stack/u32-operations/U32DIV.png
new file mode 100644
index 000000000..e71a6dae0
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32DIV.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32MADD.png b/docs/img/miden/vm/design/stack/u32-operations/U32MADD.png
new file mode 100644
index 000000000..4ee5c262e
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32MADD.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32MUL.png b/docs/img/miden/vm/design/stack/u32-operations/U32MUL.png
new file mode 100644
index 000000000..b1ca4ce0c
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32MUL.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32SPLIT.png b/docs/img/miden/vm/design/stack/u32-operations/U32SPLIT.png
new file mode 100644
index 000000000..9c9c6e88b
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32SPLIT.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32SUB.png b/docs/img/miden/vm/design/stack/u32-operations/U32SUB.png
new file mode 100644
index 000000000..ed7bdb734
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32SUB.png differ
diff --git a/docs/img/miden/vm/design/stack/u32-operations/U32XOR.png b/docs/img/miden/vm/design/stack/u32-operations/U32XOR.png
new file mode 100644
index 000000000..1dc6524df
Binary files /dev/null and b/docs/img/miden/vm/design/stack/u32-operations/U32XOR.png differ
diff --git a/docs/img/miden/vm/design/vm_trace.png b/docs/img/miden/vm/design/vm_trace.png
new file mode 100644
index 000000000..fd5b1fd3e
Binary files /dev/null and b/docs/img/miden/vm/design/vm_trace.png differ
diff --git a/docs/img/miden/vm/intro/vm_components.png b/docs/img/miden/vm/intro/vm_components.png
new file mode 100644
index 000000000..109cb8d7e
Binary files /dev/null and b/docs/img/miden/vm/intro/vm_components.png differ
diff --git a/docs/img/miden/vm/user-docs/assembly/assembly_to_VM.png b/docs/img/miden/vm/user-docs/assembly/assembly_to_VM.png
new file mode 100644
index 000000000..2e3950467
Binary files /dev/null and b/docs/img/miden/vm/user-docs/assembly/assembly_to_VM.png differ
diff --git a/docs/img/miden/vm/user-docs/assembly/execution_contexts/context_transitions.png b/docs/img/miden/vm/user-docs/assembly/execution_contexts/context_transitions.png
new file mode 100644
index 000000000..2dd3bb49e
Binary files /dev/null and b/docs/img/miden/vm/user-docs/assembly/execution_contexts/context_transitions.png differ
diff --git a/docs/img/miden/vm/user-docs/assembly/execution_contexts/root_mem_layout.png b/docs/img/miden/vm/user-docs/assembly/execution_contexts/root_mem_layout.png
new file mode 100644
index 000000000..d082a363e
Binary files /dev/null and b/docs/img/miden/vm/user-docs/assembly/execution_contexts/root_mem_layout.png differ
diff --git a/docs/img/miden/vm/user-docs/assembly/execution_contexts/user_mem_layout.png b/docs/img/miden/vm/user-docs/assembly/execution_contexts/user_mem_layout.png
new file mode 100644
index 000000000..645aaa1b5
Binary files /dev/null and b/docs/img/miden/vm/user-docs/assembly/execution_contexts/user_mem_layout.png differ
diff --git a/docs/img/miden/vm/user-docs/assembly/overview/miden_vm_overview.png b/docs/img/miden/vm/user-docs/assembly/overview/miden_vm_overview.png
new file mode 100644
index 000000000..8572b4583
Binary files /dev/null and b/docs/img/miden/vm/user-docs/assembly/overview/miden_vm_overview.png differ
diff --git a/docs/miden/architecture/accounts.md b/docs/miden/architecture/accounts.md
index 82d96b909..9d4dcc0ca 100644
--- a/docs/miden/architecture/accounts.md
+++ b/docs/miden/architecture/accounts.md
@@ -51,7 +51,7 @@ An account vault can be reduced to a single hash which is the root of the sparse
 
 Interface for accounts. In Miden every account is a smart contract. It has an interface that exposes functions that can be called by note scripts. Functions exposed by the account have the following properties:
 
-* Functions are actually roots of [Miden program MASTs](https://0xpolygonmiden.github.io/miden-vm/user_docs/assembly/main.html) (i.e., a 32-byte hash). Thus, function identifier is a commitment to the code which is executed when a function is invoked.
+* Functions are actually roots of [Miden program MASTs](https://0xpolygonmiden.github.io/miden-vm/user-docs/assembly/main.html) (i.e., a 32-byte hash). Thus, a function identifier is a commitment to the code which is executed when a function is invoked.
 * Only account functions have mutable access to an account's storage and vault. Therefore, the only way to modify an account's internal state is through one of the account's functions.
 * Account functions can take parameters and can create new notes.
 
diff --git a/docs/miden/architecture/notes.md b/docs/miden/architecture/notes.md
index 4479c1c02..ca8af2997 100644
--- a/docs/miden/architecture/notes.md
+++ b/docs/miden/architecture/notes.md
@@ -21,7 +21,7 @@ Asset container for a note. A note vault can contain up to `255` assets stored i
 
 ### Script
 
-Unlike an account, a note has a single executable script. This script will be executed in a [transaction](transactions.md). This script is also the root of a [Miden program MAST](https://0xpolygonmiden.github.io/miden-vm/user_docs/assembly/main.html). A script is always executed in the context of a single account, and thus, may invoke account's functions. A note's script can call zero or more of an account's function.
+Unlike an account, a note has a single executable script. This script will be executed in a [transaction](transactions.md). This script is also the root of a [Miden program MAST](https://0xpolygonmiden.github.io/miden-vm/user-docs/assembly/main.html). A script is always executed in the context of a single account, and thus, may invoke the account's functions. A note's script can call zero or more of an account's functions.
 
 !!! note
     Since code in Miden is expressed as MAST, every function is a commitment to the underlying code. The code cannot change unnoticed to the user because its hash would change.*
diff --git a/docs/miden/vm/background.md b/docs/miden/vm/background.md
new file mode 100644
index 000000000..90d212576
--- /dev/null
+++ b/docs/miden/vm/background.md
@@ -0,0 +1,29 @@
+Proofs of execution generated by Miden VM are based on STARKs. A STARK is a novel proof-of-computation scheme that allows you to create an efficiently verifiable proof that a computation was executed correctly. The scheme was developed by Eli Ben-Sasson, Michael Riabzev et al. at Technion - Israel Institute of Technology. STARKs do not require an initial trusted setup, and rely on very few cryptographic assumptions.
+
+Here are some resources to learn more about STARKs:
+
+* STARKs paper: [Scalable, transparent, and post-quantum secure computational integrity](https://eprint.iacr.org/2018/046)
+* STARKs vs. SNARKs: [A Cambrian Explosion of Crypto Proofs](https://nakamoto.com/cambrian-explosion-of-crypto-proofs/)
+
+Vitalik Buterin's blog series on zk-STARKs:
+
+* [STARKs, part 1: Proofs with Polynomials](https://vitalik.ca/general/2017/11/09/starks_part_1.html)
+* [STARKs, part 2: Thank Goodness it's FRI-day](https://vitalik.ca/general/2017/11/22/starks_part_2.html)
+* [STARKs, part 3: Into the Weeds](https://vitalik.ca/general/2018/07/21/starks_part_3.html)
+
+Alan Szepieniec's STARK tutorials:
+
+* [Anatomy of a STARK](https://aszepieniec.github.io/stark-anatomy/)
+* [BrainSTARK](https://aszepieniec.github.io/stark-brainfuck/)
+
+StarkWare's STARK Math blog series:
+
+* [STARK Math: The Journey Begins](https://medium.com/starkware/stark-math-the-journey-begins-51bd2b063c71)
+* [Arithmetization I](https://medium.com/starkware/arithmetization-i-15c046390862)
+* [Arithmetization II](https://medium.com/starkware/arithmetization-ii-403c3b3f4355)
+* [Low Degree Testing](https://medium.com/starkware/low-degree-testing-f7614f5172db)
+* [A Framework for Efficient STARKs](https://medium.com/starkware/a-framework-for-efficient-starks-19608ba06fbe)
+
+StarkWare's STARK tutorial:
+
+* [STARK 101](https://starkware.co/stark-101/)
diff --git a/docs/miden/vm/design/chiplets/bitwise.md b/docs/miden/vm/design/chiplets/bitwise.md
new file mode 100644
index 000000000..c4847d55c
--- /dev/null
+++ b/docs/miden/vm/design/chiplets/bitwise.md
@@ -0,0 +1,159 @@
+In this note we describe how to compute bitwise AND and XOR operations on 32-bit values and the constraints required for proving correct execution.
+
+Assume that $a$ and $b$ are field elements in a 64-bit prime field. Assume also that $a$ and $b$ are known to contain values smaller than $2^{32}$. We want to compute $a \oplus b \rightarrow z$, where $\oplus$ is either bitwise AND or XOR, and $z$ is a field element containing the result of the corresponding bitwise operation.
+
+First, observe that we can compute AND and XOR relations for **single bit values** as follows:
+
+$$
+and(a, b) = a \cdot b
+$$
+
+$$
+xor(a, b) = a + b - 2 \cdot a \cdot b
+$$
+
+To compute bitwise operations for multi-bit values, we will decompose the values into individual bits, apply the operations to single bits, and then aggregate the bitwise results into the final result.
+
+To perform this operation we will use a table with 12 columns, and computing a single AND or XOR operation will require 8 table rows. We will also rely on two periodic columns as shown below.
+
+![bitwise_execution_trace](../../../../img/miden/vm/design/chiplets/bitwise/bitwise_execution_trace.png)
+
+In the above, the columns have the following meanings:
+
+- Periodic columns $k_0$ and $k_1$. These columns contain values needed to switch various constraints on or off. $k_0$ contains a repeating sequence of a single one, followed by seven zeros. $k_1$ contains a repeating sequence of seven ones, followed by a single zero.
+- Input columns $a$ and $b$. On the first row of each 8-row cycle, the prover will set values in these columns to the upper 4 bits of the values to which a bitwise operation is to be applied. For all subsequent rows, we will append the next-most-significant 4-bit limb to each value. Thus, by the final row columns $a$ and $b$ will contain the full input values for the bitwise operation.
+- Columns $a_0$, $a_1$, $a_2$, $a_3$, $b_0$, $b_1$, $b_2$, $b_3$ will contain the lower 4 bits of their corresponding values.
+- Output column $z_p$. This column represents the value of column $z$ for the prior row. For the first row, it is set to $0$.
+- Output column $z$. This column will be used to aggregate the results of bitwise operations performed over columns $a_0$, $a_1$, $a_2$, $a_3$, $b_0$, $b_1$, $b_2$, $b_3$. By the time we get to the last row in each 8-row cycle, this column will contain the final result.
+
+## Example
+
+Let's illustrate the above table on a concrete example. For simplicity, we'll use 16-bit values, and thus, we'll only need 4 rows to complete the operation (rather than 8 for 32-bit values). Let's say $a = 41851$ (`b1010_0011_0111_1011`) and $b = 40426$ (`b1001_1101_1110_1010`), then $and(a, b) = 33130$ (`b1000_0001_0110_1010`). The table for this computation looks like so:
+
+|   a   |   b   | a0  | a1  | a2  | a3  | b0  | b1  | b2  | b3  |   zp   |   z   |
+| :---: | :---: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :----: | :---: |
+|  10   |   9   |  0  |  1  |  0  |  1  |  1  |  0  |  0  |  1  |   0    |   8   |
+|  163  |  157  |  1  |  1  |  0  |  0  |  1  |  0  |  1  |  1  |   8    |  129  |
+| 2615  | 2526  |  1  |  1  |  1  |  0  |  0  |  1  |  1  |  1  |  129   | 2070  |
+| 41851 | 40426 |  1  |  1  |  0  |  1  |  0  |  1  |  0  |  1  |  2070  | 33130 |
+
+Here, in the first row, we set each of the $a$ and $b$ columns to the value of their most-significant 4-bit limb. The bit columns ($a_0 .. a_3$ and $b_0 .. b_3$) in the first row contain the lower 4 bits of their corresponding values (`b1010` and `b1001`). Column $z$ contains the result of bitwise AND for the upper 4 bits (`b1000`), while column $z_p$ contains that result for the prior row.
+
+With every subsequent row, we inject the next-most-significant 4 bits of each value into the bit columns, increase the $a$ and $b$ columns accordingly, and aggregate the result of bitwise AND into the $z$ column, adding it to $2^4$ times the value of $z$ in the previous row. We set column $z_p$ to be the value of $z$ in the prior row. By the time we get to the last row, the $z$ column contains the result of the bitwise AND, while columns $a$ and $b$ contain their original values.
+
+## Constraints
+
+AIR constraints needed to ensure the correctness of the above table are described below. We also add one more column $s$ to the execution trace, to allow us to select between two bitwise operations (`U32AND` and `U32XOR`).
+
+### Selectors
+
+The Bitwise chiplet supports two operations with the following operation selectors:
+
+- `U32AND`: $s = 0$
+- `U32XOR`: $s = 1$
+
+The constraints must require that the selectors be binary and stay the same throughout the cycle:
+
+> $$
+s^2 - s = 0 \text{ | degree} = 2
+$$
+
+> $$
+k_1 \cdot (s' - s) = 0 \text{ | degree} = 2
+$$
+
+### Input decomposition
+
+We need to make sure that inputs $a$ and $b$ are decomposed correctly into their individual bits. To do this, first, we need to make sure that columns $a_0$, $a_1$, $a_2$, $a_3$, $b_0$, $b_1$, $b_2$, $b_3$, can contain only binary values ($0$ or $1$). This can be accomplished with the following constraints (for $i$ ranging between $0$ and $3$):
+
+> $$
+a_i^2 - a_i = 0 \text{ | degree} = 2
+$$
+
+> $$
+b_i^2 - b_i = 0 \text{ | degree} = 2
+$$
+
+Then, we need to make sure that on the first row of every 8-row cycle, the values in the columns $a$ and $b$ are exactly equal to the aggregation of binary values contained in the individual bit columns $a_i$, and $b_i$. This can be enforced with the following constraints:
+
+> $$
+k_0 \cdot \left(a - \sum_{i=0}^3(2^i \cdot a_i)\right) = 0 \text{ | degree} = 2
+$$
+
+> $$
+k_0 \cdot \left(b - \sum_{i=0}^3(2^i \cdot b_i)\right) = 0 \text{ | degree} = 2
+$$
+
+The above constraints enforce that when $k_0 = 1$, $a = \sum_{i=0}^3(2^i \cdot a_i)$ and $b = \sum_{i=0}^3(2^i \cdot b_i)$.
+
+Lastly, we need to make sure that for all rows in an 8-row cycle except for the last one, the values in the $a$ and $b$ columns in the next row are equal to the current values multiplied by 16 (i.e., shifted left by 4 bits) plus the values contained in the individual bit columns $a_i$ and $b_i$ of the next row. Denoting $a$ as the value of column $a$ in the current row, and $a'$ as the value of column $a$ in the next row, we can enforce these conditions as follows:
+
+> $$
+k_1 \cdot \left(a' - \left(a \cdot 16 + \sum_{i=0}^3(2^i \cdot a'_i)\right)\right) = 0 \text{ | degree} = 2
+$$
+
+> $$
+k_1 \cdot \left(b' - \left(b \cdot 16 + \sum_{i=0}^3(2^i \cdot b'_i)\right)\right) = 0 \text{ | degree} = 2
+$$
+
+The above constraints enforce that when $k_1 = 1$, $a' = 16 \cdot a + \sum_{i=0}^3(2^i \cdot a'_i)$ and $b' = 16 \cdot b + \sum_{i=0}^3(2^i \cdot b'_i)$.
+
+### Output aggregation
+
+To ensure correct aggregation of operations over individual bits, first we need to ensure that in the first row of every 8-row cycle, the aggregated output value of the previous row is 0.
+> $$
+k_0 \cdot z_p = 0 \text{ | degree} = 2
+$$
+
+Next, we need to ensure that for each row except the last, the aggregated output value must equal the previous aggregated output value in the next row.
+> $$
+k_1 \cdot \left(z - z'_p\right) = 0 \text{ | degree} = 2
+$$
+
+Lastly, we need to ensure that for all rows the value in the $z$ column is computed by multiplying the previous output value (from the $z_p$ column in the current row) by 16 and then adding it to the bitwise operation applied to the row's set of bits of $a$ and $b$. The entire constraint must also be multiplied by the operation selector flag to ensure it is only applied for the appropriate operation.
+
+For `U32AND`, this is enforced with the following constraint:
+
+> $$
+(1 - s) \cdot \left(z -(z_p \cdot 16 + \sum_{i=0}^3(2^i \cdot a_i \cdot b_i))\right) = 0 \text{ | degree} = 3
+$$
+
+For `U32XOR`, this is enforced with the following constraint:
+
+> $$
+s \cdot \left(z -(z_p \cdot 16 + \sum_{i=0}^3(2^i \cdot (a_i + b_i - 2 \cdot a_i \cdot b_i)))\right) = 0 \text{ | degree} = 3
+$$
+
+## Chiplets bus constraints
+
+To simplify the notation for describing bitwise constraints on the chiplets bus, we'll first define variable $u$, which represents how $a$, $b$, and $z$ in the execution trace are reduced to a single value. Denoting the random values received from the verifier as $\alpha_0, \alpha_1$, etc., this can be achieved as follows.
+
+$$
+u = \alpha_0 + \alpha_1 \cdot op_{bit} + \alpha_2 \cdot a + \alpha_3 \cdot b + \alpha_4 \cdot z
+$$
+
+Where, $op_{bit}$ is the unique [operation label](index.md#operation-labels) of the bitwise operation.
+
+The request side of the constraint for the bitwise operation is described in the [stack bitwise operation section](../stack/u32-ops.md#u32and).
+
+To provide the results of bitwise operations to the chiplets bus, we want to include values of $a$, $b$ and $z$ at the last row of the cycle.
+
+First, we'll define another intermediate variable $v_i$. It will include $u$ into the product when $k_1 = 0$. ($u_i$ represents the value of $u$ for row $i$ of the trace.)
+
+$$
+v_i = (1-k_1) \cdot u_i
+$$
+
+Then, setting $m = 1 - k_1$, we can compute the permutation product from the bitwise chiplet as follows:
+
+$$
+\prod_{i=0}^n (v_i \cdot m_i + 1 - m_i)
+$$
+
+The above ensures that when $1 - k_1 = 0$ (which is true for all rows in the 8-row cycle except for the last one), the product does not change. Otherwise, $v_i$ gets included into the product.
+
+The response side of the bus communication can be enforced with the following constraint:
+
+> $$
+b'_{chip} = b_{chip} \cdot (v_i \cdot m_i + 1 - m_i) \text{ | degree} = 4
+$$
diff --git a/docs/miden/vm/design/chiplets/hasher.md b/docs/miden/vm/design/chiplets/hasher.md
new file mode 100644
index 000000000..42735293a
--- /dev/null
+++ b/docs/miden/vm/design/chiplets/hasher.md
@@ -0,0 +1,438 @@
+Miden VM "offloads" all hash-related computations to a separate _hash processor_. This chiplet supports executing the [Rescue Prime Optimized](https://eprint.iacr.org/2022/1577) hash function (or rather a [specific instantiation](https://docs.rs/miden-crypto/latest/miden_crypto/hash/rpo/struct.Rpo256.html) of it) in the following settings:
+
+- A single permutation of Rescue Prime Optimized.
+- A simple 2-to-1 hash.
+- A linear hash of $n$ field elements.
+- Merkle path verification.
+- Merkle root update.
+
+The chiplet can be thought of as having a small instruction set of $11$ instructions. These instructions are listed below, and examples of how these instructions are used by the chiplet are described in the following sections.
+
+| Instruction | Description                                                                                                                                                                                                                                                                                                        |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `HR`        | Executes a single round of the VM's native hash function. All cycles which are not one less than a multiple of $8$ execute this instruction. That is, the chiplet executes this instruction on cycles $0, 1, 2, 3, 4, 5, 6$, but not $7$, and then again, $8, 9, 10, 11, 12, 13, 14$, but not $15$ etc.            |
+| `BP`        | Initiates computation of a single permutation, a 2-to-1 hash, or a linear hash of many elements. This instruction can be executed only on cycles which are multiples of $8$, and it can also be executed concurrently with an `HR` instruction.                                                                    |
+| `MP`        | Initiates Merkle path verification computation. This instruction can be executed only on cycles which are multiples of $8$, and it can also be executed concurrently with an `HR` instruction.                                                                                                                     |
+| `MV`        | Initiates Merkle path verification for the "old" node value during Merkle root update computation. This instruction can be executed only on cycles which are multiples of $8$, and it can also be executed concurrently with an `HR` instruction.                                                                  |
+| `MU`        | Initiates Merkle path verification for the "new" node value during Merkle root update computation. This instruction can be executed only on cycles which are multiples of $8$, and it can also be executed concurrently with an `HR` instruction.                                                                  |
+| `HOUT`      | Returns the result of the currently running computation. This instruction can be executed only on cycles which are one less than a multiple of $8$ (e.g. $7$, $15$ etc.).                                                                                                                                          |
+| `SOUT`      | Returns the whole hasher state. This instruction can be executed only on cycles which are one less than a multiple of $8$, and only if the computation was started using `BP` instruction.                                                                                                                         |
+| `ABP`       | Absorbs a new set of elements into the hasher state when computing a linear hash of many elements. This instruction can be executed only on cycles which are one less than a multiple of $8$, and only if the computation was started using `BP` instruction.                                                      |
+| `MPA`       | Absorbs the next Merkle path node into the hasher state during Merkle path verification computation. This instruction can be executed only on cycles which are one less than a multiple of $8$, and only if the computation was started using `MP` instruction.                                                    |
+| `MVA`       | Absorbs the next Merkle path node into the hasher state during Merkle path verification for the "old" node value during Merkle root update computation. This instruction can be executed only on cycles which are one less than a multiple of $8$, and only if the computation was started using `MV` instruction. |
+| `MUA`       | Absorbs the next Merkle path node into the hasher state during Merkle path verification for the "new" node value during Merkle root update computation. This instruction can be executed only on cycles which are one less than a multiple of $8$, and only if the computation was started using `MU` instruction. |
+
+## Chiplet trace
+
+Execution trace table of the chiplet consists of $16$ trace columns and $3$ periodic columns. The structure of the table is such that a single permutation of the hash function can be computed using $8$ table rows. The layout of the table is illustrated below.
+
+![hash_execution_trace](../../../../img/miden/vm/design/chiplets/hasher/hash_execution_trace.png)
+
+The meaning of the columns is as follows:
+
+- Three periodic columns $k_0$, $k_1$, and $k_2$ are used to help select the instruction executed at a given row. All of these columns contain patterns which repeat every $8$ rows. For $k_0$ the pattern is $7$ zeros followed by $1$ one, helping us identify the last row in the cycle. For $k_1$ the pattern is $6$ zeros, $1$ one, and $1$ zero, which can be used to identify the second-to-last row in a cycle. For $k_2$ the pattern is $1$ one followed by $7$ zeros, which can identify the first row in the cycle.
+- Three selector columns $s_0$, $s_1$, and $s_2$. These columns can contain only binary values (ones or zeros), and they are also used to help select the instruction to execute at a given row.
+- Twelve hasher state columns $h_0, ..., h_{11}$. These columns are used to hold the hasher state for each round of the hash function permutation. The state is laid out as follows:
+  - The first four columns ($h_0, ..., h_3$) are reserved for capacity elements of the state. When the state is initialized for hash computations, $h_0$ should be set to $0$ if the number of elements to be hashed is a multiple of the rate width ($8$). Otherwise, $h_0$ should be set to $1$. $h_1$ should be set to the domain value if a domain has been provided (as in the case of [control block hashing](../programs.md#program-hash-computation)).  All other capacity elements should be set to $0$'s.
+  - The next eight columns ($h_4, ..., h_{11}$) are reserved for the rate elements of the state. These are used to absorb the values to be hashed. Once the permutation is complete, hash output is located in the first four rate columns ($h_4, ..., h_7$).
+- One index column $i$. This column is used to help with Merkle path verification and Merkle root update computations.
+
+In addition to the columns described above, the chiplet relies on two running product columns which are used to facilitate multiset checks (similar to the ones described [here](https://hackmd.io/@arielg/ByFgSDA7D)). These columns are:
+
+- $b_{chip}$ - which is used to tie the chiplet table with the main VM's stack and decoder. That is, values representing inputs consumed by the chiplet and outputs produced by the chiplet are multiplied into $b_{chip}$, while the main VM stack (or decoder) divides them out of $b_{chip}$. Thus, if the sets of inputs and outputs between the main VM stack and hash chiplet are the same, the value of $b_{chip}$ should be equal to $1$ at the start and the end of the execution trace.
+- $p_1$ - which is used to keep track of the *sibling* table used for Merkle root update computations. Specifically, when a root for the old leaf value is computed, we add an entry for all sibling nodes to the table (i.e., we multiply $p_1$ by the values representing these entries). When the root for the new leaf value is computed, we remove the entries for the nodes from the table (i.e., we divide $p_1$ by the value representing these entries). Thus, if both computations used the same set of sibling nodes (in the same order), the sibling table should be empty by the time Merkle root update procedure completes (i.e., the value of $p_1$ would be $1$).
+
+## Instruction flags
+
+As mentioned above, chiplet instructions are encoded using a combination of periodic and selector columns. These columns can be used to compute a binary flag for each instruction. Thus, when a flag for a given instruction is set to $1$, the chiplet executes this instruction. Formulas for computing instruction flags are listed below.
+
+| Flag       | Value                                                 | Notes                                                                                             |
+| ---------- | ----------------------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| $f_{rpr}$  | $1 - k_0$                                             | Set to $1$ on the first $7$ steps of every $8$-step cycle.                                        |
+| $f_{bp}$   | $k_2 \cdot s_0 \cdot (1 - s_1) \cdot (1 - s_2)$       | Set to $1$ when selector flags are $(1, 0, 0)$ on rows which are multiples of $8$.                |
+| $f_{mp}$   | $k_2 \cdot s_0 \cdot (1 - s_1) \cdot s_2$             | Set to $1$ when selector flags are $(1, 0, 1)$ on rows which are multiples of $8$.                |
+| $f_{mv}$   | $k_2 \cdot s_0 \cdot s_1 \cdot (1 - s_2)$             | Set to $1$ when selector flags are $(1, 1, 0)$ on rows which are multiples of $8$.                |
+| $f_{mu}$   | $k_2 \cdot s_0 \cdot s_1 \cdot s_2$                   | Set to $1$ when selector flags are $(1, 1, 1)$ on rows which are multiples of $8$.                |
+| $f_{hout}$ | $k_0 \cdot (1 - s_0) \cdot (1 - s_1) \cdot (1 - s_2)$ | Set to $1$ when selector flags are $(0, 0, 0)$ on rows which are $1$ less than a multiple of $8$. |
+| $f_{sout}$ | $k_0 \cdot (1 - s_0) \cdot (1 - s_1) \cdot s_2$       | Set to $1$ when selector flags are $(0, 0, 1)$ on rows which are $1$ less than a multiple of $8$. |
+| $f_{abp}$  | $k_0 \cdot s_0 \cdot (1 - s_1) \cdot (1 - s_2)$       | Set to $1$ when selector flags are $(1, 0, 0)$ on rows which are $1$ less than a multiple of $8$. |
+| $f_{mpa}$  | $k_0 \cdot s_0 \cdot (1 - s_1) \cdot s_2$             | Set to $1$ when selector flags are $(1, 0, 1)$ on rows which are $1$ less than a multiple of $8$. |
+| $f_{mva}$  | $k_0 \cdot s_0 \cdot s_1 \cdot (1 - s_2)$             | Set to $1$ when selector flags are $(1, 1, 0)$ on rows which are $1$ less than a multiple of $8$. |
+| $f_{mua}$  | $k_0 \cdot s_0 \cdot s_1 \cdot s_2$                   | Set to $1$ when selector flags are $(1, 1, 1)$ on rows which are $1$ less than a multiple of $8$. |
+
+A few additional notes about flag values:
+
+- With the exception of $f_{rpr}$, all flags are mutually exclusive. That is, if one flag is set to $1$, all other flags are set to $0$.
+- With the exception of $f_{rpr}$, computing flag values involves $3$ multiplications, and thus the degree of these flags is $4$.
+- We can also define a flag $f_{out} = k_0 \cdot (1 - s_0) \cdot (1 - s_1)$. This flag will be set to $1$ when either $f_{hout}=1$ or $f_{sout}=1$ in the current row.
+- We can define a flag $f_{out}' = k_1 \cdot (1 - s_0') \cdot (1 - s_1')$. This flag will be set to $1$ when either $f_{hout}=1$ or $f_{sout}=1$ in the next row.
+
+We also impose the following restrictions on how values in selector columns can be updated:
+
+- Values in columns $s_1$ and $s_2$ must be copied over from one row to the next, unless $f_{out} = 1$ or $f_{out}' = 1$ indicating the `hout` or `sout` flag is set for the current or the next row.
+- Value in $s_0$ must be set to $1$ if $f_{out}=1$ for the previous row, and to $0$ if any of the flags $f_{abp}$, $f_{mpa}$, $f_{mva}$, or $f_{mua}$ are set to $1$ for the previous row.
+
+The above rules ensure that we must finish one computation before starting another, and we can't change the type of the computation before the computation is finished.
+
+## Computation examples
+
+### Single permutation
+
+Computing a single permutation of Rescue Prime Optimized hash function involves the following steps:
+
+1. Initialize hasher state with $12$ field elements.
+2. Apply Rescue Prime Optimized permutation.
+3. Return the entire hasher state as output.
+
+The chiplet accomplishes the above by executing the following instructions:
+
+```
+[BP, HR]                 // init state and execute a hash round (concurrently)
+HR HR HR HR HR HR        // execute 6 more hash rounds
+SOUT                     // return the entire state as output
+```
+
+Execution trace for this computation would look as illustrated below.
+
+![hash_1_permutation_trace](../../../../img/miden/vm/design/chiplets/hasher/hash_1_permutation_trace.png)
+
+In the above $\{a_0, ..., a_{11}\}$ is the input state of the hasher, and $\{b_0, ..., b_{11}\}$ is the output state of the hasher.
+
+### Simple 2-to-1 hash
+
+Computing a 2-to-1 hash involves the following steps:
+
+1. Initialize hasher state with $8$ field elements, setting the second capacity element to $domain$ if the domain is provided (as in the case of [control block hashing](../programs.md#program-hash-computation)) or else $0$, and the remaining capacity elements to $0$.
+2. Apply Rescue Prime Optimized permutation.
+3. Return elements $[4, 8)$ of the hasher state as output.
+
+The chiplet accomplishes the above by executing the following instructions:
+
+```
+[BP, HR]                 // init state and execute a hash round (concurrently)
+HR HR HR HR HR HR        // execute 6 more hash rounds
+HOUT                     // return elements 4, 5, 6, 7 of the state as output
+```
+
+Execution trace for this computation would look as illustrated below.
+
+![hash_2_to_1_hash](../../../../img/miden/vm/design/chiplets/hasher/hash_2_to_1_hash.png)
+
+In the above, we compute the following:
+
+$$
+\{c_0, c_1, c_2, c_3\} \leftarrow hash(\{a_0, a_1, a_2, a_3\}, \{b_0, b_1, b_2, b_3\})
+$$
+
+### Linear hash of n elements
+
+Computing a linear hash of $n$ elements consists of the following steps:
+
+1. Initialize hasher state with the first $8$ elements, setting the first capacity register to $0$ if $n$ is a multiple of the rate width ($8$) or else $1$, and the remaining capacity elements to $0$.
+2. Apply Rescue Prime Optimized permutation.
+3. Absorb the next set of elements into the state (up to $8$ elements), while keeping capacity elements unchanged.
+4. Repeat steps 2 and 3 until all $n$ elements have been absorbed.
+5. Return elements $[4, 8)$ of the hasher state as output.
+
+The chiplet accomplishes the above by executing the following instructions (for hashing $16$ elements):
+
+```
+[BP, HR]                    // init state and execute a hash round (concurrently)
+HR HR HR HR HR HR           // execute 6 more hash rounds
+ABP                         // absorb the next set of elements into the state
+HR HR HR HR HR HR HR        // execute 7 hash rounds
+HOUT                        // return elements 4, 5, 6, 7 of the state as output
+```
+
+Execution trace for this computation would look as illustrated below.
+
+![hash_linear_hash_n](../../../../img/miden/vm/design/chiplets/hasher/hash_linear_hash_n.png)
+
+In the above, the value absorbed into hasher state between rows $7$ and $8$ is the delta between values $t_i$ and $s_i$. Thus, if we define $b_i = t_i - s_i$ for $i \in [0, 8)$, the above computes the following:
+
+$$
+\{r_0, r_1, r_2, r_3\} \leftarrow hash(a_0, ..., a_7, b_0, ..., b_7)
+$$
+
+### Verify Merkle path
+
+Verifying a Merkle path involves the following steps:
+
+1. Initialize hasher state with the leaf and the first node of the path, setting all capacity elements to $0$s.
+   a. Also, initialize the index register to the leaf's index value.
+2. Apply Rescue Prime Optimized permutation.
+   a. Make sure the index value doesn't change during this step.
+3. Copy the result of the hash to the next row, and absorb the next node of the Merkle path into the hasher state.
+   a. Remove a single bit from the index, and use it to determine how to place the copied result and absorbed node in the state.
+4. Repeat steps 2 and 3 until all nodes of the Merkle path have been absorbed.
+5. Return elements $[4, 8)$ of the hasher state as output.
+   a. Also, make sure the index value has been reduced to $0$.
+
+The chiplet accomplishes the above by executing the following instructions (for Merkle tree of depth $3$):
+
+```
+[MP, HR]                    // init state and execute a hash round (concurrently)
+HR HR HR HR HR HR           // execute 6 more hash rounds
+MPA                         // copy result & absorb the next node into the state
+HR HR HR HR HR HR HR        // execute 7 hash rounds
+HOUT                        // return elements 4, 5, 6, 7 of the state as output
+```
+
+Suppose we have a Merkle tree as illustrated below. This Merkle tree has $4$ leaves, each of which consists of $4$ field elements. For example, leaf $a$ consists of elements $a_0, a_1, a_2, a_3$, leaf $b$ consists of elements $b_0, b_1, b_2, b_3$ etc.
+
+![hash_merkle_tree](../../../../img/miden/vm/design/chiplets/hasher/hash_merkle_tree.png)
+
+If we wanted to verify that leaf $d$ is in fact in the tree, we'd need to compute the following hashes:
+
+$$
+r \leftarrow hash(e, hash(c, d))
+$$
+
+And if $r = g$, we can be convinced that $d$ is in fact in the tree at position $3$. Execution trace for this computation would look as illustrated below.
+
+![hash_merkle_tree_trace](../../../../img/miden/vm/design/chiplets/hasher/hash_merkle_tree_trace.png)
+
+In the above, the prover provides values for nodes $c$ and $e$ non-deterministically.
+
+### Update Merkle root
+
+Updating a node in a Merkle tree (which also updates the root of the tree) can be simulated by verifying two Merkle paths: the path that starts with the old leaf and the path that starts with the new leaf.
+
+Suppose we have the same Merkle tree as in the previous example, and we want to replace node $d$ with node $d'$. The computations we'd need to perform are:
+
+$$
+r \leftarrow hash(e, hash(c, d)) \\
+r' \leftarrow hash(e, hash(c, d'))
+$$
+
+Then, as long as $r = g$, and the same values were used for $c$ and $e$ in both computations, we can be convinced that the new root of the tree is $r'$.
+
+The chiplet accomplishes the above by executing the following instructions:
+
+```
+// verify the old merkle path
+[MV, HR]                    // init state and execute a hash round (concurrently)
+HR HR HR HR HR HR           // execute 6 more hash rounds
+MVA                         // copy result & absorb the next node into the state
+HR HR HR HR HR HR HR        // execute 7 hash rounds
+HOUT                        // return elements 4, 5, 6, 7 of the state as output
+
+// verify the new merkle path
+[MU, HR]                    // init state and execute a hash round (concurrently)
+HR HR HR HR HR HR           // execute 6 more hash rounds
+MUA                         // copy result & absorb the next node into the state
+HR HR HR HR HR HR HR        // execute 7 hash rounds
+HOUT                        // return elements 4, 5, 6, 7 of the state as output
+```
+
+The semantics of `MV` and `MU` instructions are similar to the semantics of `MP` instruction from the previous example (and `MVA` and `MUA` are similar to `MPA`) with one important difference: `MV*` instructions add the absorbed node (together with its index in the tree) to permutation column $p_1$, while `MU*` instructions remove the absorbed node (together with its index in the tree) from $p_1$. Thus, if the same nodes were used during both Merkle path verifications, the state of $p_1$ should not change. This mechanism is used to ensure that the same internal nodes were used in both computations.
+
+## AIR constraints
+
+When describing AIR constraints, we adopt the following notation: for column $x$, we denote the value in the current row simply as $x$, and the value in the next row of the column as $x'$. Thus, all transition constraints described in this note work with two consecutive rows of the execution trace.
+
+### Selector columns constraints
+
+For selector columns, first we must ensure that only binary values are allowed in these columns. This can be done with the following constraints:
+
+>$$
+s_0^2 - s_0 = 0 \text{ | degree} = 2 \\
+s_1^2 - s_1 = 0 \text{ | degree} = 2 \\
+s_2^2 - s_2 = 0 \text{ | degree} = 2
+$$
+
+Next, we need to make sure that unless $f_{out}=1$ or $f_{out}'=1$, the values in columns $s_1$ and $s_2$ are copied over to the next row. This can be done with the following constraints:
+
+>$$
+(s_1' - s_1) \cdot (1 - f_{out}') \cdot (1 - f_{out}) = 0  \text{ | degree} = 7 \\
+(s_2' - s_2) \cdot (1 - f_{out}') \cdot (1 - f_{out}) = 0  \text{ | degree} = 7
+$$
+
+Next, we need to enforce that if any of $f_{abp}, f_{mpa}, f_{mva}, f_{mua}$ flags is set to $1$, the next value of $s_0$ is $0$. In all other cases, $s_0$ should be unconstrained. These flags will only be set for rows that are 1 less than a multiple of 8 (the last row of each cycle). This can be done with the following constraint:
+
+>$$
+s_0' \cdot (f_{abp} + f_{mpa} + f_{mva} + f_{mua})= 0  \text{ | degree} = 5
+$$
+
+Lastly, we need to make sure that no invalid combinations of flags are allowed. This can be done with the following constraints:
+
+>$$
+k_0 \cdot (1 - s_0) \cdot s_1 = 0 \text{ | degree} = 3
+$$
+
+The above constraints enforce that on every step which is one less than a multiple of $8$, if $s_0 = 0$, then $s_1$ must also be set to $0$. Basically, if we set $s_0=0$, then we must make sure that either $f_{hout}=1$ or $f_{sout}=1$.
+
+### Node index constraints
+
+Node index column $i$ is relevant only for Merkle path verification and Merkle root update computations, but to simplify the overall constraint system, the same constraints will be imposed on this column for all computations.
+
+Overall, we want values in the index column to behave as follows:
+
+- When we start a new computation, we should be able to set $i$ to an arbitrary value.
+- When a computation is finished, value in $i$ must be $0$.
+- When we absorb a new node into the hasher state we must shift the value in $i$ by one bit to the right.
+- In all other cases value in $i$ should not change.
+
+A shift by one bit to the right can be described with the following equation: $i = 2 \cdot i' + b$, where $b$ is the value of the bit which is discarded. Thus, as long as $b$ is a binary value, the shift to the right is performed correctly, and this can be enforced with the following constraint:
+
+$$
+b^2 - b = 0
+$$
+
+Since we want to enforce this constraint only when a new node is absorbed into the hasher state, we'll define a flag for when this should happen as follows:
+
+$$
+f_{an} = f_{mp} + f_{mv} + f_{mu} + f_{mpa} + f_{mva} + f_{mua}
+$$
+
+And then the full constraint would look as follows:
+
+>$$
+f_{an} \cdot (b^2 - b) = 0  \text{ | degree} = 6
+$$
+
+Next, to make sure when a computation is finished $i=0$, we can use the following constraint:
+
+>$$
+f_{out} \cdot i = 0 \text{ | degree} = 4
+$$
+
+Finally, to make sure that the value in $i$ is copied over to the next row unless we are absorbing a new node or the computation is finished, we impose the following constraint:
+
+>$$
+(1 - f_{an} - f_{out}) \cdot (i' - i) = 0 \text{ | degree} = 5
+$$
+
+To satisfy these constraints for computations not related to Merkle paths (i.e., 2-to-1 hash and linear hash of elements), we can set $i = 0$ at the start of the computation. This guarantees that $i$ will remain $0$ until the end of the computation.
+
+### Hasher state constraints
+
+Hasher state columns $h_0, ..., h_{11}$ should behave as follows:
+
+- For the first $7$ rows of every $8$-row cycle (i.e., when $k_0=0$), we need to apply [Rescue Prime Optimized](https://eprint.iacr.org/2022/1577) round constraints to the hasher state. For brevity, we omit these constraints from this note.
+- On the $8$th row of every $8$-row cycle, we apply the constraints based on which transition flag is set as described in the table below.
+
+Specifically, when absorbing the next set of elements into the state during linear hash computation (i.e., $f_{abp} = 1$), the first $4$ elements (the capacity portion) are carried over to the next row. For $j \in [0, 4)$ this can be described as follows:
+
+>$$
+f_{abp} \cdot (h'_j - h_j) = 0 \text{ | degree} = 5
+$$
+
+When absorbing the next node during Merkle path computation (i.e., $f_{mp} + f_{mv} + f_{mu}=1$), the result of the previous hash ($h_4, ..., h_7$) is copied over either to $(h_4', ..., h_7')$ or to $(h_8', ..., h_{11}')$ depending on the value of $b$, which is defined in the same way as in the previous section. For $j \in [0, 4)$ this can be described as follows:
+
+>$$
+(f_{mp} + f_{mv} + f_{mu}) \cdot ((1 - b) \cdot (h_{j +4}' - h_{j+4}) + b \cdot (h_{j + 8}' - h_{j + 4})) = 0 \text{ | degree} = 6
+$$
+
+Note, that when a computation is completed (i.e., $f_{out}=1$), the next hasher state is unconstrained.
+
+### Multiset check constraints
+In this section we describe constraints which enforce updates for [multiset check columns](../lookups/multiset.md) $b_{chip}$ and $p_1$. These columns can be updated only on rows which are multiples of $8$ or $1$ less than a multiple of $8$. On all other rows the values in the columns remain the same.
+
+To simplify description of the constraints, we define the following variables. Below, we denote random values sent by the verifier after the prover commits to the main execution trace as $\alpha_0$, $\alpha_1$, $\alpha_2$ etc.
+
+$$
+m = op_{label} + 2^4 \cdot k_0 + 2^5 \cdot k_2 \\
+v_h = \alpha_0 + \alpha_1 \cdot m + \alpha_2 \cdot (clk + 1) + \alpha_3 \cdot i \\
+v_a = \sum_{j=0}^{3}(\alpha_{j+4} \cdot h_j) \\
+v_b = \sum_{j=4}^{7}(\alpha_{j+4} \cdot h_j) \\
+v_c = \sum_{j=8}^{11}(\alpha_{j+4} \cdot h_j) \\
+v_d = \sum_{j=8}^{11}(\alpha_j \cdot h_j) \\
+$$
+
+In the above:
+
+- $m$ is a _transition label_, composed of the [operation label](index.md#operation-labels) and the periodic columns that uniquely identify each transition function. The values in the $k_0$ and $k_2$ periodic columns are included to identify the row in the hash cycle where the operation occurs. They serve to differentiate between operations that share selectors but occur at different rows in the cycle, such as `BP`, which uses $op_{linhash}$ at the first row in the cycle to initiate a linear hash, and `ABP`, which uses $op_{linhash}$ at the last row in the cycle to absorb new elements.
+- $v_h$ is a _common header_ which is a combination of the transition label, a unique row address, and the node index. For the unique row address, the `clk` column from the system component is used, but we add $1$, because the system's `clk` column starts at $0$.
+- $v_a$, $v_b$, $v_c$ are the first, second, and third words (4 elements) of the hasher state.
+- $v_d$ is the third word of the hasher state but computed using the same $\alpha$ values as used for the second word. This is needed for computing the value of $v_{leaf}$ below to ensure that the same $\alpha$ values are used for the leaf node regardless of which part of the state the node comes from.
+
+#### Chiplets bus constraints
+
+As described previously, the [chiplets bus](index.md#chiplets-bus) $b_{chip}$, implemented as a running product column, is used to tie the hash chiplet with the main VM's stack and decoder. When receiving inputs from or returning results to the stack (or decoder), the hash chiplet multiplies $b_{chip}$ by their respective values. On the other side, when sending inputs to the hash chiplet or receiving results from the chiplet, the stack (or decoder) divides $b_{chip}$ by their values.
+
+In the section below we describe only the hash chiplet side of the constraints (i.e., multiplying $b_{chip}$ by relevant values). We define the values which are to be multiplied into $b_{chip}$ for each operation as follows:
+
+When starting a new simple or linear hash computation (i.e., $f_{bp}=1$) or when returning the entire state of the hasher ($f_{sout}=1$), the entire hasher state is included into $b_{chip}$:
+$$
+v_{all} = v_h + v_a + v_b + v_c
+$$
+
+When starting a Merkle path computation (i.e., $f_{mp} + f_{mv} + f_{mu} = 1$), we include the leaf of the path into $b_{chip}$. The leaf is selected from the state based on value of $b$ (defined as in the previous section):
+$$
+v_{leaf} = v_h + (1-b) \cdot v_b + b \cdot v_d
+$$
+
+When absorbing a new set of elements into the state while computing a linear hash (i.e., $f_{abp}=1$), we include deltas between the last $8$ elements of the hasher state (the rate) into $b_{chip}$:
+$$
+v_{abp} = v_h + v'_b + v'_c - (v_b + v_c)
+$$
+
+When a computation is complete (i.e., $f_{hout}=1$), we include the second word of the hasher state (the result) into $b_{chip}$:
+$$
+v_{res} = v_h + v_b
+$$
+
+Using the above values, we can describe the constraints for updating column $b_{chip}$ as follows.
+
+>$$
+b_{chip}' = b_{chip} \cdot ((f_{bp} + f_{sout}) \cdot v_{all} + (f_{mp} + f_{mv} + f_{mu}) \cdot v_{leaf} + f_{abp} \cdot v_{abp} + f_{hout} \cdot v_{res} + \\
+1 - (f_{bp} + f_{mp} + f_{mv} + f_{mu} + f_{abp} + f_{out}))
+$$
+
+The above constraint reduces to the following under various flag conditions:
+
+| Condition      | Applied constraint                    |
+| -------------- | ------------------------------------- |
+| $f_{bp} = 1$   | $b_{chip}' = b_{chip} \cdot v_{all}$  |
+| $f_{sout} = 1$ | $b_{chip}' = b_{chip} \cdot v_{all}$  |
+| $f_{mp} = 1$   | $b_{chip}' = b_{chip} \cdot v_{leaf}$ |
+| $f_{mv} = 1$   | $b_{chip}' = b_{chip} \cdot v_{leaf}$ |
+| $f_{mu} = 1$   | $b_{chip}' = b_{chip} \cdot v_{leaf}$ |
+| $f_{abp} = 1$  | $b_{chip}' = b_{chip} \cdot v_{abp}$  |
+| $f_{hout} = 1$ | $b_{chip}' = b_{chip} \cdot v_{res}$  |
+| Otherwise      | $b_{chip}' = b_{chip}$                |
+
+Note that the degree of the above constraint is $7$.
+
+#### Sibling table constraints
+
+*Note: Although this table is described independently, it is implemented as part of the [chiplets virtual table](index.md#chiplets-virtual-table), which combines all virtual tables required by any of the chiplets into a single master table.*
+
+As mentioned previously, the sibling table (represented by running column $p_1$) is used to keep track of sibling nodes used during Merkle root update computations. For this computation, we need to enforce the following rules:
+* When computing the old Merkle root, whenever a new sibling node is absorbed into the hasher state (i.e., $f_{mv} + f_{mva} = 1$), an entry for this sibling should be included into $p_1$.
+* When computing the new Merkle root, whenever a new sibling node is absorbed into the hasher state (i.e., $f_{mu} + f_{mua} = 1$), the entry for this sibling should be removed from $p_1$.
+
+To simplify the description of the constraints, we use variables $v_b$ and $v_c$ defined above and define the value representing an entry in the sibling table as follows:
+$$
+v_{sibling} = \alpha_0 + \alpha_3 \cdot i + b \cdot v_b + (1-b) \cdot v_c
+$$
+
+Using the above value, we can define the constraint for updating $p_1$ as follows:
+
+>$$
+p_1' \cdot \left( (f_{mv} + f_{mva}) \cdot v_{sibling} + 1 - (f_{mv} + f_{mva}) \right) = \\
+p_1 \cdot \left( (f_{mu} + f_{mua}) \cdot v_{sibling} + 1 - (f_{mu} + f_{mua}) \right)
+$$
+
+The above constraint reduces to the following under various flag conditions:
+
+| Condition      | Applied constraint             |
+| -------------- | ------------------------------ |
+| $f_{mv} = 1$   | $p_1' \cdot v_{sibling} = p_1$ |
+| $f_{mva} = 1$  | $p_1' \cdot v_{sibling} = p_1$ |
+| $f_{mu} = 1$   | $p_1' = p_1 \cdot v_{sibling}$ |
+| $f_{mua} = 1$  | $p_1' = p_1 \cdot v_{sibling}$ |
+| Otherwise      | $p_1' = p_1$                   |
+
+Note that the degree of the above constraint is $7$.
+
+To make sure computation of the old Merkle root is immediately followed by the computation of the new Merkle root, we impose the following constraint:
+
+>$$
+(f_{bp} + f_{mp} + f_{mv}) \cdot (1 - p_1) = 0 \text{ | degree} = 5
+$$
+
+The above means that whenever we start a new computation which is not the computation of the new Merkle root, the sibling table must be empty. Thus, after the hash chiplet computes the old Merkle root, the only way to clear the table is to compute the new Merkle root.
+
+Together with boundary constraints enforcing that $p_1=1$ at the first and last rows of the running product column which implements the sibling table, the above constraints ensure that if a node was included into $p_1$ as a part of computing the old Merkle root, the same node must be removed from $p_1$ as a part of computing the new Merkle root. These two boundary constraints are described as part of the [chiplets virtual table constraints](index.md#chiplets-virtual-table-constraints).
diff --git a/docs/miden/vm/design/chiplets/index.md b/docs/miden/vm/design/chiplets/index.md
new file mode 100644
index 000000000..835ca33f1
--- /dev/null
+++ b/docs/miden/vm/design/chiplets/index.md
@@ -0,0 +1,200 @@
+The chiplets module contains specialized components dedicated to accelerating complex computations. Each chiplet specializes in executing a specific type of computation and is responsible for proving both the correctness of its computations and its own internal consistency.
+
+Currently, Miden VM relies on 4 chiplets:
+
+- The [hash chiplet](hasher.md) (also referred to as the Hasher), used to compute Rescue Prime Optimized hashes both for sequential hashing and for Merkle tree hashing.
+- The [bitwise chiplet](bitwise.md), used to compute bitwise operations (e.g., `AND`, `XOR`) over 32-bit integers.
+- The [memory chiplet](memory.md), used to support random-access memory in the VM.
+- The [kernel ROM chiplet](kernel_rom.md), used to enable executing kernel procedures during the [`SYSCALL` operation](../programs.md#syscall-block).
+
+Each chiplet executes its computations separately from the rest of the VM and proves the internal correctness of its execution trace in a unique way that is specific to the operation(s) it supports. These methods are described by each chiplet’s documentation.
+
+## Chiplets module trace
+
+The execution trace of the Chiplets module is generated by stacking the execution traces of each of its chiplet components. Because each chiplet is expected to generate significantly fewer trace rows than the other VM components (i.e., the decoder, stack, and range checker), stacking them enables the same functionality without adding as many columns to the execution trace.
+
+Each chiplet is identified within the Chiplets module by one or more chiplet selector columns which cause its constraints to be selectively applied.
+
+The result is an execution trace of 17 trace columns, which allows space for the widest chiplet component (the hash chiplet) and a column to select for it.
+
+![chiplets](../../../../img/miden/vm/design/chiplets/chiplets.png)
+
+During the finalization of the overall execution trace, the chiplets' traces (including internal selectors) are appended to the trace of the Chiplets module one after another, as pictured. Thus, when one chiplet's trace ends, the trace of the next chiplet starts in the subsequent row.
+
+Additionally, a padding segment is added to the end of the Chiplets module's trace so that the number of rows in the table always matches the overall trace length of the other VM processors, regardless of the length of the chiplet traces. The padding will simply contain zeroes.
+
+### Chiplets order
+
+The order in which the chiplets are stacked is determined by the requirements of each chiplet, including the width of its execution trace and the degree of its constraints.
+
+For simplicity, all of the "cyclic" chiplets which operate in multi-row cycles and require starting at particular row increments should come before any non-cyclic chiplets, and these should be ordered from longest-cycle to shortest-cycle. This avoids any additional alignment padding between chiplets.
+
+After that, chiplets are ordered by degree of constraints so that higher-degree chiplets get lower-degree chiplet selector flags.
+
+The resulting order is as follows:
+
+| Chiplet         | Cycle Length | Internal Degree | Chiplet Selector Degree | Total Degree | Columns | Chiplet Selector Flag |
+| --------------- | :----------: | :-------------: | :---------------------: | :----------: | :-----: | --------------------- |
+| Hash chiplet    |      8       |        8        |            1            |       9      |   17    | $\{0\}$               |
+| Bitwise chiplet |      8       |        3        |            2            |       5      |   13    | $\{1, 0\}$            |
+| Memory          |      -       |        6        |            3            |       9      |   12    | $\{1, 1, 0\}$         |
+| Kernel ROM      |      -       |        2        |            4            |       6      |   6     | $\{1, 1, 1, 0\}$      |
+| Padding         |      -       |        -        |            -            |       -      |   -     | $\{1, 1, 1, 1\}$      |
+
+### Additional requirements for stacking execution traces
+
+Stacking the chiplets introduces one new complexity. Each chiplet proves its own correctness with its own set of internal transition constraints, many of which are enforced between each row in its trace and the next row. As a result, when the chiplets are stacked, transition constraints applied to the final row of one chiplet will cause a conflict with the first row of the following chiplet.
+
+This is true for any transition constraints which are applied at every row and selected by a `Chiplet Selector Flag` for the current row. (Therefore cyclic transition constraints controlled by periodic columns do not cause an issue.)
+
+This requires the following adjustments for each chiplet.
+
+**In the hash chiplet:** there is no conflict, and therefore no change, since all constraints are periodic.
+
+**In the bitwise chiplet:** there is no conflict, and therefore no change, since all constraints are periodic.
+
+**In the memory chiplet:** all transition constraints cause a conflict. To adjust for this, the selector flag for the memory chiplet is designed to exclude its last row. Thus, memory constraints will not be applied when transitioning from the last row of the memory chiplet to the subsequent row. This is achieved without any additional increase in the degree of constraints by using $s'_2$ as a selector instead of $s_2$ as seen [below](#chiplet-constraints).
+
+**In the kernel ROM chiplet:** the transition constraints applied to the $addr$ column cause a conflict. It is resolved by using a virtual flag to exclude the last row, which increases the degree of these constraints to $3$.
+
+## Operation labels
+
+Each operation supported by the chiplets is given a unique identifier to ensure that the requests and responses sent to the [chiplets bus](#chiplets-bus) ($b_{chip}$) are indeed processed by the intended chiplet for that operation and that chiplets which support more than one operation execute the correct one.
+
+The labels are composed from the flag values of the chiplet selector(s) and internal operation selectors (if applicable). The unique label of the operation is computed as the binary aggregation of the combined selectors plus $1$. Note that the combined flag is represented in big-endian, so the bit representation below is reversed.
+
+| Operation              | Chiplet Selector Flag | Internal Selector Flag | Combined Flag    | Label |
+| ---------------------- | --------------------- | :--------------------: | ---------------- | :---: |
+| `HASHER_LINEAR_HASH`   |        $\{0\}$        |     $\{1, 0, 0\}$      | $\{0, 1, 0, 0\}$ |   3   |
+| `HASHER_MP_VERIFY`     |        $\{0\}$        |     $\{1, 0, 1\}$      | $\{0, 1, 0, 1\}$ |  11   |
+| `HASHER_MR_UPDATE_OLD` |        $\{0\}$        |     $\{1, 1, 0\}$      | $\{0, 1, 1, 0\}$ |   7   |
+| `HASHER_MR_UPDATE_NEW` |        $\{0\}$        |     $\{1, 1, 1\}$      | $\{0, 1, 1, 1\}$ |  15   |
+| `HASHER_RETURN_HASH`   |        $\{0\}$        |     $\{0, 0, 0\}$      | $\{0, 0, 0, 0\}$ |   1   |
+| `HASHER_RETURN_STATE`  |        $\{0\}$        |     $\{0, 0, 1\}$      | $\{0, 0, 0, 1\}$ |   9   |
+| `BITWISE_AND`          |      $\{1, 0\}$       |       $\{0\}$          | $\{1, 0, 0\}$    |   2   |
+| `BITWISE_XOR`          |      $\{1, 0\}$       |       $\{1\}$          | $\{1, 0, 1\}$    |   6   |
+| `MEMORY_READ`          |     $\{1, 1, 0\}$     |       $\{1\}$          | $\{1, 1, 0, 1\}$ |  12   |
+| `MEMORY_WRITE`         |     $\{1, 1, 0\}$     |       $\{0\}$          | $\{1, 1, 0, 0\}$ |   4   |
+| `KERNEL_PROC_CALL`     |    $\{1, 1, 1, 0\}$   |                        | $\{1, 1, 1, 0\}$ |   8   |
+
+## Chiplets module constraints
+
+### Chiplet constraints
+
+Each chiplet's internal constraints are defined in the documentation for the individual chiplets. To ensure that constraints are only ever selected for one chiplet at a time, the module's selector columns $s_0, s_1, s_2, s_3$ are combined into flags. Each chiplet's internal constraints are multiplied by its chiplet selector flag, and the degree of each constraint is correspondingly increased.
+
+This gives the following sets of constraints:
+
+> $$
+(1 - s_0) \cdot c_{hash} = 0 \text{ | degree} = 1 + \deg(c_{hash})
+$$
+
+> $$
+s_0 \cdot (1 - s_1) \cdot c_{bitwise} = 0 \text{ | degree} = 2 + \deg(c_{bitwise})
+$$
+
+> $$
+s_0 \cdot s_1 \cdot (1 - s'_2) \cdot c_{memory} = 0 \text{ | degree} = 3 + \deg(c_{memory})
+$$
+
+> $$
+s_0 \cdot s_1 \cdot (s_2) \cdot (1 - s'_3) \cdot c_{krom} = 0 \text{ | degree} = 4 + \deg(c_{krom})
+$$
+
+
+In the above:
+- $c_{hash}, c_{bitwise}, c_{memory}, c_{krom}$ each represent an internal constraint from the indicated chiplet.
+- $\deg(c)$ indicates the degree of the specified constraint.
+- flags are applied in a like manner for all internal constraints in each respective chiplet.
+- the selector for the memory chiplet excludes the last row of the chiplet (as discussed [above](#additional-requirements-for-stacking-execution-traces)).
+
+### Chiplet selector constraints
+
+We also need to ensure that the chiplet selector columns are set correctly. Although there are four columns for chiplet selectors, the stacked trace design means that they do not all act as selectors for the entire trace. Thus, selector constraints should only be applied to selector columns when they are acting as selectors.
+
+- $s_0$ acts as a selector for the entire trace.
+- $s_1$ acts as a selector column when $s_0 = 1$.
+- $s_2$ acts as a selector column when $s_0 = 1$ and $s_1 = 1$.
+- $s_3$ acts as a selector column when $s_0 = 1$, $s_1 = 1$, and $s_2 = 1$.
+
+Two conditions must be enforced for columns acting as chiplet selectors.
+
+1. When acting as a selector, the value in the selector column must be binary.
+2. When acting as a selector, the value in the selector column may only change from $0 \rightarrow 1$.
+
+The following constraints ensure that selector values are binary.
+
+> $$
+s_0^2 - s_0 = 0 \text{ | degree} = 2 \\
+s_0 \cdot (s_1^2 - s_1) = 0 \text{ | degree} = 3 \\
+s_0 \cdot s_1 \cdot (s_2^2 - s_2) = 0 \text{ | degree} = 4 \\
+s_0 \cdot s_1 \cdot s_2 \cdot (s_3^2 - s_3) = 0 \text{ | degree} = 5
+$$
+
+The following constraints ensure that the chiplets are stacked correctly by restricting selector values so they can only change from $0 \rightarrow 1$.
+
+> $$
+s_0 \cdot (s_0 - s'_0) = 0 \text{ | degree} = 2 \\
+s_0 \cdot s_1 \cdot (s_1 - s'_1) = 0 \text{ | degree} = 3 \\
+s_0 \cdot s_1 \cdot s_2 \cdot (s_2 - s'_2) = 0 \text{ | degree} = 4 \\
+s_0 \cdot s_1 \cdot s_2 \cdot s_3 \cdot (s_3 - s'_3) = 0 \text{ | degree} = 5
+$$
+
+In other words, the above constraints enforce that if a selector is $0$ in the current row, then it must be either $0$ or $1$ in the next row; if it is $1$ in the current row, it must be $1$ in the next row.
+
+## Chiplets bus
+
+The chiplets must be explicitly connected to the rest of the VM in order for it to use their operations. This connection must prove that all specialized operations which a given VM component claimed to offload to one of the chiplets were in fact executed by the correct chiplet with the same set of inputs and outputs as those used by the offloading component.
+
+This is achieved via a [bus](../lookups/index.md#communication-buses-in-miden-vm) called $b_{chip}$ where a request can be sent to any chiplet and a corresponding response will be sent back by that chiplet.
+
+The bus is implemented as a single [running product column](../lookups/multiset.md) where:
+
+- Each request is “sent” by computing an operation-specific lookup value from an [operation-specific label](#operation-labels), the operation inputs, and the operation outputs, and then dividing it out of the $b_{chip}$ running product column.
+- Each chiplet response is “sent” by computing the same operation-specific lookup value from the label, inputs, and outputs, and then multiplying it into the $b_{chip}$ running product column.
+
+Thus, if the requests and responses match, then the bus column $b_{chip}$ will start and end with the value $1$. This condition is enforced by boundary constraints on the $b_{chip}$ column.
+
+Note that the order of the requests and responses does not matter, as long as they are all included in $b_{chip}$. In fact, requests and responses for the same operation will generally occur at different cycles.
+
+### Chiplets bus constraints
+
+The chiplets bus constraints are defined by the components that use it to communicate.
+
+Lookup requests are sent to the chiplets bus by the following components:
+
+- The stack sends requests for [bitwise](../stack/u32-ops.md#u32and), [memory](../stack/io-ops.md#memory-access-operations), and [cryptographic hash operations](../stack/crypto-ops.md).
+- The decoder sends requests for [hash operations](../decoder/index.md#program-block-hashing) for program block hashing.
+- The decoder sends a procedure access request to the [Kernel ROM chiplet](./kernel_rom.md) for each `SYSCALL` during [program block hashing](../decoder/index.md#program-block-hashing).
+
+Responses are provided by the [hash](hasher.md#chiplets-bus-constraints), [bitwise](bitwise.md#chiplets-bus-constraints), [memory](memory.md#chiplets-bus-constraints), and [kernel ROM](kernel_rom.md#chiplets-bus-constraints) chiplets.
+
+## Chiplets virtual table
+
+Some chiplets require the use of a [virtual table](../lookups/multiset.md#virtual-tables) to maintain and enforce the correctness of their internal state. Because the length of these virtual tables does not exceed the length of the chiplets themselves, a single virtual table called $vt_{chip}$ can be shared by all chiplets.
+
+Currently, the chiplets virtual table combines two virtual tables:
+- the hash chiplet's [sibling table](hasher.md#sibling-table-constraints)
+- the kernel ROM chiplet's [kernel procedure table](kernel_rom.md#kernel-procedure-table-constraints)
+
+To combine these correctly, the [running product column](../lookups/multiset.md) for this table must be constrained not only at the beginning and the end of the trace, but also where the hash chiplet ends and where the kernel ROM chiplet begins. These positions can be identified using the chiplet selector columns.
+
+### Chiplets virtual table constraints
+
+The expected boundary values for each chiplet's portion of the virtual table must be enforced. This can be done as follows.
+
+For the sibling table to be properly constrained, the value of the running product column must be $1$ when the sibling table starts and finishes. This can be achieved by:
+- enforcing a boundary constraint for $vt_{chip}=1$ at the first row
+- using the following transition constraint to enforce that the value is once again $1$ at the last cycle of the hash chiplet.
+
+> $$
+(s'_0 - s_0) \cdot (1 - vt_{chip}) = 0 \text{ | degree} = 2
+$$
+
+For the kernel procedure table to be properly constrained, the value must be $1$ when it starts, and it must be equal to the product of all of the kernel ROM procedures when it finishes. This can be achieved by:
+- enforcing a boundary constraint against the last row for the value of all of the kernel ROM procedures
+- using the following transition constraint to enforce that when the active chiplet changes to the kernel ROM chiplet the value is $1$.
+
+> $$
+s_0 \cdot s_1 \cdot (s'_2 - s_2) \cdot (1 - vt'_{chip}) = 0 \text{ | degree} = 4
+$$
\ No newline at end of file
diff --git a/docs/miden/vm/design/chiplets/kernel_rom.md b/docs/miden/vm/design/chiplets/kernel_rom.md
new file mode 100644
index 000000000..c29d7185d
--- /dev/null
+++ b/docs/miden/vm/design/chiplets/kernel_rom.md
@@ -0,0 +1,95 @@
+The kernel ROM enables executing predefined kernel procedures. These procedures are always executed in the root context and can only be accessed by a `SYSCALL` operation. The chiplet tracks and enforces correctness of all kernel procedure calls as well as maintaining a list of all the procedures defined for the kernel, whether they are executed or not. More background about Miden VM execution contexts can be found [here](../../user-docs/assembly/execution-contexts.md).
+
+## Kernel ROM trace
+
+The kernel ROM table consists of 6 columns.
+
+![kernel_rom_execution_trace](../../../../img/miden/vm/design/chiplets/kernel_rom/kernel_rom_execution_trace.png)
+
+The meaning of columns in the above is as follows:
+
+- Column $s_0$ specifies whether the value in the row should be included into the chiplets bus $b_{chip}$.
+- $addr$ is a row address column which starts out at $0$ and must either remain the same or be incremented by $1$ with every row.
+- $r_0, ..., r_3$ contain the roots of the kernel functions. The values in these columns can change only when the value in the $addr$ column changes. If the $addr$ column remains the same, the values in the $r$ columns must also remain the same.
+
+## Constraints
+
+The following constraints are required to enforce correctness of the kernel ROM trace.
+
+For convenience, let's define $\Delta addr = addr' - addr$.
+
+The $s_0$ column must be binary.
+
+> $$
+s_0^2 - s_0 = 0 \text{ | degree} = 2
+$$
+
+The value in the $addr$ column must either stay the same or increase by $1$.
+
+> $$
+\Delta addr \cdot (1 - \Delta addr) = 0 \text{ | degree} = 2
+$$
+
+Finally, if the $addr$ column stays the same then the kernel procedure root must not change. This can be achieved by enforcing the following constraint against each of the four procedure root columns:
+
+> $$
+(1 - \Delta addr) \cdot (r_i' - r_i) = 0 \text{ | degree} = 2
+$$
+
+These constraints on $addr$ should not be applied to the very last row of the kernel ROM's execution trace, since we do not want to enforce a value that would conflict with the first row of a subsequent chiplet (or padding). Therefore we can create a special virtual flag for this constraint using the $chip\_s_3$ selector column from the [chiplets](index.md) module that selects for the kernel ROM chiplet.
+
+The modified constraints which should be applied are the following:
+
+>$$
+(1 - chip\_s_3') \cdot \Delta addr \cdot (1 - \Delta addr) = 0 \text{ | degree} = 3
+$$
+
+>$$
+(1 - chip\_s_3') \cdot (1 - \Delta addr) \cdot (r_i' - r_i) = 0 \text{ | degree} = 3
+$$
+
+!!! note
+    These constraints should also be multiplied by the chiplets module's selector flag for the kernel ROM chiplet, as is true for all constraints in this chiplet.
+
+## Chiplets bus constraints
+
+The chiplets bus is used to keep track of all kernel function calls. To simplify the notation for describing kernel ROM constraints on the chiplets bus, we'll first define variable $v$, which represents how each kernel procedure in the kernel ROM's execution trace is reduced to a single value. Denoting the random values received from the verifier as $\alpha_0, \alpha_1$, etc., this can be achieved as follows.
+
+$$
+v = \alpha_0 + \alpha_1 \cdot op_{krom} + \sum_{i=0}^3 (\alpha_{i + 2} \cdot r_i)
+$$
+
+Where, $op_{krom}$ is the unique [operation label](./index.md#operation-labels) of the kernel procedure call operation.
+
+The request side of the constraint for the operation is enforced during program block hashing of the [`SYSCALL` operation](../decoder/constraints.md#block-hash-computation-constraints).
+
+To provide accessed kernel procedures to the chiplets bus, we must send the kernel procedure to the bus every time it is called, which is indicated by the $s_0$ column.
+
+> $$
+b'_{chip} = b_{chip} \cdot (s_0 \cdot v + 1 - s_0) \text{ | degree} = 3
+$$
+
+Thus, when $s_0 = 0$ this reduces to $b'_{chip} = b_{chip}$, but when $s_0=1$ it becomes $b'_{chip} = b_{chip} \cdot v$.
+
+## Kernel procedure table constraints
+
+!!! note
+    Although this table is described independently, it is implemented as part of the [chiplets virtual table](../chiplets/index.md#chiplets-virtual-table), which combines all virtual tables required by any of the chiplets into a single master table.
+
+This kernel procedure table keeps track of all *unique* kernel function roots. The values in this table will be updated only when the value in the address column changes.
+
+The row value included into $vt_{chip}$ is:
+
+$$
+v = \alpha_0 + \alpha_1 \cdot addr + \sum_{i=0}^3 (\alpha_{i + 2} \cdot r_i)
+$$
+
+The constraint against $vt_{chip}$ is:
+
+> $$
+vt_{chip}' = vt_{chip} \cdot (\Delta addr \cdot v + 1 - \Delta addr) \text{ | degree} = 3
+$$
+
+Thus, when $\Delta addr = 0$, the above reduces to $vt'_{chip}=vt_{chip}$, but when $\Delta addr = 1$, the above becomes $vt'_{chip} = vt_{chip} \cdot v$.
+
+We also need to impose boundary constraints to make sure that the running product column implementing the kernel procedure table is equal to $1$ when the kernel procedure table begins and to the product of all unique kernel functions when it ends. The last boundary constraint means that the verifier only needs to know which kernel was used, but doesn't need to know which functions were invoked within the kernel. These two constraints are described as part of the [chiplets virtual table constraints](../chiplets/index.md#chiplets-virtual-table-constraints).
\ No newline at end of file
diff --git a/docs/miden/vm/design/chiplets/memory.md b/docs/miden/vm/design/chiplets/memory.md
new file mode 100644
index 000000000..fc9341e67
--- /dev/null
+++ b/docs/miden/vm/design/chiplets/memory.md
@@ -0,0 +1,286 @@
+# Memory chiplet
+
+Miden VM supports linear read-write random access memory. This memory is word-addressable, meaning, four values are located at each address, and we can read and write values to/from memory in batches of four. Each value is a field element in a $64$-bit prime field with modulus $2^{64} - 2^{32} + 1$. Memory address can be any field element.
+
+In this note we describe the rationale for selecting the above design and describe AIR constraints needed to support it.
+
+The design makes extensive use of $16$-bit range checks. An efficient way of implementing such range checks is described [here](../range.md).
+
+## Alternative designs
+
+The simplest (and most efficient) alternative to the above design is contiguous write-once memory. To support such memory, we need to allocate just two trace columns as illustrated below.
+
+![memory_alternative_design](../../../../img/miden/vm/design/chiplets/memory/memory_alternative_design.png)
+
+In the above, `addr` column holds memory address, and `value` column holds the field element representing the value stored at this address. Notice that some rows in this table are duplicated. This is because we need one row per memory access (either read or write operation). In the example above, value $b$ was first stored at memory address $1$, and then read from this address.
+
+The AIR constraints for this design are very simple. First, we need to ensure that values in the `addr` column either remain the same or are incremented by $1$ as we move from one row to the next. This can be achieved with the following constraint:
+
+$$
+(a' - a) \cdot (a' - a - 1) = 0
+$$
+
+where $a$ is the value in `addr` column in the current row, and $a'$ is the value in this column in the next row.
+
+Second, we need to make sure that if the value in the `addr` column didn't change, the value in the `value` column also remained the same (i.e., a value stored in a given address can only be set once). This can be achieved with the following constraint:
+
+$$
+(v' - v) \cdot (a' - a - 1) = 0
+$$
+
+where $v$ is the value in `value` column at the current row, and $v'$ is the value in this column in the next row.
+
+As mentioned above, this approach is very efficient: each memory access requires just $2$ trace cells.
+
+### Read-write memory
+
+Write-once memory is tricky to work with, and many developers may need to climb a steep learning curve before they become comfortable working in this model. Thus, ideally, we'd want to support read-write memory. To do this, we need to introduce additional columns as illustrated below.
+
+![memory_read_write](../../../../img/miden/vm/design/chiplets/memory/memory_read_write.png)
+
+In the above, we added `clk` column, which keeps track of the clock cycle at which memory access happened. We also need to differentiate between memory reads and writes. To do this, we now use two columns to keep track of the value: `old val` contains the value stored at the address before the operation, and `new val` contains the value after the operation. Thus, if `old val` and `new val` are the same, it was a read operation. If they are different, it was a write operation.
+
+The AIR constraints needed to support the above structure are as follows.
+
+We still need to make sure memory addresses are contiguous:
+
+$$
+(a' - a) \cdot (a' - a - 1) = 0
+$$
+
+Whenever memory address changes, we want to make sure that `old val` is set to $0$ (i.e., our memory is always initialized to $0$). This can be done with the following constraint:
+
+$$
+(a' - a) \cdot v_{old}' = 0
+$$
+
+On the other hand, if memory address doesn't change, we want to make sure that `new val` in the current row is the same as `old val` in the next row. This can be done with the following constraint:
+
+$$
+(1 + a - a') \cdot (v_{new} - v_{old}') = 0
+$$
+
+Lastly, we need to make sure that for the same address values in `clk` column are always increasing. One way to do this is to perform a $16$-bit range check on the value of $(i' - i - 1)$, where $i$ is the reference to `clk` column. However, this would mean that memory operations involving the same address must happen within $65536$ VM cycles from each other. This limitation would be difficult to enforce statically. To remove this limitation, we need to add two more columns as shown below:
+
+![memory_limitation_diagram](../../../../img/miden/vm/design/chiplets/memory/memory_limitation_diagram.png)
+
+In the above, column `d0` contains the lower $16$ bits of $(i' - i - 1)$ while `d1` contains the upper $16$ bits. The constraint needed to enforce this is as follows:
+
+$$
+(1 + a - a') \cdot ((i' - i - 1) - (2^{16} \cdot d_1' + d_0')) = 0
+$$
+
+Additionally, we need to apply $16$-bit range checks to columns `d0` and `d1`.
+
+Overall, the cost of reading or writing a single element is now $6$ trace cells and $2$ $16$-bit range-checks.
+
+### Non-contiguous memory
+
+Requiring that memory addresses are contiguous may also be a difficult limitation to impose statically. To remove this limitation, we need to introduce one more column as shown below:
+
+![memory_non_contiguous_memory](../../../../img/miden/vm/design/chiplets/memory/memory_non_contiguous_memory.png)
+
+In the above, the prover sets the value in the new column `t` to $0$ when the address doesn't change, and to $1 / (a' - a)$ otherwise. To simplify constraint description, we'll define variable $n$ computed as follows:
+
+$$
+n = (a' - a) \cdot t'
+$$
+
+Then, to make sure the prover sets the value of $t$ correctly, we'll impose the following constraints:
+
+$$
+n^2 - n = 0 \\
+(1 - n) \cdot  (a' - a) = 0
+$$
+
+The above constraints ensure that $n=1$ whenever the address changes, and $n=0$ otherwise. We can then define the following constraints to make sure values in columns `d0` and `d1` contain either the delta between addresses or between clock cycles.
+
+| Condition | Constraint                                      | Comments                                                                                                                                   |
+| --------- | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| $n=1$     | $(a' - a) - (2^{16} \cdot d_1' + d_0') = 0$     | When the address changes, columns `d0` and `d1` at the next row should contain the delta between the old and the new address.              |
+| $n=0$     | $(i' - i - 1) - (2^{16} \cdot d_1' + d_0') = 0$ | When the address remains the same, columns `d0` and `d1` at the next row should contain the delta between the old and the new clock cycle. |
+
+We can combine the above constraints as follows:
+
+$$
+\left(n \cdot (a' - a) + (1 - n) \cdot (i' - i - 1)\right) - (2^{16} \cdot d_1' + d_0') = 0
+$$
+
+The above constraint, in combination with $16$-bit range checks against columns `d0` and `d1` ensure that values in `addr` and `clk` columns always increase monotonically, and also that column `addr` may contain duplicates, while values in `clk` column must be unique for a given address.
+
+### Context separation
+
+In many situations it may be desirable to assign memories to different contexts. For example, when making cross-contract calls, the memories of the caller and the callee should be separate. That is, the caller should not be able to access the memory of the callee and vice-versa.
+
+To accommodate this feature, we need to add one more column as illustrated below.
+
+![memory_context_separation](../../../../img/miden/vm/design/chiplets/memory/memory_context_separation.png)
+
+This new column `ctx` should behave similarly to the address column: values in it should increase monotonically, and there could be breaks between them. We also need to change how the prover populates column `t`:
+
+- If the context changes, `t` should be set to the inverse of $(c' - c)$, where $c$ is a reference to column `ctx`.
+- If the context remains the same but the address changes, column `t` should be set to the inverse of $(a' - a)$.
+- Otherwise, column `t` should be set to $0$.
+
+To simplify the description of constraints, we'll define two variables $n_0$ and $n_1$ as follows:
+
+$$
+n_0 = (c' - c) \cdot t' \\
+n_1 = (a' - a) \cdot t'
+$$
+
+Thus, $n_0 = 1$ when the context changes, and $0$ otherwise. Also, $(1 - n_0) \cdot n_1 = 1$ when context remains the same and address changes, and $0$ otherwise.
+
+To make sure the prover sets the value of column `t` correctly, we'll need to impose the following constraints:
+
+$$
+n_0^2 - n_0 = 0 \\
+(1 - n_0) \cdot  (c' - c) = 0 \\
+(1 - n_0) \cdot (n_1^2 - n_1) = 0 \\
+(1 - n_0) \cdot (1 - n_1) \cdot (a' - a) = 0
+$$
+
+We can then define the following constraints to make sure values in columns `d0` and `d1` contain the delta between contexts, between addresses, or between clock cycles.
+
+| Condition            | Constraint                                      | Comments                                                                                                                                                         |
+| -------------------- | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| $n_0=1$              | $(c' - c) - (2^{16} \cdot d_1' + d_0') = 0$     | When the context changes, columns `d0` and `d1` at the next row should contain the delta between the old and the new contexts.                                   |
+| $n_0=0$ <br> $n_1=1$ | $(a' - a) - (2^{16} \cdot d_1' + d_0') = 0$     | When the context remains the same but the address changes, columns `d0` and `d1` at the next row should contain the delta between the old and the new addresses. |
+| $n_0=0$ <br> $n_1=0$ | $(i' - i - 1) - (2^{16} \cdot d_1' + d_0') = 0$ | When both the context and the address remain the same, columns `d0` and `d1` at the next row should contain the delta between the old and the new clock cycle.   |
+
+We can combine the above constraints as follows:
+
+$$
+\left(n_0 \cdot (c' - c) + (1 - n_0) \cdot \left(n_1 \cdot (a' - a) + (1 - n_1) \cdot (i' - i - 1) \right) \right) - (2^{16} \cdot d_1' + d_0') = 0
+$$
+
+The above constraint, in combination with $16$-bit range checks against columns `d0` and `d1` ensure that values in `ctx`, `addr`, and `clk` columns always increase monotonically, and also that columns `ctx` and `addr` may contain duplicates, while the values in column `clk` must be unique for a given combination of `ctx` and `addr`.
+
+Notice that the above constraint has degree $5$.
+
+## Miden approach
+
+While the approach described above works, it comes at significant cost. Reading or writing a single value requires $8$ trace cells and $2$ $16$-bit range checks. Assuming a single range check requires roughly $2$ trace cells, the total number of trace cells needed grows to $12$. This is about $6$x worse than the simple contiguous write-once memory described earlier.
+
+Miden VM frequently needs to deal with batches of $4$ field elements, which we call _words_. For example, the output of Rescue Prime Optimized hash function is a single word. A single 256-bit integer value can be stored as two words (where each element contains one $32$-bit value). Thus, we can optimize for this common use case by making the memory _word-addressable_. That is, $4$ field elements are located at each memory address, and we can read and write elements to/from memory in batches of four.
+
+The layout of Miden VM memory table is shown below:
+
+![memory_miden_vm_layout](../../../../img/miden/vm/design/chiplets/memory/memory_miden_vm_layout.png)
+
+where:
+
+- `s0` is a selector column which is set to $1$ for read operations and $0$ for write operations.
+- `s1` is a selector column which is set to $1$ when previously accessed memory is being read and $0$ otherwise. In other words, it is set to $1$ only when the context and address are the same as they were in the previous row and the `s0` operation selector is set to $1$ (indicating a read).
+- `ctx` contains context ID. Values in this column must increase monotonically but there can be gaps between two consecutive values of up to $2^{32}$. Also, two consecutive values can be the same. In AIR constraint description below, we refer to this column as $c$.
+- `addr` contains memory address. Values in this column must increase monotonically for a given context but there can be gaps between two consecutive values of up to $2^{32}$. Also, two consecutive values can be the same. In AIR constraint description below, we refer to this column as $a$.
+- `clk` contains clock cycle at which the memory operation happened. Values in this column must increase monotonically for a given context and memory address but there can be gaps between two consecutive values of up to $2^{32}$. In AIR constraint description below, we refer to this column as $i$.
+- `v0, v1, v2, v3` columns contain field elements stored at a given context/address/clock cycle after the memory operation.
+- Columns `d0` and `d1` contain lower and upper $16$ bits of the delta between two consecutive context IDs, addresses, or clock cycles. Specifically:
+  - When the context changes, these columns contain $(c' - c)$.
+  - When the context remains the same but the address changes, these columns contain $(a' - a)$.
+  - When both the context and the address remain the same, these columns contain $(i' - i - 1)$.
+- Column `t` contains the inverse of the delta between two consecutive context IDs, addresses, or clock cycles. Specifically:
+  - When the context changes, this column contains the inverse of $(c' - c)$.
+  - When the context remains the same but the address changes, this column contains the inverse of $(a' - a)$.
+  - When both the context and the address remain the same, this column contains the inverse of $(i' - i - 1)$.
+
+For every memory access operation (i.e., read or write), a new row is added to the memory table. For read operations, `s0` is set to $1$. If neither `ctx` nor `addr` have changed, then `s1` is set to $1$ and the `v` columns are set to equal the values from the previous row. If `ctx` or `addr` have changed, then `s1` is set to $0$ and the `v` columns are initialized to $0$. For write operations, the values may be different, and both selector columns `s0` and `s1` are set to $0$.
+
+The amortized cost of reading or writing a single value is between $4$ and $5$ trace cells (this accounts for the trace cells needed for $16$-bit range checks). Thus, from performance standpoint, this approach is roughly $2.5$x worse than the simple contiguous write-once memory described earlier. However, our view is that this trade-off is worth it given that this approach provides read-write memory, context separation, and eliminates the contiguous memory requirement.
+
+### AIR constraints
+
+To simplify description of constraints, we'll define two variables $n_0$ and $n_1$ as follows:
+
+$$
+n_0 = \Delta c \cdot t' \\
+n_1 = \Delta a \cdot t'
+$$
+
+Where $\Delta c = c' - c$ and $\Delta a = a' - a$.
+
+To make sure the prover sets the value of column `t` correctly, we'll need to impose the following constraints:
+
+>$$
+n_0^2 - n_0 = 0 \text{ | degree} = 4
+$$
+
+>$$
+(1 - n_0) \cdot  \Delta c = 0 \text{ | degree} = 3
+$$
+
+>$$
+(1 - n_0) \cdot (n_1^2 - n_1) = 0 \text{ | degree} = 6
+$$
+
+>$$
+(1 - n_0) \cdot (1 - n_1) \cdot \Delta a = 0 \text{ | degree} = 5
+$$
+
+The above constraints guarantee that when context changes, $n_0 = 1$. When context remains the same but address changes, $(1 - n_0) \cdot n_1 = 1$. And when neither the context nor the address change, $(1 - n_0) \cdot (1 - n_1) = 1$.
+
+To enforce the values of the selector columns, we first require that they both contain only binary values.
+
+>$$
+s_0^2 - s_0 = 0 \text{ | degree} = 2
+$$
+
+>$$
+s_1^2 - s_1 = 0 \text{ | degree} = 2
+$$
+
+Then we require that $s_1$ is always set to $1$ during read operations when the context and address did not change and to $0$ in all other cases.
+
+>$$
+(1 - n_0) \cdot (1 - n_1) \cdot s'_0 \cdot (1 - s'_1) = 0 \text{ | degree} = 6
+$$
+
+>$$
+(n_0 + (1 - n_0) \cdot n_1  + (1 - s'_0)) \cdot s'_1 = 0 \text{ | degree} = 5
+$$
+
+The first constraint enforces that `s_1` is $1$ when the operation is a read and `ctx` and `addr` are both unchanged. The second constraint enforces that when either the context changed, the address changed, or the operation is a write, then `s_1` is set to $0$.
+
+
+To enforce the values of context ID, address, and clock cycle grow monotonically as described in the previous section, we define the following constraint.
+
+>$$
+\left(n_0 \cdot \Delta c + (1 - n_0) \cdot (n_1 \cdot \Delta a + (1 - n_1) \cdot \Delta i) \right) - (2^{16} \cdot d_1' + d_0') = 0 \text{ | degree} = 5
+$$
+
+Where $\Delta i = i' - i - 1$.
+
+In addition to this constraint, we also need to make sure that the values in registers $d_0$ and $d_1$ are less than $2^{16}$, and this can be done with [range checks](../range.md).
+
+Next, we need to make sure that values at a given memory address are always initialized to $0$. This can be done with the following constraint:
+
+>$$
+s_0 \cdot (1 - s_1) \cdot v_i = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 3
+$$
+
+Thus, when the operation is a read and either the context changes or the address changes, values in the $v_i$ columns are guaranteed to be zeros.
+
+Lastly, we need to make sure that for the same context/address combination, the $v_i$ columns of the current row are equal to the corresponding $v_i$ columns of the next row. This can be done with the following constraints:
+
+>$$
+s_1 \cdot (v_i' - v_i) = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 2
+$$
+
+#### Chiplets bus constraints
+
+Communication between the memory chiplet and the stack is accomplished via the chiplets bus $b_{chip}$. To respond to memory access requests from the stack, we need to divide the current value in $b_{chip}$ by the value representing a row in the memory table. This value can be computed as follows:
+
+$$
+v_{mem} = \alpha_0 + \alpha_1 \cdot op_{mem} + \alpha_2 \cdot c + \alpha_3 \cdot a + \alpha_4 \cdot i + \sum_{j=0}^3(\alpha_{j + 5} \cdot v_j)
+$$
+
+Where, $op_{mem}$ is the unique [operation label](index.md#operation-labels) of the memory access operation.
+
+To ensure that values of memory table rows are included into the chiplets bus, we impose the following constraint:
+
+>$$
+b_{chip}' = b_{chip} \cdot v_{mem} \text{ | degree} = 2
+$$
+
+On the stack side, for every memory access request, a corresponding value is divided out of the $b_{chip}$ column. Specifics of how this is done are described [here](../stack/io-ops.md#memory-access-operations).
diff --git a/docs/miden/vm/design/decoder/constraints.md b/docs/miden/vm/design/decoder/constraints.md
new file mode 100644
index 000000000..02a1388b2
--- /dev/null
+++ b/docs/miden/vm/design/decoder/constraints.md
@@ -0,0 +1,624 @@
+In this section we describe AIR constraints for the Miden VM program decoder. These constraints enforce that the execution trace generated by the prover when executing a particular program complies with the rules described in the [previous section](index.md).
+
+To refer to decoder execution trace columns, we use the names shown on the diagram below (these are the same names as in the previous section). Additionally, we denote the register containing the value at the top of the stack as $s_0$.
+
+![air_decoder_columns](../../../../img/miden/vm/design/decoder/constraints/air_decoder_columns.png)
+
+We assume that the VM exposes a flag per operation which is set to $1$ when the operation is executed, and to $0$ otherwise. The notation for such flags is $f_{opname}$. For example, when the VM executes a `PUSH` operation, flag $f_{push} = 1$. All flags are mutually exclusive - i.e., when one flag is set to $1$ all other flags are set to $0$. The flags are computed based on values in `op_bits` columns.
+
+AIR constraints for the decoder involve operations listed in the table below. For each operation we also provide the degree of the corresponding flag and the effect that the operation has on the operand stack (however, in this section we do not cover the constraints needed to enforce the correct transition of the operand stack).
+
+| Operation | Flag          | Degree | Effect on stack                                                                                  |
+| --------- | :-----------: | :----: | ------------------------------------------------------------------------------------------------ |
+| `JOIN`    | $f_{join}$    | 5      | Stack remains unchanged.                                                                         |
+| `SPLIT`   | $f_{split}$   | 5      | Top stack element is dropped.                                                                    |
+| `LOOP`    | $f_{loop}$    | 5      | Top stack element is dropped.                                                                    |
+| `REPEAT`  | $f_{repeat}$  | 4      | Top stack element is dropped.                                                                    |
+| `SPAN`    | $f_{span}$    | 5      | Stack remains unchanged.                                                                         |
+| `RESPAN`  | $f_{respan}$  | 4      | Stack remains unchanged.                                                                         |
+| `DYN`     | $f_{dyn}$     | 5      | Stack remains unchanged.                                                                         |
+| `CALL`    | $f_{call}$    | 4      | Stack remains unchanged.                                                                         |
+| `SYSCALL` | $f_{syscall}$ | 4      | Stack remains unchanged.                                                                         |
+| `END`     | $f_{end}$     | 4      | When exiting a loop block, top stack element is dropped; otherwise, the stack remains unchanged. |
+| `HALT`    | $f_{halt}$    | 4      | Stack remains unchanged.                                                                         |
+| `PUSH`    | $f_{push}$    | 4      | An immediate value is pushed onto the stack.                                                     |
+
+We also use the [control flow flag](../stack/op-constraints.md#control-flow-flag) $f_{ctrl}$ exposed by the VM, which is set when any one of the above control flow operations is being executed. It has degree $5$.
+
+As described [previously](index.md#program-decoding), the general idea of the decoder is that the prover provides the program to the VM by populating some of the cells in the trace non-deterministically. Values in these cells are then used to update virtual tables (represented via multiset checks) such as block hash table, block stack table etc. Transition constraints are used to enforce that the tables are updated correctly, and we also apply boundary constraints to enforce the correct initial and final states of these tables. One of these boundary constraints binds the execution trace to the hash of the program being executed. Thus, if the virtual tables were updated correctly and boundary constraints hold, we can be convinced that the prover executed the claimed program on the VM.
+
+In the sections below, we describe constraints according to their logical grouping. However, we start out with a set of general constraints which are applicable to multiple parts of the decoder.
+
+## General constraints
+
+When `SPLIT` or `LOOP` operation is executed, the top of the operand stack must contain a binary value:
+
+> $$
+(f_{split} + f_{loop}) \cdot (s_0^2 - s_0) = 0 \text{ | degree} = 7
+$$
+
+When a `DYN` operation is executed, the hasher registers must all be set to $0$:
+
+> $$
+f_{dyn} \cdot h_i = 0 \text { for } i \in [0, 8) \text{ | degree} = 6
+$$
+
+When `REPEAT` operation is executed, the value at the top of the operand stack must be $1$:
+
+> $$
+f_{repeat} \cdot (1 - s_0) = 0 \text{ | degree} = 5
+$$
+
+Also, when `REPEAT` operation is executed, the value in $h_4$ column (the `is_loop_body` flag), must be set to $1$. This ensures that `REPEAT` operation can be executed only inside a loop:
+
+> $$
+f_{repeat} \cdot (1 - h_4) = 0 \text{ | degree} = 5
+$$
+
+When `RESPAN` operation is executed, we need to make sure that the block ID is incremented by $8$:
+
+> $$
+f_{respan} \cdot (a' - a - 8) = 0 \text{ | degree} = 5
+$$
+
+When `END` operation is executed and we are exiting a *loop* block (i.e., `is_loop`, value which is stored in $h_5$, is $1$), the value at the top of the operand stack must be $0$:
+
+> $$
+f_{end} \cdot h_5 \cdot s_0 = 0 \text{ | degree} = 6
+$$
+
+Also, when `END` operation is executed and the next operation is `REPEAT`, values in $h_0, ..., h_4$ (the hash of the current block and the `is_loop_body` flag) must be copied to the next row:
+
+> $$
+f_{end} \cdot f_{repeat}' \cdot (h_i' - h_i) = 0 \text { for } i \in [0, 5) \text{ | degree} = 9
+$$
+
+A `HALT` instruction can be followed only by another `HALT` instruction:
+
+> $$
+f_{halt} \cdot (1 - f_{halt}') = 0 \text{ | degree} = 8
+$$
+
+When a `HALT` operation is executed, block address column must be $0$:
+
+> $$
+f_{halt} \cdot a = 0 \text{ | degree} = 5
+$$
+
+Values in `op_bits` columns must be binary (i.e., either $1$ or $0$):
+
+> $$
+b_i^2 - b_i = 0 \text{ for } i \in [0, 7) \text{ | degree} = 2
+$$
+
+When the value in `in_span` column is set to $1$, control flow operations cannot be executed on the VM, but when `in_span` flag is $0$, only control flow operations can be executed on the VM:
+
+> $$
+1 - sp - f_{ctrl} = 0 \text{ | degree} = 5
+$$
+
+## Block hash computation constraints
+
+As described [previously](index.md#program-block-hashing), when the VM starts executing a new block, it also initiates computation of the block's hash. There are two separate methodologies for computing block hashes.
+
+For *join*, *split*, and *loop* blocks, the hash is computed directly from the hashes of the block's children. The prover provides these child hashes non-deterministically by populating registers $h_0,..., h_7$. For *dyn*, the hasher registers are populated with zeros, so the resulting hash is a constant value. The hasher is initialized using the hash chiplet, and we use the address of the hasher as the block's ID. The result of the hash is available $7$ rows down in the hasher table (i.e., at row with index equal to block ID plus $7$). We read the result from the hasher table at the time the `END` operation is executed for a given block.
+
+For *span* blocks, the hash is computed by absorbing a linear sequence of instructions (organized into operation groups and batches) into the hasher and then returning the result. The prover provides operation batches non-deterministically by populating registers $h_0, ..., h_7$. Similarly to other blocks, the hasher is initialized using the hash chiplet at the start of the block, and we use the address of the hasher as the ID of the first operation batch in the block. As we absorb additional operation batches into the hasher (by executing `RESPAN` operation), the batch address is incremented by $8$. This moves the "pointer" into the hasher table $8$ rows down with every new batch. We read the result from the hasher table at the time the `END` operation is executed for a given block.
+
+### Chiplets bus constraints
+
+The decoder communicates with the hash chiplet via the [chiplets bus](../chiplets/index.md#chiplets-bus). This works by dividing values of the multiset check column $b_{chip}$ by the values of operations providing inputs to or reading outputs from the hash chiplet. A constraint to enforce this would look as $b_{chip}' \cdot u = b_{chip}$, where $u$ is the value which defines the operation.
+
+In constructing value of $u$ for decoder AIR constraints, we will use the following labels (see [here](../chiplets/hasher.md#multiset-check-constraints) for an explanation of how values for these labels are computed):
+
+* $m_{bp}$ this label specifies that we are starting a new hash computation.
+* $m_{abp}$ this label specifies that we are absorbing the next sequence of $8$ elements into an ongoing hash computation.
+* $m_{hout}$ this label specifies that we are reading the result of a hash computation.
+
+To simplify constraint description, we define the following variables:
+
+$$
+h_{init} = \alpha_0 + \alpha_1 \cdot m_{bp} + \alpha_2 \cdot a' + \sum_{i=0}^7(\alpha_{i + 8} \cdot h_i)
+$$
+
+In the above, $h_{init}$ can be thought of as initiating a hasher with address $a'$ and absorbing $8$ elements from the hasher state ($h_0, ..., h_7$) into it. Control blocks are always padded to fill the hasher rate and as such the $\alpha_4$ (first capacity register) term is set to $0$.
+
+$$
+h_{abp} = \alpha_0 + \alpha_1 \cdot m_{abp} + \alpha_2 \cdot a' + \sum_{i=0}^7(\alpha_{i + 8} \cdot h_i)
+$$
+
+It should be noted that $a$ refers to a column in the decoder, as depicted. The addresses in this column are set using the address from the hasher chiplet for the corresponding hash initialization / absorption / return. In the case of $h_{abp}$ the value of the address in column $a$ in the current row of the decoder is set to equal the value of the address of the row in the hasher chiplet where the previous absorption (or initialization) occurred. $a'$ is the address of the next row of the decoder, which is set to equal the address in the hasher chiplet where the absorption referred to by the $h_{abp}$ label is happening.
+
+$$
+h_{res} = \alpha_0 + \alpha_1 \cdot m_{hout} + \alpha_2 \cdot (a + 7) + \sum_{i=0}^3(\alpha_{i + 8} \cdot h_i)
+$$
+
+In the above, $a$ represents the address value in the decoder which corresponds to the hasher chiplet address at which the hasher was initialized (or the last absorption took place).  As such, $a + 7$ corresponds to the hasher chiplet address at which the result is returned.
+
+$$
+f_{ctrli} = f_{join} + f_{split} + f_{loop} + f_{dyn} + f_{call} \text{ | degree} = 5
+$$
+
+In the above, $f_{ctrli}$ is set to $1$ when a control flow operation that signifies the initialization of a control block is being executed on the VM.  Otherwise, it is set to $0$. An exception is made for the `SYSCALL` operation. Although it also signifies the initialization of a control block, it must additionally send a procedure access request to the [kernel ROM chiplet](../chiplets/kernel_rom.md) via the chiplets bus. Therefore, it is excluded from this flag and its communication with the chiplets bus is handled separately.
+
+$$
+d = \sum_{i=0}^6(b_i \cdot 2^i)
+$$
+
+In the above, $d$ represents the opcode value of the opcode being executed on the virtual machine. It is calculated via a bitwise combination of the op bits. We leverage the opcode value to achieve domain separation when hashing control blocks. This is done by populating the second capacity register of the hasher with the value $d$ via the $\alpha_5$ term when initializing the hasher.
+
+Using the above variables, we define operation values as described below.
+
+When a control block initializer operation (`JOIN`, `SPLIT`, `LOOP`, `DYN`, `CALL`, `SYSCALL`) is executed, a new hasher is initialized and the contents of $h_0, ..., h_7$ are absorbed into the hasher. As mentioned above, the opcode value $d$ is populated in the second capacity register via the $\alpha_5$ term.
+
+$$
+u_{ctrli} = f_{ctrli} \cdot (h_{init} + \alpha_5 \cdot d) \text{ | degree} = 6
+$$
+
+As mentioned previously, the value sent by the `SYSCALL` operation is defined separately, since in addition to communicating with the hash chiplet it must also send a kernel procedure access request to the kernel ROM chiplet. The value of this kernel procedure request is described by $k_{proc}$.
+
+$$
+k_{proc} = \alpha_6 + \alpha_7 \cdot op_{krom} + \sum_{i=0}^3 (\alpha_{i + 8} \cdot h_i)
+$$
+
+In the above, $op_{krom}$ is the unique [operation label](./index.md#operation-labels) of the kernel procedure call operation. The values $h_0, h_1, h_2, h_3$ contain the root hash of the procedure being called, which is the procedure that must be requested from the kernel ROM chiplet.
+
+$$
+u_{syscall} = f_{syscall} \cdot (h_{init} + \alpha_5 \cdot d) \cdot k_{proc} \text{ | degree} = 7
+$$
+
+The above value sends both the hash initialization request and the kernel procedure access request to the chiplets bus when the `SYSCALL` operation is executed.
+
+When `SPAN` operation is executed, a new hasher is initialized and contents of $h_0, ..., h_7$ are absorbed into the hasher. The number of operation groups to be hashed is padded to a multiple of the rate width ($8$) and so the $\alpha_4$ term is set to $0$:
+
+$$
+u_{span} = f_{span} \cdot h_{init} \text{ | degree} = 6
+$$
+
+When `RESPAN` operation is executed, contents of $h_0, ..., h_7$ (which contain the new operation batch) are absorbed into the hasher:
+
+$$
+u_{respan} = f_{respan} \cdot h_{abp} \text{ | degree} = 5
+$$
+
+When `END` operation is executed, the hash result is copied into registers $h_0, ..., h_3$:
+
+$$
+u_{end} = f_{end} \cdot h_{res} \text{ | degree} = 5
+$$
+
+Using the above definitions, we can describe the constraint for computing block hashes as follows:
+
+> $$
+b_{chip}' \cdot (u_{ctrli} + u_{syscall} + u_{span} + u_{respan} + u_{end} + \\
+1 - (f_{ctrli} + f_{syscall} + f_{span} + f_{respan} + f_{end})) = b_{chip}
+$$
+
+We need to add $1$ and subtract the sum of the relevant operation flags to ensure that when none of the flags is set to $1$, the above constraint reduces to $b_{chip}' = b_{chip}$.
+
+The degree of this constraint is $8$.
+
+## Block stack table constraints
+As described [previously](index.md#block-stack-table), block stack table keeps track of program blocks currently executing on the VM. Thus, whenever the VM starts executing a new block, an entry for this block is added to the block stack table. And when execution of a block completes, it is removed from the block stack table.
+
+Adding and removing entries to/from the block stack table is accomplished as follows:
+* To add an entry, we multiply the value in column $p_1$ by a value representing a tuple `(blk_id, prnt_id, is_loop)`. A constraint to enforce this would look as $p_1' = p_1 \cdot v$, where $v$ is the value representing the row to be added.
+* To remove an entry, we divide the value in column $p_1$ by a value representing a tuple `(blk_id, prnt_id, is_loop)`. A constraint to enforce this would look as $p_1' \cdot u = p_1$, where $u$ is the value representing the row to be removed.
+
+Before describing the constraints for the block stack table, we first describe how we compute the values to be added and removed from the table for each operation. In the below, for block start operations (`JOIN`, `SPLIT`, `LOOP`, `SPAN`) $a$ refers to the ID of the parent block, and $a'$ refers to the ID of the starting block. For `END` operation, the situation is reversed: $a$ is the ID of the ending block, and $a'$ is the ID of the parent block. For `RESPAN` operation, $a$ refers to the ID of the current operation batch, $a'$ refers to the ID of the next batch, and the parent ID for both batches is set by the prover non-deterministically in register $h_1$.
+
+When `JOIN` operation is executed, row $(a', a, 0)$ is added to the block stack table:
+
+$$
+v_{join} = f_{join} \cdot (\alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot a) \text{ | degree} = 6
+$$
+
+When `SPLIT` operation is executed, row $(a', a, 0)$ is added to the block stack table:
+
+$$
+v_{split} = f_{split} \cdot (\alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot a) \text{ | degree} = 6
+$$
+
+When `LOOP` operation is executed, row $(a', a, 1)$ is added to the block stack table if the value at the top of the operand stack is $1$, and row $(a', a, 0)$ is added to the block stack table if the value at the top of the operand stack is $0$:
+
+$$
+v_{loop} = f_{loop} \cdot (\alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot a + \alpha_3 \cdot s_0) \text{ | degree} = 6
+$$
+
+When `SPAN` operation is executed, row $(a', a, 0)$ is added to the block stack table:
+
+$$
+v_{span} = f_{span} \cdot (\alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot a) \text{ | degree} = 6
+$$
+
+When `RESPAN` operation is executed, row $(a, h_1', 0)$ is removed from the block stack table, and row $(a', h_1', 0)$ is added to the table. The prover sets the value of register $h_1$ at the next row to the ID of the parent block:
+
+$$
+u_{respan} = f_{respan} \cdot (\alpha_0 + \alpha_1 \cdot a + \alpha_2 \cdot h_1') \text{ | degree} = 5  \\
+v_{respan} = f_{respan} \cdot (\alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot h_1') \text{ | degree} = 5
+$$
+
+When a `DYN` operation is executed, row $(a', a, 0)$ is added to the block stack table:
+
+$$
+v_{dyn} = f_{dyn} \cdot (\alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot a) \text{ | degree} = 6
+$$
+
+When `END` operation is executed, row $(a, a', h_5)$ is removed from the block stack table. Register $h_5$ contains the `is_loop` flag:
+
+$$
+u_{end} = f_{end} \cdot (\alpha_0 + \alpha_1 \cdot a + \alpha_2 \cdot a' + \alpha_3 \cdot h_5) \text{ | degree} = 5
+$$
+
+Using the above definitions, we can describe the constraint for updating the block stack table as follows:
+
+> $$
+p_1' \cdot (u_{end} + u_{respan} + 1 - (f_{end} + f_{respan})) = p_1 \cdot \\
+(v_{join} + v_{split} + v_{loop} + v_{span} + v_{respan} + v_{dyn} + 1 - \\
+(f_{join} + f_{split} + f_{loop} + f_{span} + f_{respan} + f_{dyn}))
+$$
+
+We need to add $1$ and subtract the sum of the relevant operation flags from each side to ensure that when none of the flags is set to $1$, the above constraint reduces to $p_1' = p_1$.
+
+The degree of this constraint is $7$.
+
+In addition to the above transition constraint, we also need to impose boundary constraints against the $p_1$ column to make sure the first and the last values in the column are set to $1$. This enforces that the block stack table starts and ends in an empty state.
+
+## Block hash table constraints
+
+As described [previously](index.md#block-hash-table), when the VM starts executing a new program block, it adds hashes of the block's children to the block hash table. And when the VM finishes executing a block, it removes the block's hash from the block hash table. This means that the block hash table gets updated when we execute the `JOIN`, `SPLIT`, `LOOP`, `REPEAT`, `DYN`, and `END` operations (executing `SPAN` operation does not affect the block hash table because a *span* block has no children).
+
+Adding and removing entries to/from the block hash table is accomplished as follows:
+* To add an entry, we multiply the value in column $p_2$ by a value representing a tuple `(prnt_id, block_hash, is_first_child, is_loop_body)`. A constraint to enforce this would look as $p_2' = p_2 \cdot v$, where $v$ is the value representing the row to be added.
+* To remove an entry, we divide the value in column $p_2$ by a value representing a tuple `(prnt_id, block_hash, is_first_child, is_loop_body)`. A constraint to enforce this would look as $p_2' \cdot u = p_2$, where $u$ is the value representing the row to be removed.
+
+To simplify constraint descriptions, we define values representing left and right children of a block as follows:
+
+$$
+ch_1 = \alpha_0 + \alpha_1 \cdot a' + \sum_{i=0}^3(\alpha_{i+2} \cdot h_i) \text{ | degree} = 1 \\
+ch_2 = \alpha_0 + \alpha_1 \cdot a' + \sum_{i=0}^3(\alpha_{i+2} \cdot h_{i+4}) \text{ | degree} = 1
+$$
+
+Graphically, this looks like so:
+
+![air_decoder_left_right_child](../../../../img/miden/vm/design/decoder/constraints/air_decoder_left_right_child.png)
+
+In a similar manner, we define a value representing the result of hash computation as follows:
+
+$$
+bh = \alpha_0 + \alpha_1 \cdot a + \sum_{i=0}^3(\alpha_{i+2} \cdot h_i) + \alpha_7 \cdot h_4 \text{ | degree} = 1
+$$
+
+Note that in the above we use $a$ (block address from the current row) rather than $a'$ (block address from the next row) as we did for values of $ch_1$ and $ch_2$. Also, note that we are not adding a flag indicating whether the block is the first child of a join block (i.e., $\alpha_6$ term is missing). It will be added later on.
+
+Using the above variables, we define row values to be added to and removed from the block hash table as follows.
+
+When `JOIN` operation is executed, hashes of both child nodes are added to the block hash table. We add $\alpha_6$ term to the first child value to differentiate it from the second child (i.e., this sets `is_first_child` to $1$):
+
+$$
+v_{join} = f_{join} \cdot (ch_1 + \alpha_6) \cdot ch_2  \text{ | degree} = 7
+$$
+
+When `SPLIT` operation is executed and the top of the stack is $1$, hash of the *true* branch is added to the block hash table, but when the top of the stack is $0$, hash of the *false* branch is added to the block hash table:
+
+$$
+v_{split} = f_{split} \cdot (s_0 \cdot ch_1 + (1 - s_0) \cdot ch_2)  \text{ | degree} = 7
+$$
+
+When `LOOP` operation is executed and the top of the stack is $1$, hash of loop body is added to the block hash table. We add $\alpha_7$ term to indicate that the child is a body of a loop. The below also means that if the top of the stack is $0$, nothing is added to the block hash table as the expression evaluates to $0$:
+
+$$
+v_{loop} = f_{loop} \cdot s_0 \cdot (ch_1 + \alpha_7) \text{ | degree} = 7
+$$
+
+When `REPEAT` operation is executed, hash of loop body is added to the block hash table. We add $\alpha_7$ term to indicate that the child is a body of a loop:
+
+$$v_{repeat} = f_{repeat} \cdot (ch_1 + \alpha_7) \text{ | } \text{degree} = 5$$
+
+When the `DYN` operation is executed, the hash of the dynamic child is added to the block hash table. Since the child is dynamically specified by the top four elements of the stack, the value representing the *dyn* block's child must be computed based on the stack rather than from the decoder's hasher registers:
+
+$$
+ch_{dyn} = \alpha_0 + \alpha_1 \cdot a' + \sum_{i=0}^3(\alpha_{i+2} \cdot s_{3-i}) \text{ | degree} = 1
+$$
+
+$$
+v_{dyn} = f_{dyn} \cdot ch_{dyn}  \text{ | degree} = 6
+$$
+
+When `END` operation is executed, hash of the completed block is removed from the block hash table. However, we also need to differentiate between removing the first and the second child of a *join* block. We do this by looking at the next operation. Specifically, if the next operation is neither `END` nor `REPEAT` we know that another block is about to be executed, and thus, we have just finished executing the first child of a *join* block. Thus, if the next operation is neither `END` nor `REPEAT` we need to set the term for $\alpha_6$ coefficient to $1$ as shown below:
+
+$$
+u_{end} = f_{end} \cdot (bh + \alpha_6 \cdot (1 - (f_{end}' + f_{repeat}'))) \text{ | } \text{degree} = 8
+$$
+
+Using the above definitions, we can describe the constraint for updating the block hash table as follows:
+
+> $$
+p_2' \cdot (u_{end} + 1 - f_{end}) = \\
+p_2 \cdot (v_{join} + v_{split} + v_{loop} + v_{repeat} + v_{dyn} + 1 - (f_{join} + f_{split} + f_{loop} + f_{repeat} + f_{dyn}))
+$$
+
+We need to add $1$ and subtract the sum of the relevant operation flags from each side to ensure that when none of the flags is set to $1$, the above constraint reduces to $p_2' = p_2$.
+
+The degree of this constraint is $9$.
+
+In addition to the above transition constraint, we also need to set the following boundary constraints against the $p_2$ column:
+
+* The first value in the column represents a row for the entire program. Specifically, the row tuple would be `(0, program_hash, 0, 0)`. This row should be removed from the table when the last `END` operation is executed.
+* The last value in the column is $1$ - i.e., the block hash table is empty.
+
+## Span block
+
+Span block constraints ensure proper decoding of span blocks. In addition to the block stack table constraints and block hash table constraints described previously, decoding of span blocks requires constraints described below.
+
+### In-span column constraints
+
+The `in_span` column (denoted as $sp$) is used to identify rows which execute non-control flow operations. The values in this column are set as follows:
+
+* Executing a `SPAN` operation sets the value of `in_span` column to $1$.
+* The value remains $1$ until the `END` operation is executed.
+* If `RESPAN` operation is executed between `SPAN` and `END` operations, in the row at which `RESPAN` operation is executed `in_span` is set to $0$. It is then reset to $1$ in the following row.
+* In all other cases, value in the `in_span` column should be $0$.
+
+The picture below illustrates the above rules.
+
+![air_decoder_in_spans_column_constraint](../../../../img/miden/vm/design/decoder/constraints/air_decoder_in_spans_column_constraint.png)
+
+To enforce the above rules we need the following constraints.
+
+When executing `SPAN` or `RESPAN` operation, the next value in $sp$ column must be set to $1$:
+
+> $$
+(f_{span} + f_{respan}) \cdot (1 - sp') = 0 \text{ | degree} = 6
+$$
+
+When the next operation is `END` or `RESPAN`, the next value in $sp$ column must be set to $0$.
+
+> $$
+(f_{end}' + f_{respan}') \cdot sp' = 0 \text{ | degree} = 5
+$$
+
+In all other cases, the value in $sp$ column must be copied over to the next row:
+
+> $$
+(1 - f_{span} - f_{respan} - f_{end}' - f_{respan}') \cdot (sp' - sp) = 0 \text{ | degree} = 6
+$$
+
+Additionally, we will need to impose a boundary constraint which specifies that the first value in the $sp$ column is $0$. Note, however, that we do not need to impose a constraint ensuring that values in $sp$ are binary - this will follow naturally from the above constraints.
+
+Also, note that the combination of the above constraints makes it impossible to execute `END` or `RESPAN` operations right after `SPAN` or `RESPAN` operations.
+
+### Block address constraints
+
+When we are inside a *span* block, values in block address columns (denoted as $a$) must remain the same. This can be enforced with the following constraint:
+
+> $$
+sp \cdot (a' - a) = 0 \text{ | degree} = 2
+$$
+
+Notice that this constraint does not apply when we execute any of the control flow operations. For such operations, the prover sets the value of the $a$ column non-deterministically, except for the `RESPAN` operation. For the `RESPAN` operation the value in the $a$ column is incremented by $8$, which is enforced by a constraint described previously.
+
+Notice also that this constraint implies that when the next operation is the `END` operation, the value in the $a$ column must also be copied over to the next row. This is exactly the behavior we want to enforce so that when the `END` operation is executed, the block address is set to the address of the current span batch.
+
+### Group count constraints
+
+The `group_count` column (denoted as $gc$) is used to keep track of the number of operation groups which remain to be executed in a span block.
+
+In the beginning of a span block (i.e., when `SPAN` operation is executed), the prover sets the value of $gc$ non-deterministically. This value is subsequently decremented according to the rules described below. By the time we exit the span block (i.e., when `END` operation is executed), the value in $gc$ must be $0$.
+
+The rules for decrementing values in the $gc$ column are as follows:
+
+* The count cannot be decremented by more than $1$ in a single row.
+* When an operation group is fully executed (which happens when $h_0 = 0$ inside a span block), the count is decremented by $1$.
+* When `SPAN`, `RESPAN`, or `PUSH` operations are executed, the count is decremented by $1$.
+
+Note that these rules imply that `PUSH` operation cannot be the last operation in an operation group (otherwise the count would have to be decremented by $2$).
+
+To simplify the description of the constraints, we will define the following variable:
+
+$$
+\Delta gc = gc - gc'
+$$
+
+Using this variable, we can describe the constraints against the $gc$ column as follows:
+
+Inside a *span* block, group count can either stay the same or decrease by one:
+
+> $$
+sp \cdot \Delta gc \cdot (\Delta gc - 1) = 0 \text{ | degree} = 3
+$$
+
+When group count is decremented inside a *span* block, either $h_0$ must be $0$ (we consumed all operations in a group) or we must be executing `PUSH` operation:
+
+> $$
+sp \cdot \Delta gc \cdot (1 - f_{push})\cdot h_0 = 0 \text{ | degree} = 7
+$$
+
+Notice that the above constraint does not preclude $f_{push} = 1$ and $h_0 = 0$ from being true at the same time. If this happens, op group decoding constraints (described [here](#op-group-decoding-constraints)) will force that the operation following the `PUSH` operation is a `NOOP`.
+
+When executing a `SPAN`, a `RESPAN`, or a `PUSH` operation, group count must be decremented by $1$:
+
+> $$
+(f_{span} + f_{respan} + f_{push}) \cdot (\Delta gc - 1) = 0 \text{ | degree} = 6
+$$
+
+If the next operation is either an `END` or a `RESPAN`, group count must remain the same:
+
+> $$
+\Delta gc \cdot (f_{end}' + f_{respan}') = 0 \text{ | degree} = 5
+$$
+
+When an `END` operation is executed, group count must be $0$:
+
+> $$
+f_{end} \cdot gc = 0 \text{ | degree} = 5
+$$
+
+### Op group decoding constraints
+
+Inside a *span* block, register $h_0$ is used to keep track of operations to be executed in the current operation group. The value of this register is set by the prover non-deterministically at the time when the prover executes a `SPAN` or a `RESPAN` operation, or when processing of a new operation group within a batch starts. The picture below illustrates this.
+
+![air_decoder_op_group_constraint](../../../../img/miden/vm/design/decoder/constraints/air_decoder_op_group_constraint.png)
+
+In the above:
+
+* The prover sets the value of $h_0$ non-deterministically at row $0$. The value is set to an operation group containing operations `op0` through `op8`.
+* As we start executing the group, at every row we "remove" the least significant operation from the group. This can be done by subtracting opcode of the operation from the group, and then dividing the result by $2^7$.
+* By row $9$ the group is fully executed. This decrements the group count and sets `op_index` to $0$ (constraints against `op_index` column are described in the next section).
+* At row $10$ we start executing the next group with operations `op9` through `op11`. In this case, the prover populates $h_0$ with the group having its first operation (`op9`) already removed, and sets the `op_bits` registers to the value encoding `op9`.
+* By row $12$ this group is also fully executed.
+
+To simplify the description of the constraints, we define the following variables:
+
+$$
+op = \sum_{i=0}^6 (b_i \cdot 2^i) \\
+f_{sgc} = sp \cdot sp' \cdot (1 - \Delta gc)
+$$
+
+$op$ is just an opcode value implied by the values in `op_bits` registers. $f_{sgc}$ is a flag which is set to $1$ when the group count within a *span* block does not change. We multiply it by $sp'$ to make sure the flag is $0$ when we are about to end decoding of an operation batch. Note that $f_{sgc}$ flag is mutually exclusive with $f_{span}$, $f_{respan}$, and $f_{push}$ flags as these three operations decrement the group count.
+
+Using these variables, we can describe operation group decoding constraints as follows:
+
+When a `SPAN`, a `RESPAN`, or a `PUSH` operation is executed or when the group count does not change, the value in $h_0$ should be decremented by the value of the opcode in the next row.
+
+> $$
+(f_{span} + f_{respan} + f_{push} + f_{sgc}) \cdot (h_0 - h_0' \cdot 2^7 - op') = 0 \text{ | degree} = 6
+$$
+
+Notice that when the group count does change, and we are not executing `SPAN`, `RESPAN`, or `PUSH` operations, no constraints are placed against $h_0$, and thus, the prover can populate this register non-deterministically.
+
+When we are in a *span* block and the next operation is `END` or `RESPAN`, the current value in $h_0$ column must be $0$.
+
+> $$
+sp \cdot (f_{end}' + f_{respan}') \cdot h_0 = 0 \text{ | degree} = 6
+$$
+
+### Op index constraints
+
+The `op_index` column (denoted as $ox$) tracks index of an operation within its operation group. It is used to ensure that the number of operations executed per group never exceeds $9$. The index is zero-based, and thus, the possible set of values for $ox$ is between $0$ and $8$ (both inclusive).
+
+To simplify the description of the constraints, we will define the following variables:
+
+$$
+ng = \Delta gc - f_{push} \\
+\Delta ox = ox' - ox
+$$
+
+The value of $ng$ is set to $1$ when we are about to start executing a new operation group (i.e., group count is decremented but we did not execute a `PUSH` operation). Using these variables, we can describe the constraints against the $ox$ column as follows.
+
+When executing `SPAN` or `RESPAN` operations the next value of `op_index` must be set to $0$:
+
+> $$
+(f_{span} + f_{respan}) \cdot ox' = 0 \text{ | degree} = 6
+$$
+
+When starting a new operation group inside a *span* block, the next value of `op_index` must be set to $0$. Note that we multiply by $sp$ to exclude the cases when the group count is decremented because of `SPAN` or `RESPAN` operations:
+
+> $$
+sp \cdot ng \cdot ox' = 0 \text{ | degree} = 6
+$$
+
+When inside a *span* block but not starting a new operation group, `op_index` must be incremented by $1$. Note that we multiply by $sp'$ to exclude the cases when we are about to exit processing of an operation batch (i.e., the next operation is either `END` or `RESPAN`):
+
+> $$
+sp \cdot sp' \cdot (1 - ng) \cdot (\Delta ox - 1) = 0 \text{ | degree} = 7
+$$
+
+Values of `op_index` must be in the range $[0, 8]$.
+
+> $$
+\prod_{i=0}^{8}(ox - i) = 0 \text{ | degree} = 9
+$$
+
+### Op batch flags constraints
+
+Operation batch flag columns (denoted $bc_0$, $bc_1$, and $bc_2$) are used to specify how many operation groups are present in an operation batch. This is relevant for the last batch in a span block (or the first batch if there is only one batch in a block) as all other batches should be completely full (i.e., contain 8 operation groups).
+
+These columns are used to define the following 4 flags:
+
+* $f_{g8} = bc_0$: there are 8 operation groups in the batch.
+* $f_{g4} = (1 - bc_0) \cdot bc_1 \cdot bc_2$:  there are 4 operation groups in the batch.
+* $f_{g2} = (1 - bc_0) \cdot (1 - bc_1) \cdot bc_2$: there are 2 operation groups in the batch.
+* $f_{g1} = (1 - bc_0) \cdot bc_1 \cdot (1 - bc_2)$: there is only 1 operation group in the batch.
+
+Notice that the degree of $f_{g8}$ is $1$, while the degree of the remaining flags is $3$.
+
+These flags can be set to $1$ only when we are executing `SPAN` or `RESPAN` operations as this is when the VM starts processing new operation batches. Also, for a given flag we need to ensure that only the specified number of operation groups are present in a batch. This can be done with the following constraints.
+
+All batch flags must be binary:
+
+> $$
+bc_i^2 - bc_i = 0 \text{ for } i \in [0, 3) \text{ | degree} = 2
+$$
+
+When a `SPAN` or `RESPAN` operation is executed, one of the batch flags must be set to $1$.
+
+> $$
+(f_{span} + f_{respan}) - (f_{g1} + f_{g2} + f_{g4} + f_{g8}) = 0 \text{ | degree} = 5
+$$
+
+When we have at most 4 groups in a batch, registers $h_4, ..., h_7$ should be set to $0$'s.
+
+> $$
+(f_{g1} + f_{g2} + f_{g4}) \cdot h_i = 0 \text{ for } i \in [4, 8) \text{ | degree} = 4
+$$
+
+When we have at most 2 groups in a batch, registers $h_2$ and $h_3$ should also be set to $0$'s.
+
+> $$
+(f_{g1} + f_{g2}) \cdot h_i = 0 \text{ for } i \in \{2, 3\} \text{ | degree} = 4
+$$
+
+When we have at most 1 group in a batch, register $h_1$ should also be set to $0$.
+
+> $$
+f_{g1} \cdot h_1 = 0 \text{ | degree} = 4
+$$
+
+### Op group table constraints
+
+Op group table is used to ensure that all operation groups in a given batch are consumed before a new batch is started (i.e., via a `RESPAN` operation) or the execution of a *span* block is complete (i.e., via an `END` operation). The op group table is updated according to the following rules:
+
+* When a new operation batch is started, we add groups from this batch to the table. To add a group to the table, we multiply the value in column $p_3$ by a value representing a tuple `(batch_id, group_pos, group)`. A constraint to enforce this would look as $p_3' = p_3 \cdot v$, where $v$ is the value representing the row to be added. Depending on the batch, we may need to add multiple groups to the table (i.e., $p_3' = p_3 \cdot v_1 \cdot v_2 \cdot v_3 ...$). Flags $f_{g1}$, $f_{g2}$, $f_{g4}$, and $f_{g8}$ are used to define how many groups to add.
+* When a new operation group starts executing or when an immediate value is consumed, we remove the corresponding group from the table. To do this, we divide the value in column $p_3$ by a value representing a tuple `(batch_id, group_pos, group)`. A constraint to enforce this would look as $p_3' \cdot u = p_3$, where $u$ is the value representing the row to be removed.
+
+To simplify constraint descriptions, we first define variables representing the rows to be added to and removed from the op group table.
+
+When a `SPAN` or a `RESPAN` operation is executed, we compute the values of the rows to be added to the op group table as follows:
+
+$$
+v_i = \alpha_0 + \alpha_1 \cdot a' + \alpha_2 \cdot (gc - i) + \alpha_3 \cdot h_{i} \text{ | degree} = 1
+$$
+
+Where $i \in [1, 8)$. Thus, $v_1$ defines row value for group in $h_1$, $v_2$ defines row value for group $h_2$ etc. Note that batch address column comes from the next row of the block address column ($a'$).
+
+We compute the value of the row to be removed from the op group table as follows:
+
+$$
+u = \alpha_0 + \alpha_1 \cdot a + \alpha_2 \cdot gc + \alpha_3 \cdot ((h_0' \cdot 2^7 + op') \cdot (1 - f_{push}) + s_0' \cdot f_{push}) \text{ | degree} = 5
+$$
+
+In the above, the value of the group is computed as $(h_0' \cdot 2^7 + op') \cdot (1 - f_{push}) + s_0' \cdot f_{push}$. This basically says that when we execute a `PUSH` operation we need to remove the immediate value from the table. This value is at the top of the stack (column $s_0$) in the next row. However, when we are not executing a `PUSH` operation, the value to be removed is an op group value which is a combination of values in $h_0$ and `op_bits` columns (also in the next row). Note also that value for batch address comes from the current value in the block address column ($a$), and the group position comes from the current value of the group count column ($gc$).
+
+We also define a flag which is set to $1$ when a group needs to be removed from the op group table.
+
+$$
+f_{dg} = sp \cdot \Delta gc
+$$
+
+The above says that we remove groups from the op group table whenever group count is decremented. We multiply by $sp$ to exclude the cases when the group count is decremented due to `SPAN` or `RESPAN` operations.
+
+Using the above variables together with flags $f_{g2}$, $f_{g4}$, $f_{g8}$ defined in the previous section, we describe the constraint for updating op group table as follows (note that we do not use $f_{g1}$ flag as when a batch consists of a single group, nothing is added to the op group table):
+
+> $$
+p_3' \cdot (f_{dg} \cdot u + 1 - f_{dg}) = p_3 \cdot (f_{g2} \cdot v_1 + f_{g4} \cdot \prod_{i=1}^3 v_i + f_{g8} \cdot \prod_{i=1}^7 v_i + 1 - (f_{g2} + f_{g4} + f_{g8}))
+$$
+
+The above constraint specifies that:
+
+* When `SPAN` or `RESPAN` operations are executed, we add up to $7$ groups to the op group table ($1$, $3$, or $7$ groups depending on the batch flags, or none when the batch consists of a single group).
+* When group count is decremented inside a *span* block, we remove a group from the op group table.
+
+The degree of this constraint is $9$.
+
+In addition to the above transition constraint, we also need to impose boundary constraints against the $p_3$ column to make sure the first and the last value in the column is set to $1$. This enforces that the op group table starts and ends in an empty state.
diff --git a/docs/miden/vm/design/decoder/index.md b/docs/miden/vm/design/decoder/index.md
new file mode 100644
index 000000000..dbd1c95d1
--- /dev/null
+++ b/docs/miden/vm/design/decoder/index.md
@@ -0,0 +1,653 @@
+Miden VM program decoder is responsible for ensuring that a program with a given [MAST root](../programs.md) is executed by the VM. As the VM executes a program, the decoder does the following:
+
+1. Decodes a sequence of field elements supplied by the prover into individual operation codes (or *opcodes* for short).
+2. Organizes the sequence of field elements into code blocks, and computes the hash of the program according to the methodology described [here](../programs.md#program-hash-computation).
+
+At the end of program execution, the decoder outputs the computed program hash. This hash binds the sequence of opcodes executed by the VM to a program the prover claims to have executed. The verifier uses this hash during the STARK proof verification process to verify that the proof attests to a correct execution of a specific program (i.e., the prover didn't claim to execute program $A$ while in fact executing a different program $B$).
+
+The sections below describe how Miden VM decoder works. Throughout these sections we make the following assumptions:
+
+1. An opcode requires $7$ bits to represent.
+2. An immediate value requires one full field element to represent.
+3. A `NOOP` operation has a numeric value of $0$, and thus, can be encoded as seven zeros. Executing a `NOOP` operation does not change the state of the VM, but it does advance operation counter, and may affect program hash.
+
+## Program execution
+
+Miden VM programs consist of a set of code blocks organized into a binary tree. The leaves of the tree contain linear sequences of instructions, and control flow is defined by the internal nodes of the tree.
+
+Managing control flow in the VM is accomplished by executing control flow operations listed in the table below. Each of these operations requires exactly one VM cycle to execute.
+
+| Operation | Description                                                                  |
+| --------- | ---------------------------------------------------------------------------- |
+| `JOIN`    | Initiates processing of a new [Join block](../programs.md#join-block).       |
+| `SPLIT`   | Initiates processing of a new [Split block](../programs.md#split-block).     |
+| `LOOP`    | Initiates processing of a new [Loop block](../programs.md#loop-block).       |
+| `REPEAT`  | Initiates a new iteration of an executing loop.                              |
+| `SPAN`    | Initiates processing of a new [Span block](../programs.md#span-block).       |
+| `RESPAN`  | Initiates processing of a new operation batch within a span block.           |
+| `DYN`     | Initiates processing of a new [Dyn block](../programs.md#dyn-block).         |
+| `CALL`    | Initiates processing of a new [Call block](../programs.md#call-block).       |
+| `SYSCALL` | Initiates processing of a new [Syscall block](../programs.md#syscall-block). |
+| `END`     | Marks the end of a program block.                                            |
+| `HALT`    | Marks the end of the entire program.                                         |
+
+Let's consider a simple program below:
+
+```
+begin
+    <operations1>
+    if.true
+        <operations2>
+    else
+        <operations3>
+    end
+end
+```
+
+Block structure of this program is shown below.
+
+```
+JOIN
+    SPAN
+        <operations1>
+    END
+    SPLIT
+        SPAN
+            <operations2>
+        END
+        SPAN
+            <operations3>
+        END
+    END
+END
+```
+
+Executing this program on the VM can result in one of two possible instruction sequences. First, if after operations in `<operations1>` are executed the top of the stack is $1$, the VM will execute the following:
+
+```
+JOIN
+SPAN
+<operations1>
+END
+SPLIT
+SPAN
+<operations2>
+END
+END
+END
+HALT
+```
+
+However, if after `<operations1>` are executed, the top of the stack is $0$, the VM will execute the following:
+
+```
+JOIN
+SPAN
+<operations1>
+END
+SPLIT
+SPAN
+<operations3>
+END
+END
+END
+HALT
+```
+
+The main task of the decoder is to output exactly the same program hash, regardless of which one of the two possible execution paths was taken. However, before we can describe how this is achieved, we need to give an overview of the overall decoder structure.
+
+## Decoder structure
+
+The decoder is one of the more complex parts of the VM. It consists of the following components:
+
+* Main [execution trace](#decoder-trace) consisting of $24$ trace columns which contain the state of the decoder at a given cycle of a computation.
+* Connection to the hash chiplet, which is used to offload [hash computations](#program-block-hashing) from the decoder.
+* $3$ [virtual tables](#control-flow-tables) (implemented via multi-set checks), which keep track of code blocks and operations executing on the VM.
+
+### Decoder trace
+
+Decoder trace columns can be grouped into several logical sets of registers as illustrated below.
+
+![decoder_trace.png](../../../../img/miden/vm/design/decoder/decoder_trace.png)
+
+These registers have the following meanings:
+
+1. Block address register $a$. This register contains address of the hasher for the current block (row index from the auxiliary hashing table). It also serves the role of unique block identifiers. This is convenient, because hasher addresses are guaranteed to be unique.
+2. Registers $b_0, ..., b_6$, which encode opcodes for operation to be executed by the VM. Each of these registers can contain a single binary value (either $1$ or $0$). And together these values describe a single opcode.
+3. Hasher registers $h_0, ..., h_7$. When control flow operations are executed, these registers are used to provide inputs for the current block's hash computation (e.g., for `JOIN`, `SPLIT`, `LOOP`, `SPAN`, `CALL`, `SYSCALL` operations) or to record the result of the hash computation (i.e., for `END` operation). However, when regular operations are executed, $2$ of these registers are used to help with op group decoding, and the remaining $6$ can be used to hold operation-specific helper variables.
+4. Register $sp$ which contains a binary flag indicating whether the VM is currently executing instructions inside a *span* block. The flag is set to $1$ when the VM executes non-control flow instructions, and is set to $0$ otherwise.
+5. Register $gc$ which keeps track of the number of unprocessed operation groups in a given *span* block.
+6. Register $ox$ which keeps track of a currently executing operation's index within its operation group.
+7. Operation batch flags $c_0, c_1, c_2$ which indicate how many operation groups a given operation batch contains. These flags are set only for `SPAN` and `RESPAN` operations, and are set to $0$'s otherwise.
+8. Two additional registers (not shown) used primarily for constraint degree reduction.
+
+### Program block hashing
+
+To compute hashes of program blocks, the decoder relies on the [hash chiplet](../chiplets/hasher.md). Specifically, the decoder needs to perform two types of hashing operations:
+
+1. A simple 2-to-1 hash, where we provide a sequence of $8$ field elements, and get back $4$ field elements representing the result. Computing such a hash requires $8$ rows in the hash chiplet.
+2. A sequential hash of $n$ elements. Computing such a hash requires multiple absorption steps, and at each step $8$ field elements are absorbed into the hasher. Thus, computing a sequential hash of $n$ elements requires $\lceil {n/8} \rceil \cdot 8$ rows in the hash chiplet (i.e., $8$ rows for each absorbed block of $8$ elements). At the end, we also get $4$ field elements representing the result.
+
+To make hashing requests to the hash chiplet and to read the results from it, we will need to divide out relevant values from the [chiplets bus](../chiplets/index.md#chiplets-bus) column $b_{chip}$ as described below.
+
+#### Simple 2-to-1 hash
+
+To initiate a 2-to-1 hash of $8$ elements ($v_0, ..., v_7$) we need to divide $b_{chip}$ by the following value:
+
+$$
+\alpha_0 + \alpha_1 \cdot m_{bp} + \alpha_2 \cdot r + \sum_{i=0}^7 (\alpha_{i+8} \cdot v_i)
+$$
+
+where:
+* $m_{bp}$ is a label indicating beginning of a new permutation. Value of this label is computed based on hash chiplet selector flags according to the methodology described [here](../chiplets/hasher.md#multiset-check-constraints).
+* $r$ is the address of the row at which the hashing begins.
+* Some $\alpha$ values are skipped in the above (e.g., $\alpha_3$) because of the specifics of how auxiliary hasher table rows are reduced to field elements (described [here](../chiplets/hasher.md#multiset-check-constraints)). For example, $\alpha_3$ is used as a coefficient for node index values during Merkle path computations in the hasher, and thus, is not relevant in this case.  The $\alpha_4$ term is omitted when the number of items being hashed is a multiple of the rate width ($8$) because it is multiplied by 0 - the value of the first capacity register as determined by the [hasher chiplet logic](../chiplets/hasher.md#simple-2-to-1-hash).
+
+To read the $4$-element result ($u_0, ..., u_3$), we need to divide $b_{chip}$ by the following value:
+
+$$
+\alpha_0 + \alpha_1 \cdot m_{hout} + \alpha_2 \cdot (r + 7) + \sum_{i=0}^3 (\alpha_{i+8} \cdot u_i)
+$$
+
+where:
+* $m_{hout}$ is a label indicating return of the hash value. Value of this label is computed based on hash chiplet selector flags according to the methodology described [here](../chiplets/hasher.md#multiset-check-constraints).
+* $r$ is the address of the row at which the hashing began.
+
+#### Sequential hash
+
+To initiate a sequential hash of $n$ elements ($v_0, ..., v_{n-1}$), we need to divide $b_{chip}$ by the following value:
+
+$$
+\alpha_0 + \alpha_1 \cdot m_{bp} + \alpha_2 \cdot r + \alpha_4 \cdot n + \sum_{i=0}^7 (\alpha_{i+8} \cdot v_i)
+$$
+
+This also absorbs the first $8$ elements of the sequence into the hasher state. Then, to absorb the next sequence of $8$ elements (e.g., $v_8, ..., v_{15}$), we need to divide $b_{chip}$ by the following value:
+
+$$
+\alpha_0 + \alpha_1 \cdot m_{abp} + \alpha_2 \cdot (r + 7) + \sum_{i=0}^7 (\alpha_{i+8} \cdot v_{i + 8})
+$$
+
+Where $m_{abp}$ is a label indicating absorption of more elements into the hasher state. Value of this label is computed based on hash chiplet selector flags according to the methodology described [here](../chiplets/hasher.md#multiset-check-constraints).
+
+We can keep absorbing elements into the hasher in a similar manner until all elements have been absorbed. Then, to read the result (e.g., $u_0, ..., u_3$), we need to divide $b_{chip}$ by the following value:
+
+$$
+\alpha_0 + \alpha_1 \cdot m_{hout} + \alpha_2 \cdot (r + \lceil n / 8 \rceil \cdot 8  - 1) + \sum_{i=0}^3 (\alpha_{i+8} \cdot u_i)
+$$
+
+Thus, for example, if $n = 14$, the result of the hash will be available at hasher row $r + 15$.
+
+### Control flow tables
+
+In addition to the hash chiplet, control flow operations rely on $3$ virtual tables: *block stack* table, *block hash* table, and _op group_ table. These tables are virtual in that they don't require separate trace columns. Their state is described solely by running product columns: $p_1$, $p_2$, and $p_3$. The tables are described in the following sections.
+
+#### Block stack table
+
+When the VM starts executing a new program block, it adds its block ID together with the ID of its parent block (and some additional info) to the *block stack* table. When a program block is fully executed, it is removed from the table. In this way, the table represents a stack of blocks which are currently executing on the VM. By the time program execution completes, block stack table must be empty.
+
+The table can be thought of as consisting of $3$ columns as shown below:
+
+![decoder_block_stack_table](../../../../img/miden/vm/design/decoder/decoder_block_stack_table.png)
+
+where:
+* The first column ($t_0$) contains the ID of the block.
+* The second column ($t_1$) contains the ID of the parent block. If the block has no parent (i.e., it is a root block of the program), parent ID is 0.
+* The third column ($t_2$) contains a binary value which is set to $1$ if the block is a *loop* block, and to $0$ otherwise.
+
+Running product column $p_1$ is used to keep track of the state of the table. At any step of the computation, the current value of $p_1$ defines which rows are present in the table.
+
+To reduce a row in the block stack table to a single value, we compute the following.
+
+$$
+row = \alpha_0 + \sum_{i=0}^2 (\alpha_{i+1} \cdot t_i)
+$$
+
+Where $\alpha_0, ..., \alpha_3$ are the random values provided by the verifier.
+
+#### Block hash table
+
+When the VM starts executing a new program block, it adds hashes of the block's children to the *block hash* table. And when the VM finishes executing a block, it removes its hash from the block hash table. Thus, by the time program execution completes, block hash table must be empty.
+
+The table can be thought of as consisting of $7$ columns as shown below:
+
+![block_hash_table](../../../../img/miden/vm/design/decoder/block_hash_table.png)
+
+where:
+* The first column ($t_0$) contains the ID of the block's parent. For program root, parent ID is $0$.
+* The next $4$ columns ($t_1, ..., t_4$) contain the hash of the block.
+* The next column ($t_5$) contains a binary value which is set to $1$ if the block is the first child of a *join* block, and to $0$ otherwise.
+* The last column ($t_6$) contains a binary value which is set to $1$ if the block is a body of a loop, and to $0$ otherwise.
+
+Running product column $p_2$ is used to keep track of the state of the table. At any step of the computation, the current value of $p_2$ defines which rows are present in the table.
+
+To reduce a row in the block hash table to a single value, we compute the following.
+
+$$
+row = \alpha_0 + \sum_{i=0}^6 (\alpha_{i+1} \cdot t_i)
+$$
+
+Where $\alpha_0, ..., \alpha_7$ are the random values provided by the verifier.
+
+Unlike other virtual tables, block hash table does not start out in an empty state. Specifically, it is initialized with a single row containing the hash of the program's root block. This needs to be done because the root block does not have a parent and, thus, otherwise it would never be added to the block hash table.
+
+Initialization of the block hash table is done by setting the initial value of $p_2$ to the value of the row containing the hash of a program's root block.
+
+#### Op group table
+
+*Op group* table is used in decoding of *span* blocks, which are leaves in a program's MAST. As described [here](../programs.md#span-block), a *span* block can contain one or more operation batches, each batch containing up to $8$ operation groups.
+
+When the VM starts executing a new batch of operations, it adds all operation groups within a batch, except for the first one, to the *op group* table. Then, as the VM starts executing an operation group, it removes the group from the table. Thus, by the time all operation groups in a batch have been executed, the *op group* table must be empty.
+
+The table can be thought of as consisting of $3$ columns as shown below:
+
+![decoder_op_group_table](../../../../img/miden/vm/design/decoder/decoder_op_group_table.png)
+
+The meaning of the columns is as follows:
+
+* The first column ($t_0$) contains operation batch ID. During the execution of the program, each operation batch is assigned a unique ID.
+* The second column ($t_1$) contains the position of the group in the *span* block (not just in the current batch). The position is $1$-based and is counted from the end. Thus, for example, if a *span* block consists of a single batch with $4$ groups, the position of the first group would be $4$, the position of the second group would be $3$ etc. (the reason for this is explained in [this](#single-batch-span) section). Note that the group with position $4$ is not added to the table, because it is the first group in the batch, so the first row of the table will be for the group with position $3$.
+* The third column ($t_2$) contains the actual values of operation groups (this could include up to $9$ opcodes or a single immediate value).
+
+Permutation column $p_3$ is used to keep track of the state of the table. At any step of the computation, the current value of $p_3$ defines which rows are present in the table.
+
+To reduce a row in the op group table to a single value, we compute the following.
+
+$$
+row = \alpha_0 + \sum_{i=0}^2 (\alpha_{i+1} \cdot t_i)
+$$
+
+Where $\alpha_0, ..., \alpha_3$ are the random values provided by the verifier.
+
+### Control flow operation semantics
+
+In this section we describe high-level semantics of executing all control flow operations. The descriptions are not meant to be complete and omit some low-level details. However, they provide good intuition on how these operations work.
+
+#### JOIN operation
+
+Before a `JOIN` operation is executed by the VM, the prover populates $h_0, ..., h_7$ registers with hashes of left and right children of the *join* program block as shown in the diagram below.
+
+![decoder_join_operation](../../../../img/miden/vm/design/decoder/decoder_join_operation.png)
+
+In the above diagram, `blk` is the ID of the *join* block which is about to be executed. `blk` is also the address of the hasher row in the auxiliary hasher table. `prnt` is the ID of the block's parent.
+
+When the VM executes a `JOIN` operation, it does the following:
+
+1. Adds a tuple `(blk, prnt, 0)` to the block stack table.
+2. Adds tuples `(blk, left_child_hash, 1, 0)` and `(blk, right_child_hash, 0, 0)` to the block hash table.
+3. Initiates a 2-to-1 hash computation in the hash chiplet (as described [here](#simple-2-to-1-hash)) using `blk` as row address in the auxiliary hashing table and $h_0, ..., h_7$ as input values.
+
+#### SPLIT operation
+
+Before a `SPLIT` operation is executed by the VM, the prover populates $h_0, ..., h_7$ registers with hashes of true and false branches of the *split* program block as shown in the diagram below.
+
+![decoder_split_operation](../../../../img/miden/vm/design/decoder/decoder_split_operation.png)
+
+In the above diagram, `blk` is the ID of the *split* block which is about to be executed. `blk` is also the address of the hasher row in the auxiliary hasher table. `prnt` is the ID of the block's parent.
+
+When the VM executes a `SPLIT` operation, it does the following:
+
+1. Adds a tuple `(blk, prnt, 0)` to the block stack table.
+2. Pops the stack and:\
+   a. If the popped value is $1$, adds a tuple `(blk, true_branch_hash, 0, 0)` to the block hash table.\
+   b. If the popped value is $0$, adds a tuple `(blk, false_branch_hash, 0, 0)` to the block hash table.\
+   c. If the popped value is neither $1$ nor $0$, the execution fails.
+3. Initiates a 2-to-1 hash computation in the hash chiplet (as described [here](#simple-2-to-1-hash)) using `blk` as row address in the auxiliary hashing table and $h_0, ..., h_7$ as input values.
+
+#### LOOP operation
+
+Before a `LOOP` operation is executed by the VM, the prover populates $h_0, ..., h_3$ registers with hash of the loop's body as shown in the diagram below.
+
+![decoder_loop_operation](../../../../img/miden/vm/design/decoder/decoder_loop_operation.png)
+
+In the above diagram, `blk` is the ID of the *loop* block which is about to be executed. `blk` is also the address of the hasher row in the auxiliary hasher table. `prnt` is the ID of the block's parent.
+
+When the VM executes a `LOOP` operation, it does the following:
+
+1. Pops the stack and:\
+   a. If the popped value is $1$ adds a tuple `(blk, prnt, 1)` to the block stack table (the `1` indicates that the loop's body is expected to be executed). Then, adds a tuple `(blk, loop_body_hash, 0, 1)` to the block hash table.\
+   b. If the popped value is $0$, adds `(blk, prnt, 0)` to the block stack table. In this case, nothing is added to the block hash table.\
+   c. If the popped value is neither $1$ nor $0$, the execution fails.
+2. Initiates a 2-to-1 hash computation in the hash chiplet (as described [here](#simple-2-to-1-hash)) using `blk` as row address in the auxiliary hashing table and $h_0, ..., h_3$ as input values.
+
+#### SPAN operation
+
+Before a `SPAN` operation is executed by the VM, the prover populates $h_0, ..., h_7$ registers with contents of the first operation batch of the span block as shown in the diagram below. The prover also sets the group count register $gc$ to the total number of operation groups in the span block.
+
+![decoder_span_block](../../../../img/miden/vm/design/decoder/decoder_span_block.png)
+
+In the above diagram, `blk` is the ID of the *span* block which is about to be executed. `blk` is also the address of the hasher row in the auxiliary hasher table. `prnt` is the ID of the block's parent. `g0_op0` is the first operation of the batch, and `g_0'` is the first operation group of the batch with the first operation removed.
+
+When the VM executes a `SPAN` operation, it does the following:
+
+1. Adds a tuple `(blk, prnt, 0)` to the block stack table.
+2. Adds groups of the operation batch, as specified by op batch flags (see [here](#operation-batch-flags)) to the op group table.
+3. Initiates a sequential hash computation in the hash chiplet (as described [here](#sequential-hash)) using `blk` as row address in the auxiliary hashing table and $h_0, ..., h_7$ as input values.
+4. Sets the `in_span` register to $1$.
+5. Decrements `group_count` register by $1$.
+6. Sets the `op_index` register to $0$.
+
+#### DYN operation
+
+Before a `DYN` operation is executed by the VM, the prover populates $h_0, ..., h_7$ registers with $0$ as shown in the diagram below.
+
+![decoder_dyn_operation](../../../../img/miden/vm/design/decoder/decoder_dyn_operation.png)
+
+In the above diagram, `blk` is the ID of the *dyn* block which is about to be executed. `blk` is also the address of the hasher row in the auxiliary hasher table. `prnt` is the ID of the block's parent.
+
+When the VM executes a `DYN` operation, it does the following:
+
+1. Adds a tuple `(blk, prnt, 0)` to the block stack table.
+2. Gets the hash of the dynamic code block `dynamic_block_hash` from the top four elements of the stack.
+3. Adds the tuple `(blk, dynamic_block_hash, 0, 0)` to the block hash table.
+4. Initiates a 2-to-1 hash computation in the hash chiplet (as described [here](#simple-2-to-1-hash)) using `blk` as row address in the auxiliary hashing table and $h_0, ..., h_7$ as input values.
+
+#### END operation
+
+Before an `END` operation is executed by the VM, the prover populates $h_0, ..., h_3$ registers with the hash of the block which is about to end. The prover also sets values in $h_4$ and $h_5$ registers as follows:
+* $h_4$ is set to $1$ if the block is a body of a *loop* block. We denote this value as `f0`.
+* $h_5$ is set to $1$ if the block is a *loop* block. We denote this value as `f1`.
+
+![decoder_end_operation](../../../../img/miden/vm/design/decoder/decoder_end_operation.png)
+
+In the above diagram, `blk` is the ID of the block which is about to finish executing. `prnt` is the ID of the block's parent.
+
+When the VM executes an `END` operation, it does the following:
+
+1. Removes a tuple `(blk, prnt, f1)` from the block stack table.
+2. Removes a tuple `(prnt, current_block_hash, nxt, f0)` from the block hash table, where $nxt=0$ if the next operation is either `END` or `REPEAT`, and $1$ otherwise.
+3. Reads the hash result from the hash chiplet (as described [here](#program-block-hashing)) using `blk + 7` as row address in the auxiliary hashing table.
+4. If $h_5 = 1$ (i.e., we are exiting a *loop* block), pops the value off the top of the stack and verifies that the value is $0$.
+5. Verifies that `group_count` register is set to $0$.
+
+#### HALT operation
+
+Before a `HALT` operation is executed by the VM, the VM copies values in $h_0, ..., h_3$ registers to the next row as illustrated in the diagram below:
+
+![decoder_halt_operation](../../../../img/miden/vm/design/decoder/decoder_halt_operation.png)
+
+In the above diagram, `blk` is the ID of the block which is about to finish executing.
+
+When the VM executes a `HALT` operation, it does the following:
+
+1. Verifies that block address register is set to $0$.
+2. If we are not at the last row of the trace, verifies that the next operation is `HALT`.
+3. Copies values of $h_0, ..., h_3$ registers to the next row.
+4. Populates all other decoder registers with $0$'s in the next row.
+
+#### REPEAT operation
+
+Before a `REPEAT` operation is executed by the VM, the VM copies values in registers $h_0, ..., h_4$ to the next row as shown in the diagram below.
+
+![decoder_repeat_operation](../../../../img/miden/vm/design/decoder/decoder_repeat_operation.png)
+
+In the above diagram, `blk` is the ID of the loop's body and `prnt` is the ID of the loop.
+
+When the VM executes a `REPEAT` operation, it does the following:
+
+1. Checks whether register $h_4$ is set to $1$. If it isn't (i.e., we are not in a loop), the execution fails.
+2. Pops the stack and if the popped value is $1$, adds a tuple `(prnt, loop_body_hash, 0, 1)` to the block hash table. If the popped value is not $1$, the execution fails.
+
+The effect of the above is that the VM needs to execute the loop's body again to clear the block hash table.
+
+#### RESPAN operation
+
+Before a `RESPAN` operation is executed by the VM, the VM copies the ID of the current block `blk` and the number of remaining operation groups in the span to the next row, and sets the value of `in_span` column to $0$. The prover also sets the value of $h_1$ register for the next row to the ID of the current block's parent `prnt` as shown in the diagram below:
+
+![decoder_respan_operation](../../../../img/miden/vm/design/decoder/decoder_respan_operation.png)
+
+In the above diagram, `g0_op0` is the first operation of the new operation batch, and `g0'` is the first operation group of the batch with `g0_op0` operation removed.
+
+When the VM executes a `RESPAN` operation, it does the following:
+
+1. Increments block address by $8$.
+2. Removes the tuple `(blk, prnt, 0)` from the block stack table.
+3. Adds the tuple `(blk+8, prnt, 0)` to the block stack table.
+4. Absorbs values in registers $h_0, ..., h_7$ into the hasher state of the hash chiplet (as described [here](#sequential-hash)).
+5. Sets the `in_span` register to $1$.
+6. Adds groups of the operation batch, as specified by op batch flags (see [here](#operation-batch-flags)) to the op group table using `blk+8` as batch ID.
+
+The net result of the above is that we incremented the ID of the current block by $8$ and added the next set of operation groups to the op group table.
+
+## Program decoding
+
+When decoding a program, we start at the root block of the program. We can compute the hash of the root block directly from hashes of its children. The prover provides hashes of the child blocks non-deterministically, and we use them to compute the program's hash (here we rely on the hash chiplet). We then verify the program hash via boundary constraints. Thus, if the prover provided valid hashes for the child blocks, we will get the expected program hash.
+
+Now, we need to verify that the VM executed the child blocks correctly. We do this recursively similar to what is described above: for each of the blocks, the prover provides hashes of its children non-deterministically and we verify that the hash has been computed correctly. We do this until we get to the leaf nodes (i.e., *span* blocks). Hashes of *span* blocks are computed sequentially from the instructions executed by the VM.
+
+The sections below illustrate how different types of code blocks are decoded by the VM.
+
+### JOIN block decoding
+
+When decoding a *join* block, the VM first executes a `JOIN` operation, then executes the first child block, followed by the second child block. Once the children of the *join* block are executed, the VM executes an `END` operation. This is illustrated in the diagram below.
+
+![decoder_join_block_decoding](../../../../img/miden/vm/design/decoder/decoder_join_block_decoding.png)
+
+As described previously, when the VM executes a `JOIN` operation, hashes of both children are added to the block hash table. These hashes are removed only when the `END` operations for the child blocks are executed. Thus, until both child blocks are executed, the block hash table is not cleared.
+
+### SPLIT block decoding
+
+When decoding a *split* block, the decoder pops an element off the top of the stack, and if the popped element is $1$, executes the block corresponding to the `true branch`. If the popped element is $0$, the decoder executes the block corresponding to the `false branch`. This is illustrated on the diagram below.
+
+![decoder_split_block_decoding](../../../../img/miden/vm/design/decoder/decoder_split_block_decoding.png)
+
+As described previously, when the VM executes a `SPLIT` operation, only the hash of the branch to be executed is added to the block hash table. Thus, until the child block corresponding to the required branch is executed, the block hash table is not cleared.
+
+### LOOP block decoding
+
+When decoding a *loop* block, we need to consider two possible scenarios:
+
+* When the top of the stack is $1$, we need to enter the loop and execute loop body at least once.
+* When the top of the stack is $0$, we need to skip the loop.
+
+In both cases, we need to pop an element off the top of the stack.
+
+#### Executing the loop
+
+If the top of the stack is $1$, the VM executes a `LOOP` operation. This removes the top element from the stack and adds the hash of the loop's body to the block hash table. It also adds a row to the block stack table setting the `is_loop` value to $1$.
+
+To clear the block hash table, the VM needs to execute the loop body (executing the `END` operation for the loop body block will remove the corresponding row from the block hash table). After loop body is executed, if the top of the stack is $1$, the VM executes a `REPEAT` operation (executing `REPEAT` operation when the top of the stack is $0$ will result in an error). This operation again adds the hash of the loop's body to the block hash table. Thus, the VM needs to execute the loop body again to clear the block hash table.
+
+This process is illustrated on the diagram below.
+
+![decoder_loop_execution](../../../../img/miden/vm/design/decoder/decoder_loop_execution.png)
+
+The above steps are repeated until the top of the stack becomes $0$, at which point the VM executes the `END` operation. Since in the beginning we set `is_loop` column in the block stack table to $1$, $h_5$ column will be set to $1$ when the `END` operation is executed. Thus, executing the `END` operation will also remove the top value from the stack. If the removed value is not $0$, the operation will fail. Thus, the VM can exit the loop block only when the top of the stack is $0$.
+
+#### Skipping the loop
+
+If the top of the stack is $0$, the VM still executes the `LOOP` operation. But unlike in the case when we need to enter the loop, the VM sets `is_loop` flag to $0$ in the block stack table, and does not add any rows to the block hash table. The last point means that the only possible operation to be executed after the `LOOP` operation is the `END` operation. This is illustrated in the diagram below.
+
+![decoder_loop_skipping](../../../../img/miden/vm/design/decoder/decoder_loop_skipping.png)
+
+Moreover, since we've set the `is_loop` flag to $0$, executing the `END` operation does not remove any items from the stack.
+
+### DYN block decoding
+
+When decoding a *dyn* block, the VM first executes a `DYN` operation, then executes the child block dynamically specified by the top of the stack. Once the child of the *dyn* block has been executed, the VM executes an `END` operation. This is illustrated in the diagram below.
+
+![decoder_dyn_block_decoding](../../../../img/miden/vm/design/decoder/decoder_dyn_block_decoding.png)
+
+As described previously, when the VM executes a `DYN` operation, the hash of the child is added to the block hash table. This hash is removed only when the `END` operation for the child block is executed. Thus, until the child block corresponding to the dynamically specified target is executed, the block hash table is not cleared.
+
+### SPAN block decoding
+
+As described [here](../programs.md#span-block), a *span* block can contain one or more operation batches, each batch containing up to $8$ operation groups. At the high level, decoding of a span block is done as follows:
+
+1. At the beginning of the block, we make a request to the hash chiplet which initiates the hasher, absorbs the first operation batch ($8$ field elements) into the hasher, and returns the row address of the hasher, which we use as the unique ID for the *span* block (see [here](#sequential-hash)).
+2. We then add groups of the operation batch, as specified by op batch flags (but always skipping the first one) to the op group table.
+3. We then remove operation groups from the op group table in the FIFO order one by one, and decode them in the manner similar to the one described [here](#operation-group-decoding).
+4. Once all operation groups in a batch have been decoded, we absorb the next batch into the hasher and repeat the process described above.
+5. Once all batches have been decoded, we return the hash of the span block from the hasher.
+
+Overall, three control flow operations are used when decoding a *span* block:
+
+1. `SPAN` operation is used to initialize a hasher and absorb the first operation batch into it.
+2. `RESPAN` operation is used to absorb any additional batches in the span block.
+3. `END` operation is used to end the decoding of a span block and retrieve its hash from the hash chiplet.
+
+#### Operation group decoding
+
+As described [here](../programs.md#span-block), an operation group is a sequence of operations which can be encoded into a single field element. For a field element of $64$ bits, we can fit up to $9$ operations into a group. We do this by concatenating binary representations of opcodes together with the first operation located in the least significant position.
+
+We can read opcodes from the group by simply subtracting them from the op group value and then dividing the result by $2^7$. Once the value of the op group reaches $0$, we know that all opcodes have been read. Graphically, this can be illustrated like so:
+
+![decoder_operation_group_decoding](../../../../img/miden/vm/design/decoder/decoder_operation_group_decoding.png)
+
+Notice that despite their appearance, `op bits` is actually $7$ separate registers, while `op group` is just a single register.
+
+We also need to make sure that at most $9$ operations are executed as a part of a single group. For this purpose we use the `op_index` column. Values in this column start out at $0$ for each operation group, and are incremented by $1$ for each executed operation. To make sure that at most $9$ operations can be executed in a group, the value of the `op_index` column is not allowed to exceed $8$.
+
+#### Operation batch flags
+
+Operation batch flags are used to specify how many operation groups comprise a given operation batch. For most batches, the number of groups will be equal to $8$. However, for the last batch in a block (or for the first batch, if the block consists of only a single batch), the number of groups may be less than $8$. Since processing of new batches starts only on `SPAN` and `RESPAN` operations, only for these operations the flags can be set to non-zero values.
+
+To simplify the constraint system, number of groups in a batch can be only one of the following values: $1$, $2$, $4$, and $8$. If number of groups in a batch does not match one of these values, the batch is simply padded with `NOOP`'s (one `NOOP` per added group). Consider the diagram below.
+
+![decoder_OPERATION_batch_flags](../../../../img/miden/vm/design/decoder/decoder_OPERATION_batch_flags.png)
+
+In the above, the batch contains $3$ operation groups. To bring the count up to $4$, we consider the $4$-th group (i.e., $0$) to be a part of the batch. Since a numeric value for `NOOP` operation is $0$, op group value of $0$ can be interpreted as a single `NOOP`.
+
+Operation batch flags (denoted as $c_0, c_1, c_2$), encode the number of groups and define how many groups are added to the op group table as follows:
+
+* `(1, 0, 0)` - $8$ groups. Groups in $h_1, ... h_7$ are added to the op group table.
+* `(0, 1, 0)` - $4$ groups. Groups in $h_1, ... h_3$ are added to the op group table.
+* `(0, 0, 1)` - $2$ groups. The group in $h_1$ is added to the op group table.
+* `(0, 1, 1)` - $1$ group. Nothing is added to the op group table.
+* `(0, 0, 0)` - not a `SPAN` or `RESPAN` operation.
+
+#### Single-batch span
+
+The simplest example of a *span* block is a block with a single batch. This batch may contain up to $8$ operation groups (e.g., $g_0, ..., g_7$). Decoding of such a block is illustrated in the diagram below.
+
+![decoder_single_batch_span](../../../../img/miden/vm/design/decoder/decoder_single_batch_span.png)
+
+Before the VM starts processing this *span* block, the prover populates registers $h_0, ..., h_7$ with operation groups $g_0, ..., g_7$. The prover also puts the total number of groups into the `group count` register $gc$. In this case, the total number of groups is $8$.
+
+When the VM executes a `SPAN` operation, it does the following:
+
+1. Initiates hashing of elements $g_0, ..., g_7$ using hash chiplet. The hasher address is used as the block ID `blk`, and it is inserted into `addr` register in the next row.
+2. Adds a tuple `(blk, prnt, 0)` to the block stack table.
+3. Sets the `in_span` register to $1$ in the next row.
+4. Sets the `op_index` register to $0$ in the next row.
+5. Decrements `group_count` register by $1$.
+6. Sets `op bits` registers at the next step to the first operation of $g_0$, and also copies $g_0$ with the first operation removed (denoted as $g_0'$) to the next row.
+7. Adds groups $g_1, ..., g_7$ to the op group table. Thus, after the `SPAN` operation is executed, op group table looks as shown below.
+
+![decoder_op_group_table_after_span_op](../../../../img/miden/vm/design/decoder/decoder_op_group_table_after_span_op.png)
+
+Then, with every step the next operation is removed from $g_0$, and by step $9$, value of $g_0$ is $0$. Once this happens, the VM does the following:
+
+1. Decrements `group_count` register by $1$.
+2. Sets `op bits` registers at the next step to the first operation of $g_1$.
+3. Sets `hasher` register $h_0$ to the value of $g_1$ with the first operation removed (denoted as $g_1'$).
+4. Removes row `(blk, 7, g1)` from the op group table. This row can be obtained by taking values from registers: `addr`, `group_count`, and $h_0' \cdot 2^7 + \displaystyle\sum_{i=0}^6(2^i \cdot b_i')$ for $i \in [0, 7)$, where $h_0'$ and $b_i'$ refer to values in the next row for the first hasher column and `op_bits` columns respectively (the factor $2^7$ restores the position of the remaining operations, since $g_1'$ is $g_1$ with its first $7$-bit opcode removed).
+
+Note that we rely on the `group_count` column to construct the row to be removed from the op group table. Since group count is decremented from the total number of groups to $0$, to remove groups from the op group table in correct order, we need to assign group position to groups in the op group table in the reverse order. For example, the first group to be removed should have position $7$, the second group to be removed should have position $6$ etc.
+
+Decoding of $g_1$ is performed in the same manner as decoding of $g_0$: with every subsequent step the next operation is removed from $g_1$ until its value reaches $0$, at which point, decoding of group $g_2$ begins.
+
+The above steps are executed until value of `group_count` reaches $0$. Once `group_count` reaches $0$ and the last operation group $g_7$ is executed, the VM executes the `END` operation. Semantics of the `END` operation are described [here](#end-operation).
+
+Notice that by the time we get to the `END` operation, all rows are removed from the op group table.
+
+#### Multi-batch span
+
+A *span* block may contain an unlimited number of operation batches. As mentioned previously, to absorb a new batch into the hasher, the VM executes a `RESPAN` operation. The diagram below illustrates decoding of a *span* block consisting of two operation batches.
+
+![decoder_multi_batch_span](../../../../img/miden/vm/design/decoder/decoder_multi_batch_span.png)
+
+Decoding of such a block will look very similar to decoding of the single-batch span block described previously, but there also will be some differences.
+
+First, after the `SPAN` operation is executed, the op group table will look as follows:
+
+![decoder_op_group_table_multi_span](../../../../img/miden/vm/design/decoder/decoder_op_group_table_multi_span.png)
+
+Notice that while the same groups ($g_1, ..., g_7$) are added to the table, their positions now reflect the total number of groups in the *span* block.
+
+Second, executing a `RESPAN` operation increments hasher address by $8$. This is done because absorbing additional $8$ elements into the hasher state requires $8$ more rows in the auxiliary hasher table.
+
+Incrementing value of `addr` register actually changes the ID of the *span* block (though, for a *span* block, it may be more appropriate to view values in this column as IDs of individual operation batches). This means that we also need to update the block stack table. Specifically, we need to remove row `(blk, prnt, 0)` from it, and replace it with row `(blk + 8, prnt, 0)`. To perform this operation, the prover sets the value of $h_1$ in the next row to `prnt`.
+
+Executing a `RESPAN` operation also adds groups $g_9, g_{10}, g_{11}$ to the op group table, which now would look as follows:
+
+![decoder_op_group_table_post_respan](../../../../img/miden/vm/design/decoder/decoder_op_group_table_post_respan.png)
+
+Then, the execution of the second batch proceeds in the manner similar to the first batch: we remove operations from the current op group, execute them, and when the value of the op group reaches $0$, we start executing the next group in the batch. Thus, by the time we get to the `END` operation, the op group table should be empty.
+
+When executing the `END` operation, the hash of the *span* block will be read from hasher row at address `addr + 7`, which, in our example, will be equal to `blk + 15`.
+
+#### Handling immediate values
+
+Miden VM operations can carry immediate values. Currently, the only such operation is a `PUSH` operation. Since immediate values can be thought of as constants embedded into program code, we need to make sure that changing immediate values affects program hash.
+
+To achieve this, we treat immediate values in a manner similar to how we treat operation groups. Specifically, when computing hash of a *span* block, immediate values are absorbed into the hasher state in the same way as operation groups are. As mentioned previously, an immediate value is represented by a single field element, and thus, an immediate value takes place of a single operation group.
+
+The diagram below illustrates decoding of a *span* block with $9$ operations one of which is a `PUSH` operation.
+
+![decoder_decoding_span_block_with_push](../../../../img/miden/vm/design/decoder/decoder_decoding_span_block_with_push.png)
+
+In the above, when the `SPAN` operation is executed, immediate value `imm0` will be added to the op group table, which will look as follows:
+
+![decoder_imm_vale_op_group_table](../../../../img/miden/vm/design/decoder/decoder_imm_vale_op_group_table.png)
+
+Then, when the `PUSH` operation is executed, the VM will do the following:
+
+1. Decrement `group_count` by $1$.
+2. Remove a row from the op group table equal to `(addr, group_count, s0')`, where $s_0'$ is the value of the top of the stack at the next row (i.e., it is the value that is pushed onto the stack).
+
+Thus, after the `PUSH` operation is executed, the op group table is cleared, and group count decreases to $0$ (which means that there are no more op groups to execute). Decoding of the rest of the op group proceeds as described in the previous sections.
+
+## Program decoding example
+
+Let's run through an example of decoding a simple program shown previously:
+
+```
+begin
+    <operations1>
+    if.true
+        <operations2>
+    else
+        <operations3>
+    end
+end
+```
+
+Translating this into code blocks with IDs assigned, we get the following:
+
+```
+b0: JOIN
+    b1: SPAN
+        <operations1>
+    b1: END
+    b2: SPLIT
+        b3: SPAN
+            <operations2>
+        b3: END
+        b4: SPAN
+            <operations3>
+        b4: END
+    b2: END
+b0: END
+```
+
+The root of the program is a *join* block $b_0$. This block contains two children: a *span* block $b_1$ and a *split* block $b_2$. In turn, the *split* block $b_2$ contains two children: a *span* block $b_3$ and a *span* block $b_4$.
+
+When this program is executed on the VM, the following happens:
+
+1. Before the program starts executing, block hash table is initialized with a single row containing the hash of $b_0$.
+2. Then, `JOIN` operation for $b_0$ is executed. It adds hashes of $b_1$ and $b_2$ to the block hash table. It also adds an entry for $b_0$ to the block stack table. States of both tables after this step are illustrated below.
+3. Then, *span* $b_1$ is executed and a sequential hash of its operations is computed. Also, when `SPAN` operation for $b_1$ is executed, an entry for $b_1$ is added to the block stack table. At the end of $b_1$ (when `END` is executed), entries for $b_1$ are removed from both the block hash and block stack tables.
+4. Then, `SPLIT` operation for $b_2$ is executed. It adds an entry for $b_2$ to the block stack table. Also, depending on whether the top of the stack is $1$ or $0$, either hash of $b_3$ or hash of $b_4$ is added to the block hash table. Let's say the top of the stack is $1$. Then, at this point, the block hash and block stack tables will look like in the second picture below.
+5. Then, *span* $b_3$ is executed and a sequential hash of its instructions is computed. Also, when `SPAN` operation for $b_3$ is executed, an entry for $b_3$ is added to the block stack table. At the end of $b_3$ (when `END` is executed), entries for $b_3$ are removed from both the block hash and block stack tables.
+6. Then, `END` operation for $b_2$ is executed. It removes the hash of $b_2$ from the block hash table, and also removes the entry for $b_2$ from the block stack table. The third picture below illustrates the states of block stack and block hash tables after this step.
+7. Then, `END` for $b_0$ is executed, which removes entries for $b_0$ from the block stack and block hash tables. At this point both tables are empty.
+8. Finally, a sequence of `HALT` operations is executed until the length of the trace reaches a power of two.
+
+States of block hash and block stack tables after step 2:
+![decoder_state_block_hash_2](../../../../img/miden/vm/design/decoder/decoder_state_block_hash_2.png)
+
+States of block hash and block stack tables after step 4:
+![decoder_state_block_hash_4](../../../../img/miden/vm/design/decoder/decoder_state_block_hash_4.png)
+
+States of block hash and block stack tables after step 6:
+![decoder_state_block_hash_6](../../../../img/miden/vm/design/decoder/decoder_state_block_hash_6.png)
diff --git a/docs/miden/vm/design/index.md b/docs/miden/vm/design/index.md
new file mode 100644
index 000000000..1347e41ae
--- /dev/null
+++ b/docs/miden/vm/design/index.md
@@ -0,0 +1,53 @@
+In the following sections, we provide in-depth descriptions of Miden VM internals, including all AIR constraints for the proving system. We also provide rationale for making specific design choices.
+
+Throughout these sections we adopt the following notations and assumptions:
+
+* All arithmetic operations, unless noted otherwise, are assumed to be in a prime field with modulus $p = 2^{64} - 2^{32} + 1$.
+* A _binary_ value means a field element which is either $0$ or $1$.
+* We use lowercase letters to refer to individual field elements (e.g., $a$), and uppercase letters to refer to groups of $4$ elements, also referred to as words (e.g., $A$). To refer to individual elements within a word, we use numerical subscripts. For example, $a_0$ is the first element of word $A$, $b_3$ is the last element of word $B$, etc.
+* When describing AIR constraints:
+  - For a column $x$, we denote the value in the current row simply as $x$, and the value in the next row of the column as $x'$. Thus, all transition constraints for Miden VM work with two consecutive rows of the execution trace.
+  - For multiset equality constraints, we denote random values sent by the verifier after the prover commits to the main execution trace as $\alpha_0, \alpha_1, \alpha_2$ etc.
+  - To differentiate constraints from other formulas, we frequently use the following format for constraint equations.
+
+$$
+x' - (x + y) = 0 \text{ | degree} = 1
+$$
+
+In the above, the constraint equation is followed by the implied algebraic degree of the constraint. This degree is determined by the number of multiplications between trace columns. If a constraint does not involve any multiplications between columns, its degree is $1$. If a constraint involves multiplication between two columns, its degree is $2$. If we need to multiply three columns together, the degree is $3$, etc.
+
+The maximum allowed constraint degree in Miden VM is $9$. If a constraint degree grows beyond that, we frequently need to introduce additional columns to reduce the degree.
+
+## VM components
+Miden VM consists of several interconnected components, each providing a specific set of functionality. These components are:
+
+* **System**, which is responsible for managing system data, including the current VM cycle (`clk`), the free memory pointer (`fmp`) used for specifying the region of memory available to procedure locals, and the current and parent execution contexts.
+* **Program decoder**, which is responsible for computing a commitment to the executing program and converting the program into a sequence of operations executed by the VM.
+* **Operand stack**, which is a push-down stack which provides operands for all operations executed by the VM.
+* **Range checker**, which is responsible for providing 16-bit range checks needed by other components.
+* **Chiplets**, which is a set of specialized circuits used to accelerate commonly-used complex computations. Currently, the VM relies on 4 chiplets:
+  - Hash chiplet, used to compute Rescue Prime Optimized hashes both for sequential hashing and for Merkle tree hashing.
+  - Bitwise chiplet, used to compute bitwise operations (e.g., `AND`, `XOR`) over 32-bit integers.
+  - Memory chiplet, used to support random-access memory in the VM.
+  - Kernel ROM chiplet, used to enable calling predefined kernel procedures which are provided before execution begins.
+
+The above components are connected via **buses**, which are implemented using [lookup arguments](./lookups/index.md). We also use [multiset check lookups](./lookups/multiset.md) internally within components to describe **virtual tables**.
+
+## VM execution trace
+
+The execution trace of Miden VM consists of $71$ main trace columns, $2$ buses, and $5$ virtual tables, as shown in the diagram below.
+
+![vm_trace.png](../../../img/miden/vm/design/vm_trace.png)
+
+As can be seen from the above, the system, decoder, stack, and range checker components use dedicated sets of columns, while all chiplets share the same $17$ columns. To differentiate between chiplets, we use a set of binary selector columns, a combination of which uniquely identifies each chiplet.
+
+The system component does not yet have a dedicated documentation section, since the design is likely to change. However, the following two columns are not expected to change:
+
+* `clk` which is used to keep track of the current VM cycle. Values in this column start out at $0$ and are incremented by $1$ with each cycle.
+* `fmp` which contains the value of the free memory pointer used for specifying the region of memory available to procedure locals.
+
+AIR constraints for the `fmp` column are described in the [system operations](./stack/system-ops.md) section. For the `clk` column, the constraints are straightforward:
+
+$$
+clk' - (clk + 1) = 0 \text{ | degree} = 1
+$$
\ No newline at end of file
diff --git a/docs/miden/vm/design/lookups/index.md b/docs/miden/vm/design/lookups/index.md
new file mode 100644
index 000000000..d73a293e6
--- /dev/null
+++ b/docs/miden/vm/design/lookups/index.md
@@ -0,0 +1,54 @@
+Zero knowledge virtual machines frequently make use of lookup arguments to enable performance optimizations. Miden VM uses two types of arguments: multiset checks and a multivariate lookup based on logarithmic derivatives known as LogUp. A brief introduction to multiset checks can be found [here](https://hackmd.io/@arielg/ByFgSDA7D). The description of LogUp can be found [here](https://eprint.iacr.org/2022/1530.pdf).
+
+In Miden VM, lookup arguments are used for two purposes:
+
+1. To prove the consistency of intermediate values that must persist between different cycles of the trace without storing the full data in the execution trace (which would require adding more columns to the trace).
+2. To prove correct interaction between two independent sections of the execution trace, e.g., between the main trace where the result of some operation is required, but would be expensive to compute, and a specialized component which can perform that operation cheaply.
+
+The first is achieved using [virtual tables](#virtual-tables-in-miden-vm) of data, where we add a row at some cycle in the trace and remove it at a later cycle when it is needed again. Instead of maintaining the entire table in the execution trace, multiset checks allow us to prove data consistency of this table using one running product column.
+
+The second is done by reducing each operation to a lookup value and then using a [communication bus](#communication-buses-in-miden-vm) to provably connect the two sections of the trace. These communication buses can be implemented either via [multiset checks](multiset.md#communication-buses-via-multiset-checks) or via the [LogUp argument](logup.md).
+
+
+## Virtual tables in Miden VM
+
+Miden VM makes use of 6 virtual tables across 4 components, all of which are implemented via [multiset checks](multiset.md#virtual-tables):
+
+- Stack:
+    - [Overflow table](../stack/index.md#overflow-table)
+- Decoder:
+    - [Block stack table](../decoder/index.md#block-stack-table)
+    - [Block hash table](../decoder/index.md#block-hash-table)
+    - [Op group table](../decoder/index.md#op-group-table)
+- Chiplets:
+    - [Chiplets virtual table](../chiplets/index.md#chiplets-virtual-table), which combines the following two tables into one:
+        - [Hash chiplet sibling table](../chiplets/hasher.md#sibling-table-constraints)
+        - [Kernel ROM chiplet procedure table](../chiplets/kernel_rom.md#kernel-procedure-table-constraints)
+
+## Communication buses in Miden VM
+
+One strategy for improving the efficiency of a zero knowledge virtual machine is to use specialized components for complex operations and have the main circuit “offload” those operations to the corresponding components by specifying inputs and outputs and allowing the proof of execution to be done by the dedicated component instead of by the main circuit.
+
+These specialized components are designed to prove the internal correctness of the execution of the operations they support. However, in isolation they cannot make any guarantees about the source of the input data or the destination of the output data.
+
+In order to prove that the inputs and outputs specified by the main circuit match the inputs and outputs provably executed in the specialized component, some kind of provable communication bus is needed.
+
+This bus is typically implemented as some kind of lookup argument, and in Miden VM in particular we use multiset checks or LogUp.
+
+Miden VM uses 2 communication buses:
+
+- The chiplets bus [$b_{chip}$](../chiplets/index.md#chiplets-bus), which communicates with all of the chiplets (Hash, Bitwise, Memory, and Kernel ROM). It is implemented using multiset checks.
+- The range checker bus [$b_{range}$](../range.md#communication-bus), which facilitates requests between the [stack](../stack/u32-ops.md) and [memory](../chiplets/memory.md) components and the [range checker](../range.md). It is implemented using LogUp.
+
+
+## Length of auxiliary columns for lookup arguments
+
+The auxiliary columns used for buses and virtual tables are computed by including information from the *current* row of the main execution trace into the *next* row of the auxiliary trace column. Thus, in order to ensure that the trace is long enough to give the auxiliary column space for its final value, a padding row may be required at the end of the trace of the component upon which the auxiliary column depends.
+
+This is true when the data in the main trace could go all the way to the end of the trace, such as in the case of the range checker.
+
+## Cost of auxiliary columns for lookup arguments
+
+It is important to note that depending on the field in which we operate, an auxiliary column implementing a lookup argument may actually require more than one trace column. This is specifically true for small fields.
+
+Since Miden uses a 64-bit field, each auxiliary column needs to be represented by $2$ columns to achieve ~100-bit security and by $3$ columns to achieve ~128-bit security.
\ No newline at end of file
diff --git a/docs/miden/vm/design/lookups/logup.md b/docs/miden/vm/design/lookups/logup.md
new file mode 100644
index 000000000..5f6c971b1
--- /dev/null
+++ b/docs/miden/vm/design/lookups/logup.md
@@ -0,0 +1,66 @@
+## Multivariate lookups with logarithmic derivatives
+
+The description of LogUp can be found [here](https://eprint.iacr.org/2022/1530.pdf). In Miden VM, LogUp is used to implement efficient [communication buses](./index.md#communication-buses-in-miden-vm).
+
+Using the LogUp construction instead of a simple [multiset check](./multiset.md) with running products reduces the computational effort for the prover and the verifier. Given two columns $a$ and $b$ in the main trace where $a$ contains duplicates and $b$ does not (i.e. $b$ is part of the lookup table), LogUp allows us to compute two logarithmic derivatives and check their equality.
+
+$$
+\sum_{i=0}^{l-1} \frac{1}{(\alpha - a_i)} = \sum_{i=0}^{n-1} \frac{m_i}{(\alpha - b_i)}
+$$
+
+In the above:
+
+- $l$ is the number of values in $a$, which must be smaller than the size of the field. (The prime field used for Miden VM has modulus $p = 2^{64} - 2^{32} + 1$, so $l < p$ must be true.)
+- $n$ is the number of values in $b$, which must be smaller than the size of the field. ($n < p$, for Miden VM)
+- $m_i$ is the multiplicity of $b_i$, which is expected to match the number of times the value $b_i$ is duplicated in column $a$. It must be smaller than the size of the set of lookup values. ($m_i < n$)
+- $\alpha$ is a random value that is sent to the prover by the verifier after the prover commits to the execution trace of the program.
+
+Thus, instead of needing to compute running products, we are able to assert correct lookups by computing running sums.
+
+## Usage in Miden VM
+
+The generalized trace columns and constraints for this construction are as follows, where component $X$ is some component in the trace and lookup table $T$ contains the values $v$ which need to be looked up from $X$ and how many times they are looked up (the multiplicity $m$).
+
+![logup_component_x](../../../../img/miden/vm/design/lookups/logup_component.png)
+
+![logup_table_t](../../../../img/miden/vm/design/lookups/logup_table.png)
+
+### Constraints
+
+The diagrams above show running sum columns for computing the logarithmic derivatives for both $X$ and $T$. As an optimization, we can combine these values into a single auxiliary column in the extension field that contains the running sum of values from both logarithmic derivatives. We'll refer to this column as a _communication bus_ $b$, since it communicates the lookup request from the component $X$ to the lookup table $T$. 
+
+This can be expressed as follows:
+
+> $$
+b' = b + \frac{m}{(\alpha - v)} - \frac{1}{(\alpha - x)}
+$$
+
+Since constraints must be expressed without division, the actual constraint which is enforced will be the following:
+
+> $$
+b' \cdot (\alpha - v) \cdot (\alpha - x) = b \cdot (\alpha - x) \cdot (\alpha - v) + m \cdot (\alpha - x) - (\alpha - v) \text{ | degree} = 3
+$$
+
+In general, we will write constraints within these docs using the previous form, since it's clearer and more readable.
+
+Additionally, boundary constraints must be enforced against $b$ to ensure that its initial and final values are $1$. This will enforce that the logarithmic derivatives for $X$ and $T$ were equal.
+
+### Extending the construction to multiple components
+
+The functionality of the bus can easily be extended to receive lookup requests from multiple components. For example, to additionally support requests from column $y$, the bus constraint would be modified to the following:
+
+> $$
+b' = b + \frac{m}{(\alpha - v)} - \frac{1}{(\alpha - x)} - \frac{1}{(\alpha - y)} \text{ | degree} = 4
+$$
+
+Since the maximum constraint degree in Miden VM is 9, the lookup table $T$ could accommodate requests from at most 7 trace columns in the same trace row via this construction.
+
+### Extending the construction with flags
+
+Boolean flags can also be used to determine when requests from various components are sent to the bus. For example, let $f_x$ be 1 when a request should be sent from $x$ and 0 otherwise, and let $f_y$ be similarly defined for column $y$. We can use the following constraint to turn requests on or off:
+
+> $$
+b' = b + \frac{m}{(\alpha - v)} - \frac{f_x}{(\alpha - x)} - \frac{f_y}{(\alpha - y)} \text{ | degree} = 4
+$$
+
+If any of these flags have degree greater than 2 then this will increase the overall degree of the constraint and reduce the number of lookup requests that can be accommodated by the bus per row.
\ No newline at end of file
diff --git a/docs/miden/vm/design/lookups/multiset.md b/docs/miden/vm/design/lookups/multiset.md
new file mode 100644
index 000000000..cd220bd7a
--- /dev/null
+++ b/docs/miden/vm/design/lookups/multiset.md
@@ -0,0 +1,107 @@
+# Multiset checks
+
+A brief introduction to multiset checks can be found [here](https://hackmd.io/@arielg/ByFgSDA7D). In Miden VM, multiset checks are used to implement [virtual tables](#virtual-tables) and efficient [communication buses](index.md#communication-buses-in-miden-vm).
+
+## Running product columns
+
+Although the multiset equality check can be thought of as comparing multiset equality between two vectors $a$ and $b$, in Miden VM it is implemented as a single running product column in the following way:
+
+- The running product column is initialized to a value $x$ at the beginning of the trace. (We typically use $x = 1$.)
+- All values of $a$ are multiplied into the running product column.
+- All values of $b$ are divided out of the running product column.
+- If $a$ and $b$ were multiset equal, then the running product column will equal $x$ at the end of the trace.
+
+Running product columns are computed using a set of random values $\alpha_0$, $\alpha_1, ...$ sent to the prover by the verifier after the prover commits to the execution trace of the program.
+
+## Virtual tables
+
+Virtual tables can be used to store intermediate data which is computed at one cycle and used at a different cycle. When the data is computed, the row is added to the table, and when it is used later, the row is deleted from the table. Thus, all that needs to be proved is the data consistency between the row that was added and the row that was deleted.
+
+The consistency of a virtual table can be proved with a single trace column $p$, which keeps a running product of rows that were inserted into and deleted from the table. This is done by reducing each row to a single value, multiplying the value into $p$ when the row is inserted, and dividing the value out of $p$ when the row is removed. Thus, at any step of the computation, $p$ will contain a product of all rows currently in the table.
+
+The initial value of $p$ is set to 1. Thus, if the table is empty by the time Miden VM finishes executing a program (we added and then removed exactly the same set of rows), the final value of $p$ will also be equal to 1. The initial and final values are enforced via boundary constraints.
+
+### Computing a virtual table's trace column
+
+To compute a product of rows, we'll first need to reduce each row to a single value. This can be done as follows.
+
+Let $t_0, t_1, t_2, ...$ be columns in the virtual table, and assume the verifier sends a set of random values $\alpha_0$, $\alpha_1, ...$ to the prover after the prover commits to the execution trace of the program.
+
+The prover reduces row $i$ in the table to a single value $r_i$ as:
+
+$$
+r_i = \alpha_0 + \alpha_1 \cdot t_{0, i} + \alpha_2 \cdot t_{1, i} + \alpha_3 \cdot t_{2, i} + ...
+$$
+
+Then, when row $i$ is added to the table, we'll update the value in the $p$ column like so:
+
+$$
+p' = p \cdot r_i
+$$
+
+Analogously, when row $i$ is removed from the table, we'll update the value in column $p$ like so:
+
+$$
+p' = \frac{p}{r_i}
+$$
+
+### Virtual tables in Miden VM
+
+Miden VM makes use of 6 virtual tables across 4 components:
+
+- Stack:
+    - [Overflow table](../stack/index.md#overflow-table)
+- Decoder:
+    - [Block stack table](../decoder/index.md#block-stack-table)
+    - [Block hash table](../decoder/index.md#block-hash-table)
+    - [Op group table](../decoder/index.md#op-group-table)
+- Chiplets:
+    - [Chiplets virtual table](../chiplets/index.md#chiplets-virtual-table), which combines the following two tables into one:
+        - [Hash chiplet sibling table](../chiplets/hasher.md#sibling-table-constraints)
+        - [Kernel ROM chiplet procedure table](../chiplets/kernel_rom.md#kernel-procedure-table-constraints)
+
+## Communication buses via multiset checks
+
+A `bus` can be implemented as a single trace column $b$ where a request can be sent to a specific component and a corresponding response will be sent back by that component.
+
+The values in this column contain a running product of the communication with the component as follows:
+
+- Each request is “sent” by computing a lookup value from some information that's specific to the specialized component, the operation inputs, and the operation outputs, and then dividing it out of the running product column $b$.
+- Each chiplet response is “sent” by computing the same lookup value from the component-specific information, inputs, and outputs, and then multiplying it into the running product column $b$.
+
+Thus, if the requests and responses match, and the bus column $b$ is initialized to $1$, then $b$ will start and end with the value $1$. This condition is enforced by boundary constraints on column $b$.
+
+Note that the order of the requests and responses does not matter, as long as they are all included in $b$. In fact, requests and responses for the same operation will generally occur at different cycles. Additionally, there could be multiple requests sent in the same cycle, and there could also be a response provided at the same cycle that a request is received.
+
+### Communication bus constraints
+
+These constraints can be expressed in a general way with the following 2 requirements:
+
+- The lookup value must be computed using random values $\alpha_0, \alpha_1$, etc. that are provided by the verifier after the prover has committed to the main execution trace.
+- The lookup value must include all uniquely identifying information for the component/operation and its inputs and outputs.
+
+Given an example operation $op_{ex}$ with inputs $i_0, ..., i_n$ and outputs $o_0, ..., o_m$, the lookup value can be computed as follows:
+
+$$lookup = \alpha_0 + \alpha_1 \cdot op_{ex} + \alpha_2 \cdot i_0 + ... + \alpha_{n+2} \cdot i_n + \alpha_{n+3} \cdot o_0 + ... + \alpha_{n + 3 + m} \cdot o_m$$
+
+The constraint for sending this to the bus as a request would be:
+
+$$b' \cdot lookup = b$$
+
+The constraint for sending this to the bus as a response would be:
+
+$$b' = b \cdot lookup$$
+
+However, these constraints must be combined, since it's possible that requests and responses both occur during the same cycle.
+
+To combine them, let $u_{lookup}$ be the request value and let $v_{lookup}$ be the response value. These values are both computed the same way as shown above, but the data sources are different, since the input/output values used to compute $u_{lookup}$ come from the trace of the component that's "offloading" the computation, while the input/output values used to compute $v_{lookup}$ come from the trace of the specialized component.
+
+The final constraint can be expressed as:
+
+$$b' \cdot u_{lookup} = b \cdot v_{lookup}$$
+
+### Communication buses in Miden VM
+
+In Miden VM, the specialized components are implemented as dedicated segments of the execution trace, which include the 4 chiplets in the Chiplets module (the hash chiplet, bitwise chiplet, memory chiplet, and kernel ROM chiplet).
+
+Miden VM currently uses multiset checks to implement the chiplets bus [$b_{chip}$](../chiplets/index.md#chiplets-bus), which communicates with all of the chiplets (Hash, Bitwise, Memory, and Kernel ROM).
diff --git a/docs/miden/vm/design/programs.md b/docs/miden/vm/design/programs.md
new file mode 100644
index 000000000..d1a6ee45e
--- /dev/null
+++ b/docs/miden/vm/design/programs.md
@@ -0,0 +1,130 @@
+Miden VM consumes programs in the form of a Merkelized Abstract Syntax Tree (MAST). This tree is a binary tree where each node is a *code block*. The VM starts execution at the root of the tree, and attempts to recursively execute each required block according to its semantics. If the execution of a code block fails, the VM halts at that point and no further blocks are executed. A set of currently available blocks and their execution semantics are described below.
+
+## Code blocks
+
+### Join block
+
+A **join** block is used to describe sequential execution. When the VM encounters a *join* block, it executes its left child first, and then executes its right child.
+
+![join_block](../../../img/miden/vm/design/programs/join_block.png)
+
+A *join* block must always have two children, and thus, cannot be a leaf node in the tree.
+
+### Split block
+A **split** block is used to describe conditional execution. When the VM encounters a *split* block, it checks the top of the stack. If the top of the stack is $1$, it executes the left child, if the top of the stack is $0$, it executes the right child. If the top of the stack is neither $0$ nor $1$, the execution fails.
+
+![split_block](../../../img/miden/vm/design/programs/split_block.png)
+
+A *split* block must always have two children, and thus, cannot be a leaf node in the tree.
+
+### Loop block
+A **loop** block is used to describe condition-based iterative execution. When the VM encounters a *loop* block, it checks the top of the stack. If the top of the stack is $1$, it executes the loop body, if the top of the stack is $0$, the block is not executed. If the top of the stack is neither $0$ nor $1$, the execution fails.
+
+After the body of the loop is executed, the VM checks the top of the stack again. If the top of the stack is $1$, the body is executed again, if the top of the stack is $0$, the loop is exited. If the top of the stack is neither $0$ nor $1$, the execution fails.
+
+![loop_block](../../../img/miden/vm/design/programs/loop_block.png)
+
+A *loop* block must always have one child, and thus, cannot be a leaf node in the tree.
+
+### Dyn block
+A **dyn** block is used to describe a node whose target is specified dynamically via the stack. When the VM encounters a *dyn* block, it executes a program which hashes to the target specified by the top of the stack. Thus, it has a dynamic target rather than a hardcoded target. In order to execute a *dyn* block, the VM must be aware of a program with the hash value that is specified by the top of the stack. Otherwise, the execution fails.
+
+![dyn_block](../../../img/miden/vm/design/programs/dyn_block.png)
+
+A *dyn* block must always have one (dynamically-specified) child. Thus, it cannot be a leaf node in the tree.
+
+### Call block
+
+A **call** block is used to describe a function call which is executed in a [user context](../user-docs/assembly/execution-contexts.md). When the VM encounters a *call* block, it creates a new user context, then executes a program which hashes to the target specified by the *call* block in the new context. Thus, in order to execute a *call* block, the VM must be aware of a program with the specified hash. Otherwise, the execution fails. At the end of the *call* block, execution returns to the previous context.
+
+
+When executing a *call* block, the VM does the following:
+1. Checks if a *syscall* is already being executed and fails if so.
+2. Sets the depth of the stack to 16.
+3. Upon return, checks that the depth of the stack is 16. If so, the original stack depth is restored. Otherwise, an error occurs.
+
+![call_block](../../../img/miden/vm/design/programs/call_block.png)
+
+A *call* block does not have any children. Thus, it must be a leaf node in the tree.
+
+### Syscall block
+
+A **syscall** block is used to describe a function call which is executed in the [root context](../user-docs/assembly/execution-contexts.md). When the VM encounters a *syscall* block, it returns to the root context, then executes a program which hashes to the target specified by the *syscall* block. Thus, in order to execute a *syscall* block, the VM must be aware of a program with the specified hash, and that program must belong to the kernel against which the code is compiled. Otherwise, the execution fails. At the end of the *syscall* block, execution returns to the previous context.
+
+When executing a *syscall* block, the VM does the following:
+1. Checks if a *syscall* is already being executed and fails if so.
+2. Sets the depth of the stack to 16.
+3. Upon return, checks that the depth of the stack is 16. If so, the original stack depth is restored. Otherwise, an error occurs.
+
+![syscall_block](../../../img/miden/vm/design/programs/syscall_block.png)
+
+A *syscall* block does not have any children. Thus, it must be a leaf node in the tree.
+
+### Span block
+A **span** block is used to describe a linear sequence of operations. When the VM encounters a *span* block, it breaks the sequence of operations into batches and groups according to the following rules:
+* A group is represented by a single field element. Thus, assuming a single operation can be encoded using 7 bits, and assuming we are using a 64-bit field, a single group may encode up to 9 operations or a single immediate value.
+* A batch is a set of groups which can be absorbed by a hash function used by the VM in a single permutation. For example, assuming the hash function can absorb up to 8 field elements in a single permutation, a single batch may contain up to 8 groups.
+* There is no limit on the number of batches contained within a single span.
+
+Thus, for example, executing 8 pushes in a row will result in two operation batches as illustrated in the picture below:
+
+![span_block_creation](../../../img/miden/vm/design/programs/span_block_creation.png)
+
+* The first batch will contain 8 groups, with the first group containing 7 `PUSH` opcodes and 1 `NOOP`, and the remaining 7 groups containing immediate values for each of the push operations. The reason for the `NOOP` is explained later in this section.
+* The second batch will contain 2 groups, with the first group containing 1 `PUSH` opcode and 1 `NOOP`, and the second group containing the immediate value for the last push operation.
+
+
+If a sequence of operations does not have any operations which carry immediate values, up to 72 operations can fit into a single batch.
+
+From the user's perspective, all operations are executed in order, however, the VM may insert occasional `NOOP`s to ensure proper alignment of all operations in the sequence. Currently, the alignment requirements are as follows:
+* An operation carrying an immediate value cannot be the last operation in a group. Thus, for example, if a `PUSH` operation is the last operation in a group, the VM will insert a `NOOP` after it.
+
+A *span* block does not have any children, and thus, must be a leaf node in the tree.
+
+## Program example
+Consider the following program, where $a_0, ..., a_i$, $b_0, ..., b_j$ etc. represent individual operations:
+
+```
+a_0, ..., a_i
+if.true
+    b_0, ..., b_j
+else
+    c_0, ..., c_k
+    while.true
+        d_0, ..., d_n
+    end
+    e_0, ..., e_m
+end
+f_0, ..., f_l
+```
+
+A MAST for this program would look as follows:
+
+![mast_of_program](../../../img/miden/vm/design/programs/mast_of_program.png)
+
+Execution of this program would proceed as follows:
+
+1. The VM will start execution at the root of the program which is block $B_5$.
+2. Since $B_5$ is a *join block*, the VM will attempt to execute block $B_4$ first, and only after that execute block $f$.
+3. Block $B_4$ is also a *join block*, and thus, the VM will execute block $a$ by executing operations $a_0, ..., a_i$ in sequence, and then execute block $B_3$.
+4. Block $B_3$ is a *split block*, and thus, the VM will pop the value off the top of the stack. If the popped value is $1$, operations from block $b$ will be executed in sequence. If the popped value is $0$, then the VM will attempt to execute block $B_2$.
+5. $B_2$ is a *join block*, thus, the VM will try to execute block $B_1$ first, and then execute operations from block $e$.
+6. Block $B_1$ is also a *join block*, and thus, the VM will first execute all operations in block $c$, and then will attempt to execute block $B_0$.
+7. Block $B_0$ is a *loop block*, thus, the VM will pop the value off the top of the stack. If the popped value is $1$, the VM will execute the body of the loop defined by block $d$. If the popped value is $0$, the VM will not execute block $d$ and instead will move up the tree executing first block $e$, then $f$.
+8. If the VM does enter the loop, then after operation $d_n$ is executed, the VM will pop the value off the top of the stack again. If the popped value is $1$, the VM will execute block $d$ again, and again until the top of the stack becomes $0$. Once the top of the stack becomes $0$, the VM will exit the loop and will move up the tree executing first block $e$, then $f$.
+
+## Program hash computation
+Every Miden VM program can be reduced to a unique hash value. Specifically, it is infeasible to find two Miden VM programs with distinct semantics which hash to the same value. Padding a program with `NOOP`s does not change a program's execution semantics, and thus, programs which differ only in the number and/or placement of `NOOP`s may hash to the same value, although in most cases padding with `NOOP` should not affect program hash.
+
+To prevent program hash collisions we implement domain separation across the variants of control blocks. We define the domain value to be the opcode of the operation that initializes the control block.
+
+Below we denote $hash$ to be an arithmetization-friendly hash function with $4$-element output and capable of absorbing $8$ elements in a single permutation. The hash domain is specified as the subscript of the hash function and its value is used to populate the second capacity register upon initialization of control block hashing - $hash_{domain}(a, b)$.
+
+* The hash of a **join** block is computed as $hash_{join}(a, b)$, where $a$ and $b$ are hashes of the code block being joined.
+* The hash of a **split** block is computed as $hash_{split}(a, b)$, where $a$ is a hash of a code block corresponding to the *true* branch of execution, and $b$ is a hash of a code block corresponding to the *false* branch of execution.
+* The hash of a **loop** block is computed as $hash_{loop}(a, 0)$, where $a$ is a hash of a code block corresponding to the loop body.
+* The hash of a **dyn** block is set to a constant, so it is the same for all *dyn* blocks. It does not depend on the hash of the dynamic child. This constant is computed as the RPO hash of two empty words (`[ZERO, ZERO, ZERO, ZERO]`) using a domain value of `DYN_DOMAIN`, where `DYN_DOMAIN` is the op code of the `Dyn` operation.
+* The hash of a **call** block is computed as $hash_{call}(a, 0)$, where $a$ is a hash of a program of which the VM is aware.
+* The hash of a **syscall** block is computed as $hash_{syscall}(a, 0)$, where $a$ is a hash of a program belonging to the kernel against which the code was compiled.
+* The hash of a **span** block is computed as $hash(a_1, ..., a_k)$, where $a_i$ is the $i$th batch of operations in the *span* block. Each batch of operations is defined as containing $8$ field elements, and thus, hashing a $k$-batch *span* block requires $k$ absorption steps.
+    * In cases when the number of operations is insufficient to fill the last batch entirely, `NOOPs` are appended to the end of the last batch to ensure that the number of operations in the batch is always equal to $8$.
diff --git a/docs/miden/vm/design/range.md b/docs/miden/vm/design/range.md
new file mode 100644
index 000000000..946e806f5
--- /dev/null
+++ b/docs/miden/vm/design/range.md
@@ -0,0 +1,173 @@
+Miden VM relies very heavily on 16-bit range-checks (checking if a value of a field element is between $0$ and $2^{16} - 1$, inclusive). For example, most of the [u32 operations](stack/u32-ops.md) need to perform between two and four 16-bit range-checks per operation. Similarly, operations involving memory (e.g. load and store) require two 16-bit range-checks per operation.
+
+Thus, it is very important for the VM to be able to perform a large number of 16-bit range checks very efficiently. In this note we describe how this can be achieved using the [LogUp](lookups/logup.md) lookup argument.
+
+## 8-bit range checks
+
+First, let's define a construction for the simplest possible 8-bit range-check. This can be done with a single column as illustrated below.
+
+![rc_8_bit_range_check](../../../img/miden/vm/design/range/rc_8_bit_range_check.png)
+
+For this to work as a range-check we need to enforce a few constraints on this column:
+
+- The value in the first row must be $0$.
+- The value in the last row must be $255$.
+- As we move from one row to the next, we can either keep the value the same or increment it by $1$.
+
+Denoting $v$ as the value of column $v$ in the current row, and $v'$ as the value of column $v$ in the next row, we can enforce the last condition as follows:
+
+$$
+(v' - v) \cdot (v' - v - 1) = 0
+$$
+
+Together, these constraints guarantee that all values in column $v$ are between $0$ and $255$ (inclusive).
+
+We can then make use of the LogUp lookup argument by adding another column $b$ which will keep a running sum that is the logarithmic derivative of the product of values in the $v$ column. The transition constraint for $b$ would look as follows:
+
+$$
+b' = b + \frac{1}{(\alpha - v)}
+$$
+
+Since constraints cannot include divisions, the constraint would actually be expressed as the following degree 2 constraint:
+
+$$
+b' \cdot (\alpha - v) = b \cdot (\alpha - v) + 1
+$$
+
+Using these two columns we can check if some other column in the execution trace is a permutation of values in $v$. Let's call this other column $x$. We can compute the logarithmic derivative for $x$ as a running sum in the same way as we compute it for $v$. Then, we can check that the last value in $b$ is the same as the final value for the running sum of $x$.
+
+While this approach works, it has a couple of limitations:
+
+- First, column $v$ must contain all values between $0$ and $255$. Thus, if column $x$ does not contain one of these values, we need to artificially add this value to $x$ somehow (i.e., we need to pad $x$ with extra values).
+- Second, assuming $n$ is the length of execution trace, we can range-check at most $n$ values. Thus, if we wanted to range-check more than $n$ values, we'd need to introduce another column similar to $v$.
+
+We can get rid of both requirements by including the _multiplicity_ of the value $v$ into the calculation of the logarithmic derivative for LogUp, which will allow us to specify exactly how many times each value needs to be range-checked.
+
+### A better construction
+
+Let's add one more column $m$ to our table to keep track of how many times each value should be range-checked.
+
+![rc_8_bit_logup](../../../img/miden/vm/design/range/rc_8_bit_logup.png)
+
+The transition constraint for $b$ is now as follows:
+
+$$
+b' = b + \frac{m}{(\alpha - v)}
+$$
+
+This addresses the limitations we had as follows:
+
+1. We no longer need to pad the column we want to range-check with extra values because we can skip the values we don't care about by setting the multiplicity to $0$.
+2. We can range check as many unique values as there are rows in the trace, and there is essentially no limit to how many times each of these values can be range-checked. (The only restriction on the multiplicity value is that it must be less than the size of the set of lookup values. Therefore, for long traces where $n > 2^{16}$, $m < 2^{16}$ must hold, and for short traces $m < n$ must be true.)
+
+Additionally, the constraint degree has not increased versus the naive approach, and the only additional cost is a single trace column.
+
+## 16-bit range checks
+
+To support 16-bit range checks, let's try to extend the idea of the 8-bit table. Our 16-bit table would look like so (the only difference is that column $v$ now has to end with value $65535$):
+
+![rc_16_bit_logup](../../../img/miden/vm/design/range/rc_16_bit_logup.png)
+
+While this works, it is rather wasteful. In the worst case, we'd need to enumerate over 65K values, most of which we may not actually need. It would be nice if we could "skip over" the values that we don't want. One way to do this could be to add bridge rows between two values to be range checked and add constraints to enforce the consistency of the gap between these bridge rows.
+
+If we allow gaps between two consecutive rows to only be 0 or powers of 2, we could enforce a constraint:
+
+$$
+\Delta v \cdot (\Delta v - 1)  \cdot (\Delta v - 2)  \cdot (\Delta v - 4)  \cdot (\Delta v - 8)  \cdot (\Delta v - 16)  \cdot (\Delta v - 32)  \cdot (\Delta v - 64)  \cdot (\Delta v - 128) = 0
+$$
+
+This constraint has a degree 9. This construction allows the minimum trace length to be 1024.
+
+We could go even further and allow the gaps between two consecutive rows to only be 0 or powers of 3. In this case we would enforce the constraint:
+
+$$
+\Delta v \cdot (\Delta v - 1)  \cdot (\Delta v - 3)  \cdot (\Delta v - 9)  \cdot (\Delta v - 27)  \cdot (\Delta v - 81)  \cdot (\Delta v - 243)  \cdot (\Delta v - 729)  \cdot (\Delta v - 2187) = 0
+$$
+
+This allows us to reduce the minimum trace length to 64.
+
+To find out the number of bridge rows to be added in between two values to be range checked, we represent the gap between them as a linear combination of powers of 3, i.e.,
+
+$$
+(r' - r) = \sum_{i=0}^{7} x_i \cdot 3^i
+$$
+
+Then for each $x_i$ except the first, we add a bridge row at a gap of $3^i$.
+
+## Miden approach
+
+This construction is implemented in Miden with the following requirements, capabilities and constraints.
+
+### Requirements
+
+- 2 columns of the main trace: $m, v$, where $v$ contains the value being range-checked and $m$ is the number of times the value is checked (its multiplicity).
+- 1 [bus](./lookups/index.md#communication-buses-in-miden-vm) $b_{range}$ to ensure that the range checks performed in the range checker match those requested by other VM components (the [stack](./stack/u32-ops.md#range-checks) and the [memory chiplet](./chiplets/memory.md)).
+
+### Capabilities
+
+The construction gives us the following capabilities:
+
+- For long traces (when $n > 2^{16}$), we can do an essentially unlimited number of arbitrary 16-bit range-checks.
+- For short traces ($2^5 < n \le 2^{16}$), we can range-check slightly fewer than $n$ unique values, but there is essentially no practical limit to the total number of range checks.
+
+### Execution trace
+
+The range checker's execution trace looks as follows:
+
+![rc_with_bridge_rows.png](../../../img/miden/vm/design/range/rc_with_bridge_rows.png)
+
+The columns have the following meanings:
+
+- $m$ is the multiplicity column that indicates the number of times the value in that row should be range checked (included into the computation of the logarithmic derivative).
+- $v$ contains the values to be range checked.
+    - These values go from $0$ to $65535$. Values must either stay the same or increase by powers of 3 less than or equal to $3^7$.
+    - The final 2 rows of the 16-bit section of the trace must both equal $65535$. The extra value of $65535$ is required in order to [pad the trace](./lookups/index.md#length-of-auxiliary-columns-for-lookup-arguments) so the [$b_{range}$](#communication-bus) bus column can be computed correctly.
+
+### Execution trace constraints
+
+First, we need to constrain that the consecutive values in the range checker are either the same or differ by powers of 3 that are less than or equal to $3^7$.
+
+> $$
+\Delta v \cdot (\Delta v - 1)  \cdot (\Delta v - 3)  \cdot (\Delta v - 9)  \cdot (\Delta v - 27)  \cdot (\Delta v - 81) \\
+\cdot (\Delta v - 243)  \cdot (\Delta v - 729)  \cdot (\Delta v - 2187) = 0 \text{ | degree} = 9
+$$
+
+In addition to the transition constraints described above, we also need to enforce the following boundary constraints:
+
+- The value of $v$ in the first row is $0$.
+- The value of $v$ in the last row is $65535$.
+
+### Communication bus
+
+$b_{range}$ is the [bus](lookups/index.md#communication-buses-in-miden-vm) that connects components which require 16-bit range checks to the values in the range checker. The bus constraints are defined by the components that use it to communicate.
+
+Requests are sent to the range checker bus by the following components:
+
+- The Stack sends requests for 16-bit range checks during some [`u32` operations](stack/u32-ops.md#range-checks).
+- The [Memory chiplet](chiplets/memory.md) sends requests for 16-bit range checks against the values in the $d_0$ and $d_1$ trace columns to enforce internal consistency.
+
+Responses are provided by the range checker using the transition constraint for the LogUp construction described above.
+
+> $$
+b'_{range} = b_{range} + \frac{m}{(\alpha - v)} \text{ | degree} = 2
+$$
+
+To describe the complete transition constraint for the bus, we'll define the following variables:
+
+- $f_{stack}$: the boolean flag that indicates whether or not a stack operation requiring range checks is occurring. This flag has degree 3.
+- $f_{mem}$: the boolean flag that indicates whether or not a memory operation requiring range checks is occurring. This flag has degree 3.
+- $s_0, s_1, s_2, s_3$: the values for which range checks are requested from the stack when $f_{stack}$ is set.
+- $m_0, m_1$: the values for which range checks are requested from the memory chiplet when $f_{mem}$ is set.
+
+> $$
+b'_{range} = b_{range} + \frac{m}{(\alpha - v)} - \frac{f_{stack}}{(\alpha - s_0)} - \frac{f_{stack}}{(\alpha - s_1)} - \frac{f_{stack}}{(\alpha - s_2)} - \frac{f_{stack}}{(\alpha - s_3)} \\ - \frac{f_{mem}}{(\alpha - m_0)} - \frac{f_{mem}}{(\alpha - m_1)} \text{ | degree} = 9
+$$
+
+As previously mentioned, constraints cannot include divisions, so the actual constraint which is applied will be the equivalent expression in which all denominators have been multiplied through, which is degree 9.
+
+If $b_{range}$ is initialized to $1$ and the values sent to the bus by other VM components match those that are range-checked in the trace, then at the end of the trace we should end up with $b_{range} = 1$.
+
+Therefore, in addition to the transition constraint described above, we also need to enforce the following boundary constraints:
+
+- The value of $b_{range}$ in the first row is $1$.
+- The value of $b_{range}$ in the last row is $1$.
diff --git a/docs/miden/vm/design/stack/crypto-ops.md b/docs/miden/vm/design/stack/crypto-ops.md
new file mode 100644
index 000000000..fb8119586
--- /dev/null
+++ b/docs/miden/vm/design/stack/crypto-ops.md
@@ -0,0 +1,165 @@
+In this section we describe the AIR constraints for Miden VM cryptographic operations.
+
+Cryptographic operations in Miden VM are performed by the [Hash chiplet](../chiplets/hasher.md). Communication between the stack and the hash chiplet is accomplished via the chiplet bus $b_{chip}$. To make requests to and to read results from the chiplet bus we need to divide its current value by the value representing the request.
+
+Thus, to describe AIR constraints for the cryptographic operations, we need to define how to compute these input and output values within the stack. We do this in the following sections.
+
+## `HPERM`
+
+The `HPERM` operation applies the Rescue Prime Optimized permutation to the top $12$ elements of the stack. The stack is assumed to be arranged so that the $8$ elements of the rate are at the top of the stack. The capacity word follows, with the number of elements to be hashed at the deepest position in the stack. The diagram below illustrates this graphically.
+
+![hperm](../../../../img/miden/vm/design/stack/crypto-ops/HPERM.png)
+
+In the above, $r$ (located in the helper register $h_0$) is the row address from the hash chiplet set by the prover non-deterministically.
+
+For the `HPERM` operation, we define input and output values as follows:
+
+$$
+v_{input} = \alpha_0 + \alpha_1 \cdot op_{linhash} + \alpha_2 \cdot h_0 + \sum_{j=0}^{11} (\alpha_{j+4} \cdot s_{11-j})
+$$
+
+$$
+v_{output} = \alpha_0 + \alpha_1 \cdot op_{retstate} + \alpha_2 \cdot (h_0 + 7) + \sum_{j=0}^{11} (\alpha_{j+4} \cdot s_{11-j}')
+$$
+
+In the above, $op_{linhash}$ and $op_{retstate}$ are the unique [operation labels](../chiplets/index.md#operation-labels) for initiating a linear hash and reading the full state of the hasher respectively. Also note that the term for $\alpha_3$ is missing from the above expressions because for Rescue Prime Optimized permutation computation the index column is expected to be set to $0$.
+
+Using the above values, we can describe the constraint for the chiplet bus column as follows:
+
+>$$
+b_{chip}' \cdot v_{input} \cdot v_{output} = b_{chip} \text{ | degree} = 3
+$$
+
+The above constraint enforces that the specified input and output rows must be present in the trace of the hash chiplet, and that they must be exactly $7$ rows apart.
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $12$.
+
+## `MPVERIFY`
+
+The `MPVERIFY` operation verifies that a Merkle path from the specified node resolves to the specified root. This operation can be used to prove that the prover knows a path in the specified Merkle tree which starts with the specified node.
+
+Prior to the operation, the stack is expected to be arranged as follows (from the top):
+
+- Value of the node, 4 elements ($V$ in the below image)
+- Depth of the path, 1 element ($d$ in the below image)
+- Index of the node, 1 element ($i$ in the below image)
+- Root of the tree, 4 elements ($R$ in the below image)
+
+The Merkle path itself is expected to be provided by the prover non-deterministically (via the advice provider). If the prover is not able to provide the required path, the operation fails. Otherwise, the state of the stack does not change. The diagram below illustrates this graphically.
+
+![mpverify](../../../../img/miden/vm/design/stack/crypto-ops/MPVERIFY.png)
+
+In the above, $r$ (located in the helper register $h_0$) is the row address from the hash chiplet set by the prover non-deterministically.
+
+For the `MPVERIFY` operation, we define input and output values as follows:
+
+$$
+v_{input} = \alpha_0 + \alpha_1 \cdot op_{mpver} + \alpha_2 \cdot h_0 + \alpha_3 \cdot s_5 + \sum_{j=0}^3 \alpha_{j+8} \cdot s_{3 - j}
+$$
+
+$$
+v_{output} = \alpha_0 + \alpha_1 \cdot op_{rethash} + \alpha_2 \cdot (h_0 + 8 \cdot s_4 - 1) + \sum_{j=0}^3\alpha_{j + 8} \cdot s_{9 - j}
+$$
+
+In the above, $op_{mpver}$ and $op_{rethash}$ are the unique [operation labels](../chiplets/index.md#operation-labels) for initiating a Merkle path verification computation and reading the hash result respectively. The sum expression for inputs computes the value of the leaf node, while the sum expression for the output computes the value of the tree root.
+
+Using the above values, we can describe the constraint for the chiplet bus column as follows:
+
+>$$
+b_{chip}' \cdot v_{input} \cdot v_{output} = b_{chip} \text{ | degree} = 3
+$$
+
+The above constraint enforces that the specified input and output rows must be present in the trace of the hash chiplet, and that they must be exactly $8 \cdot d - 1$ rows apart, where $d$ is the depth of the node.
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $0$.
+
+## `MRUPDATE`
+
+The `MRUPDATE` operation computes a new root of a Merkle tree where a node at the specified position is updated to the specified value.
+
+The stack is expected to be arranged as follows (from the top):
+
+- old value of the node, 4 elements ($V$ in the below image)
+- depth of the node, 1 element ($d$ in the below image)
+- index of the node, 1 element ($i$ in the below image)
+- current root of the tree, 4 elements ($R$ in the below image)
+- new value of the node, 4 elements ($NV$ in the below image)
+
+The Merkle path for the node is expected to be provided by the prover non-deterministically (via merkle sets). At the end of the operation, the old node value is replaced with the new root value computed based on the provided path. Everything else on the stack remains the same. The diagram below illustrates this graphically.
+
+![mrupdate](../../../../img/miden/vm/design/stack/crypto-ops/MRUPDATE.png)
+
+In the above, $r$ (located in the helper register $h_0$) is the row address from the hash chiplet set by the prover non-deterministically.
+
+For the `MRUPDATE` operation, we define input and output values as follows:
+
+$$
+v_{inputold} = \alpha_0 + \alpha_1 \cdot op_{mruold} + \alpha_2 \cdot h_0 + \alpha_3 \cdot s_5 + \sum_{j=0}^3\alpha_{j + 8} \cdot s_{3 - j}
+$$
+
+$$
+v_{outputold} = \alpha_0 + \alpha_1 \cdot op_{rethash} + \alpha_2 \cdot (h_0 + 8 \cdot s_4 - 1) + \sum_{j=0}^3\alpha_{j + 8} \cdot s_{9 - j}
+$$
+
+$$
+v_{inputnew} = \alpha_0 + \alpha_1 \cdot op_{mrunew} + \alpha_2 \cdot (h_0 + 8 \cdot s_4) + \alpha_3 \cdot s_5 + \sum_{j=0}^3\alpha_{j + 8} \cdot s_{13 - j}
+$$
+
+$$
+v_{outputnew} = \alpha_0 + \alpha_1 \cdot op_{rethash} + \alpha_2 \cdot (h_0 + 2 \cdot 8 \cdot s_4 - 1) + \sum_{j=0}^3\alpha_{j + 8} \cdot s_{3 - j}'
+$$
+
+In the above, the first two expressions correspond to inputs and outputs for verifying the Merkle path between the old node value and the old tree root, while the last two expressions correspond to inputs and outputs for verifying the Merkle path between the new node value and the new tree root. The hash chiplet ensures that the same set of sibling nodes is used in both of these computations.
+
+The $op_{mruold}$, $op_{mrunew}$, and $op_{rethash}$ are the unique [operation labels](../chiplets/index.md#operation-labels) used by the above computations.
+
+> $$
+b_{chip}' \cdot v_{inputold} \cdot v_{outputold} \cdot v_{inputnew} \cdot v_{outputnew} = b_{chip} \text{ | degree} = 5
+$$
+
+The above constraint enforces that the specified input and output rows for both, the old and the new node/root combinations, must be present in the trace of the hash chiplet, and that they must be exactly $8 \cdot d - 1$ rows apart, where $d$ is the depth of the node. It also ensures that the computation for the old node/root combination is immediately followed by the computation for the new node/root combination.
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** for positions starting from $4$.
+
+## `FRIE2F4`
+
+The `FRIE2F4` operation performs FRI layer folding by a factor of 4 for FRI protocol executed in a degree 2 extension of the base field. It also performs several computations needed for checking correctness of the folding from the previous layer as well as simplifying folding of the next FRI layer.
+
+The stack for the operation is expected to be arranged as follows:
+
+- The first $8$ stack elements contain $4$ query points to be folded. Each point is represented by two field elements because points to be folded are in the extension field. We denote these points as $q_0 = (v_0, v_1)$, $q_1 = (v_2, v_3)$, $q_2 = (v_4, v_5)$, $q_3 = (v_6, v_7)$.
+- The next element $f\_pos$ is the query position in the folded domain. It can be computed as $pos \mod n$, where $pos$ is the position in the source domain, and $n$ is the size of the folded domain.
+- The next element $d\_seg$ is a value indicating domain segment from which the position in the original domain was folded. It can be computed as $\lfloor \frac{pos}{n} \rfloor$. Since the size of the source domain is always $4$ times bigger than the size of the folded domain, possible domain segment values can be $0$, $1$, $2$, or $3$.
+- The next element $poe$ is a power of the initial domain generator which aids in the computation of the domain point $x$.
+- The next two elements contain the result of the previous layer folding - a single element in the extension field denoted as $pe = (pe_0, pe_1)$.
+- The next two elements specify a random verifier challenge $\alpha$ for the current layer defined as $\alpha = (a_0, a_1)$.
+- The last element on the top of the stack ($cptr$) is expected to be a memory address of the layer currently being folded.
+
+The diagram below illustrates stack transition for `FRIE2F4` operation.
+
+![frie2f4](../../../../img/miden/vm/design/stack/crypto-ops/FRIE2F4.png)
+
+At the high-level, the operation does the following:
+
+- Computes the domain value $x$ based on values of $poe$ and $d\_seg$.
+- Using $x$ and $\alpha$, folds the query values $q_0, ..., q_3$ into a single value $r$.
+- Compares the previously folded value $pe$ to the appropriate value of $q_0, ..., q_3$ to verify that the folding of the previous layer was done correctly.
+- Computes the new value of $poe$ as $poe' = poe^4$ (this is done in two steps to keep the constraint degree low).
+- Increments the layer address pointer by $2$.
+- Shifts the stack by $1$ to the left. This moves an element from the stack overflow table into the last position on the stack top.
+
+To keep the degree of the constraints low, a number of intermediate values are used. Specifically, the operation relies on all $6$ helper registers, and also uses the first $10$ elements of the stack at the next state for degree reduction purposes. Thus, once the operation has been executed, the top $10$ elements of the stack can be considered to be "garbage".
+
+<!-- 
+TODO: add detailed constraint descriptions. See discussion [here](https://github.com/0xPolygonMiden/miden-vm/issues/567#issuecomment-1398088792).
+-->
+
+The effect on the rest of the stack is:
+
+* **Left shift** starting from position $16$.
diff --git a/docs/miden/vm/design/stack/field-ops.md b/docs/miden/vm/design/stack/field-ops.md
new file mode 100644
index 000000000..108bd6095
--- /dev/null
+++ b/docs/miden/vm/design/stack/field-ops.md
@@ -0,0 +1,267 @@
+In this section we describe the AIR constraints for Miden VM field operations (i.e., arithmetic operations over field elements).
+
+## `ADD`
+
+Assume $a$ and $b$ are the elements at the top of the stack. The `ADD` operation computes $c \leftarrow (a + b)$. The diagram below illustrates this graphically.
+
+![add](../../../../img/miden/vm/design/stack/field-operations/ADD.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - (s_0 + s_1) = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $2$.
+
+## `NEG`
+
+Assume $a$ is the element at the top of the stack. The `NEG` operation computes $b \leftarrow (-a)$. The diagram below illustrates this graphically.
+
+![neg](../../../../img/miden/vm/design/stack/field-operations/NEG.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' + s_0 = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **No change** starting from position $1$.
+
+## `MUL`
+
+Assume $a$ and $b$ are the elements at the top of the stack. The `MUL` operation computes $c \leftarrow (a \cdot b)$. The diagram below illustrates this graphically.
+
+![mul](../../../../img/miden/vm/design/stack/field-operations/MUL.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - s_0 \cdot s_1 = 0 \text{ | degree} = 2
+$$
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $2$.
+
+## `INV`
+
+Assume $a$ is the element at the top of the stack. The `INV` operation computes $b \leftarrow (a^{-1})$. The diagram below illustrates this graphically.
+
+![inv](../../../../img/miden/vm/design/stack/field-operations/INV.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+1 - s_0' \cdot s_0 = 0 \text{ | degree} = 2
+$$
+
+Note that the above constraint can be satisfied only if the value in $s_0 \neq 0$.
+
+The effect on the rest of the stack is:
+* **No change** starting from position $1$.
+
+## `INCR`
+
+Assume $a$ is the element at the top of the stack. The `INCR` operation computes $b \leftarrow (a+1)$. The diagram below illustrates this graphically.
+
+![incr](../../../../img/miden/vm/design/stack/field-operations/INCR.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - (s_0 + 1) = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **No change** starting from position $1$.
+
+## `NOT`
+
+Assume $a$ is a binary value at the top of the stack. The `NOT` operation computes $b \leftarrow (\lnot a)$. The diagram below illustrates this graphically.
+
+![not](../../../../img/miden/vm/design/stack/field-operations/NOT.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0^2 - s_0 = 0 \text{ | degree} = 2
+$$
+
+>$$
+s_0' - (1 - s_0) = 0 \text{ | degree} = 1
+$$
+
+The first constraint ensures that the value in $s_0$ is binary, and the second constraint ensures the correctness of the boolean `NOT` operation.
+
+The effect on the rest of the stack is:
+* **No change** starting from position $1$.
+
+## `AND`
+
+Assume $a$ and $b$ are binary values at the top of the stack. The `AND` operation computes $c \leftarrow (a \land b)$. The diagram below illustrates this graphically.
+
+![and](../../../../img/miden/vm/design/stack/field-operations/AND.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_i^2 - s_i = 0 \text{ for } i \in \{0, 1\} \text{ | degree} = 2
+$$
+
+>$$
+s_0' - s_0 \cdot s_1 = 0 \text{ | degree} = 2
+$$
+
+The first two constraints ensure that the values in $s_0$ and $s_1$ are binary, and the third constraint ensures the correctness of the boolean `AND` operation.
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $2$.
+
+## `OR`
+
+Assume $a$ and $b$ are binary values at the top of the stack. The `OR` operation computes $c \leftarrow (a \lor b)$. The diagram below illustrates this graphically.
+
+![or](../../../../img/miden/vm/design/stack/field-operations/OR.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_i^2 - s_i = 0 \text{ for } i \in \{0, 1\} \text{ | degree} = 2
+$$
+
+>$$
+s_{0}' - (s_{1} + s_{0} - s_{1} \cdot s_{0}) = 0 \text{ | degree} = 2
+$$
+
+The first two constraints ensure that the values in $s_0$ and $s_1$ are binary, and the third constraint ensures the correctness of the boolean `OR` operation.
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $2$.
+
+## `EQ`
+
+Assume $a$ and $b$ are the elements at the top of the stack. The `EQ` operation computes $c$ such that $c = 1$ if $a = b$, and $0$ otherwise. The diagram below illustrates this graphically.
+
+![eq](../../../../img/miden/vm/design/stack/field-operations/EQ.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' \cdot (s_0 - s_1) = 0 \text{ | degree} = 2
+$$
+
+>$$
+s_0' - (1 - (s_0 - s_1) \cdot h_0) = 0 \text{ | degree} = 2
+$$
+
+To satisfy the above constraints, the prover must populate the value of helper register $h_0$ as follows:
+
+* If $s_0 \neq s_1$, set $h_0 = \frac{1}{s_0 - s_1}$.
+* Otherwise, set $h_0$ to any value (e.g., $0$).
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $2$.
+
+## `EQZ`
+
+Assume $a$ is the element at the top of the stack. The `EQZ` operation computes $b$ such that $b = 1$ if $a = 0$, and $0$ otherwise. The diagram below illustrates this graphically.
+
+![eqz](../../../../img/miden/vm/design/stack/field-operations/EQZ.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' \cdot s_0 = 0 \text{ | degree} = 2
+$$
+
+>$$
+s_0' - (1 - s_0 \cdot h_0) = 0 \text{ | degree} = 2
+$$
+
+To satisfy the above constraints, the prover must populate the value of helper register $h_0$ as follows:
+* If $s_0 \neq 0$, set $h_0 = \frac{1}{s_0}$.
+* Otherwise, set $h_0$ to any value (e.g., $0$).
+
+The effect on the rest of the stack is:
+
+* **No change** starting from position $1$.
+
+## `EXPACC`
+
+The `EXPACC` operation pops the top $4$ elements from the stack, performs a single round of exponent aggregation, and pushes the resulting $4$ values onto the stack. The diagram below illustrates this graphically.
+
+![expacc](../../../../img/miden/vm/design/stack/field-operations/EXPACC.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+`bit` should be binary.
+
+>$$
+s_0'^{2} - s_0' = 0 \text{ | degree} = 2
+$$
+
+The `exp` in the next frame should be the square of the `exp` in the current frame.
+
+>$$
+s_1' - s_1^{2} = 0 \text{ | degree} = 2
+$$
+
+The value `val` in the helper register is computed correctly using the `bit` and `exp` in next and current frame respectively.
+
+>$$
+h_0 - ((s_1 - 1) * s_0' + 1) = 0 \text{ | degree} = 2
+$$
+
+The `acc` in the next frame is the product of `val` and `acc` in the current frame.
+
+>$$
+s_2' - s_2 * h_0 = 0 \text{ | degree} = 2
+$$
+
+`b` in the next frame is the right shift of `b` in the current frame.
+
+>$$
+s_3 - (s_3' * 2 + s_0')  = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+
+* **No change** starting from position $4$.
+
+## `EXT2MUL`
+
+The `EXT2MUL` operation pops the top $4$ values from the stack, performs multiplication between the two extension field elements, and pushes the resulting $4$ values onto the stack. The diagram below illustrates this graphically.
+
+![ext2mul](../../../../img/miden/vm/design/stack/field-operations/EXT2MUL.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+The first stack element should be unchanged in the next frame.
+
+>$$
+s_0' - s_0 = 0 \text{ | degree} = 1
+$$
+
+The second stack element should be unchanged in the next frame.
+
+>$$
+s_1' - s_1 = 0 \text{ | degree} = 1
+$$
+
+The third stack element should satisfy the following constraint.
+
+>$$
+s_2' - (s_0 + s_1) \cdot (s_2 + s_3) + s_0 \cdot s_2 = 0 \text{ | degree} = 2
+$$
+
+The fourth stack element should satisfy the following constraint.
+
+>$$
+s_3' - s_1 \cdot s_3 + 2 \cdot s_0 \cdot s_2 = 0 \text{ | degree} = 2
+$$
+
+The effect on the rest of the stack is:
+
+* **No change** starting from position $4$.
diff --git a/docs/miden/vm/design/stack/index.md b/docs/miden/vm/design/stack/index.md
new file mode 100644
index 000000000..e4c1715e0
--- /dev/null
+++ b/docs/miden/vm/design/stack/index.md
@@ -0,0 +1,217 @@
+Miden VM is a stack machine. The stack is a push-down stack of practically unlimited depth (in practical terms, the depth will never exceed $2^{32}$), but only the top $16$ items are directly accessible to the VM. Items on the stack are elements in a prime field with modulus $2^{64} - 2^{32} + 1$.
+
+To keep the constraint system for the stack manageable, we impose the following rules:
+
+1. All operations executed on the VM can shift the stack by at most one item. That is, the end result of an operation must be that the stack shrinks by one item, grows by one item, or the number of items on the stack stays the same.
+2. Stack depth must always be greater than or equal to $16$. At the start of program execution, the stack is initialized with exactly $16$ input values, all of which could be $0$'s.
+3. By the end of program execution, exactly $16$ items must remain on the stack (again, all of them could be $0$'s). These items comprise the output of the program.
+
+To ensure that managing stack depth does not impose significant burden, we adopt the following rule:
+
+* When the stack depth is $16$, removing additional items from the stack does not change its depth. To keep the depth at $16$, $0$'s are inserted into the deep end of the stack for each removed item.
+
+## Stack representation
+
+The VM allocates $19$ trace columns for the stack. The layout of the columns is illustrated below.
+
+![](../../../../img/miden/vm/design/stack/trace_layout.png)
+
+The meaning of the above columns is as follows:
+
+* $s_0 ... s_{15}$ are the columns representing the top $16$ slots of the stack.
+* Column $b_0$ contains the number of items on the stack (i.e., the stack depth). In the above picture, there are $16$ items on the stack, so $b_0 = 16$.
+* Column $b_1$ contains an address of a row in the "overflow table" in which we'll store the data that doesn't fit into the top $16$ slots. When $b_1 = 0$, it means that all stack data fits into the top $16$ slots of the stack.
+* Helper column $h_0$ is used to ensure that stack depth does not drop below $16$. Values in this column are set by the prover non-deterministically to $\frac{1}{b_0 - 16}$ when $b_0 \neq 16$, and to any other value otherwise.
+
+### Overflow table
+
+To keep track of the data which doesn't fit into the top $16$ stack slots, we'll use an overflow table. This will be a [virtual table](../lookups/multiset.md#virtual-tables). To represent this table, we'll use a single auxiliary column $p_1$.
+
+The table itself can be thought of as having 3 columns as illustrated below.
+
+![](../../../../img/miden/vm/design/stack/overflow_table_layout.png)
+
+The meaning of the columns is as follows:
+
+* Column $t_0$ contains row address. Every address in the table must be unique.
+* Column $t_1$ contains the value that overflowed the stack.
+* Column $t_2$ contains the address of the row containing the value that overflowed the stack right before the value in the current row. For example, in the picture above, first value $a$ overflowed the stack, then $b$ overflowed the stack, and then value $c$ overflowed the stack. Thus, row with value $b$ points back to the row with value $a$, and row with value $c$ points back to the row with value $b$.
+
+To reduce a table row to a single value, we'll compute a random linear combination of column values as follows:
+
+$$
+r_i = \alpha_0 + \alpha_1 \cdot t_{0, i} + \alpha_2 \cdot t_{1, i} + \alpha_3 \cdot t_{2, i}
+$$
+
+Then, when row $i$ is added to the table, we'll update the value in the $p_1$ column like so:
+
+$$
+p_1' = p_1 \cdot r_i
+$$
+
+Analogously, when row $i$ is removed from the table, we'll update the value in column $p_1$ like so:
+
+$$
+p_1' = \frac{p_1}{r_i}
+$$
+
+The initial value of $p_1$ is set to $1$. Thus, if by the time Miden VM finishes executing a program the table is empty (we added and then removed exactly the same set of rows), $p_1$ will also be equal to $1$.
+
+There are a couple of other rules we'll need to enforce:
+
+* We can delete a row only after the row has been inserted into the table.
+* We can't insert a row with the same address twice into the table (even if the row was inserted and then deleted).
+
+How these are enforced will be described a bit later.
+
+## Right shift
+
+If an operation adds data to the stack, we say that the operation caused a right shift. For example, `PUSH` and `DUP` operations cause a right shift. Graphically, this looks like so:
+
+![](../../../../img/miden/vm/design/stack/stack_right_shift.png)
+
+Here, we pushed value $v_{17}$ onto the stack. All other values on the stack are shifted by one slot to the right and the stack depth increases by $1$. There is not enough space at the top of the stack for all $17$ values, thus, $v_1$ needs to be moved to the overflow table.
+
+To do this, we need to rely on another column: $k_0$. This is a system column which keeps track of the current VM cycle. The value in this column is simply incremented by $1$ with every step.
+
+The row we want to add to the overflow table is defined by tuple $(clk, v_1, 0)$, and after it is added, the table would look like so:
+
+![](../../../../img/miden/vm/design/stack/stack_overflow_table_post_1_right_shift.png)
+
+The reason we use VM clock cycle as row address is that the clock cycle is guaranteed to be unique, and thus, the same row can not be added to the table twice.
+
+Let's push another item onto the stack:
+
+![](../../../../img/miden/vm/design/stack/stack_overflow_push_2nd_item.png)
+
+Again, as we push $v_{18}$ onto the stack, all items on the stack are shifted to the right, and now $v_2$ needs to be moved to the overflow table. The tuple we want to insert into the table now is $(clk+1, v_2, clk)$. After the operation, the overflow table will look like so:
+
+![](../../../../img/miden/vm/design/stack/stack_overflow_table_post_2_right_shift.png)
+
+Notice that $t_2$ for row which contains value $v_2$ points back to the row with address $clk$.
+
+Overall, during a right shift we do the following:
+
+* Increment stack depth by $1$.
+* Shift stack columns $s_0, ..., s_{14}$ right by $1$ slot.
+* Add a row to the overflow table described by tuple $(k_0, s_{15}, b_1)$.
+* Set the next value of $b_1$ to the current value of $k_0$.
+
+Also, as mentioned previously, the prover sets values in $h_0$ non-deterministically to $\frac{1}{b_0 - 16}$.
+
+## Left shift
+If an operation removes an item from the stack, we say that the operation caused a left shift. For example, a `DROP` operation causes a left shift. Assuming the stack is in the state we left it at the end of the previous section, graphically, this looks like so:
+
+![](../../../../img/miden/vm/design/stack/stack_1st_left_shift.png)
+
+Overall, during the left shift we do the following:
+
+* When stack depth is greater than $16$:
+      * Decrement stack depth by $1$.
+      * Shift stack columns $s_1, ..., s_{15}$ left by $1$ slot.
+      * Remove a row from the overflow table with $t_0$ equal to the current value of $b_1$.
+      * Set the next value of $s_{15}$ to the value in $t_1$ of the removed overflow table row.
+      * Set the next value of $b_1$ to the value in $t_2$ of the removed overflow table row.
+* When the stack depth is equal to $16$:
+      * Keep the stack depth the same.
+      * Shift stack columns $s_1, ..., s_{15}$ left by $1$ slot.
+      * Set the value of $s_{15}$ to $0$.
+      * Set the value of $h_0$ to $0$ (or any other value).
+
+If the stack depth becomes (or remains) $16$, the prover can set $h_0$ to any value (e.g., $0$). But if the depth is greater than $16$ the prover sets $h_0$ to $\frac{1}{b_0 - 16}$.
+
+## AIR constraints
+
+To simplify constraint descriptions, we'll assume that the VM exposes two binary flag values described below.
+
+| Flag      | Degree | Description                                                                                      |
+| --------- | ------ | ------------------------------------------------------------------------------------------------ |
+| $f_{shr}$ | 6      | When this flag is set to $1$, the instruction executing on the VM is performing a "right shift". |
+| $f_{shl}$ | 5      | When this flag is set to $1$, the instruction executing on the VM is performing a "left shift".  |
+
+These flags are mutually exclusive. That is, if $f_{shl}=1$, then $f_{shr}=0$ and vice versa. However, both flags can be set to $0$ simultaneously. This happens when the executed instruction does not shift the stack. How these flags are computed is described [here](./op-constraints.md).
+
+### Stack overflow flag
+
+Additionally, we'll define a flag to indicate whether the overflow table contains values. This flag will be set to $0$ when the overflow table is empty, and to $1$ otherwise (i.e., when stack depth $>16$). This flag can be computed as follows:
+
+$$
+f_{ov} = (b_0 - 16) \cdot h_0 \text{ | degree} = 2
+$$
+
+To ensure that this flag is set correctly, we need to impose the following constraint:
+
+>$$
+(1 - f_{ov}) \cdot (b_0 - 16) = 0 \text{ | degree} = 3
+$$
+
+The above constraint can be satisfied only when either of the following holds:
+
+* $b_0 = 16$, in which case $f_{ov}$ evaluates to $0$, regardless of the value of $h_0$.
+* $f_{ov} = 1$, in which case $b_0$ cannot be equal to $16$ (and $h_0$ must be set to $\frac{1}{b_0 - 16}$).
+
+### Stack depth constraints
+
+To make sure stack depth column $b_0$ is updated correctly, we need to impose the following constraints:
+
+| Condition                   | Constraint       | Description                                                                                                          |
+| --------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------- |
+| $f_{shr}=1$                 | $b'_0 = b_0 + 1$ | When the stack is shifted to the right, stack depth should be incremented by $1$.                                    |
+| $f_{shl}=1$ <br> $f_{ov}=1$ | $b'_0 = b_0 - 1$ | When the stack is shifted to the left and the overflow table is not empty, stack depth should be decremented by $1$. |
+| otherwise                   | $b'_0 = b_0$     | In all other cases, stack depth should not change.                                                                   |
+
+We can combine the above constraints into a single expression as follows:
+
+>$$
+b'_0 - b_0 + f_{shl} \cdot f_{ov} - f_{shr} = 0 \text{ | degree} = 7
+$$
+
+### Overflow table constraints
+
+When the stack is shifted to the right, a tuple $(k_0, s_{15}, b_1)$ should be added to the overflow table. We will denote value of the row to be added to the table as follows:
+
+$$
+v = \alpha_0 + \alpha_1 \cdot k_0 + \alpha_2 \cdot s_{15} + \alpha_3 \cdot b_1
+$$
+
+When the stack is shifted to the left, a tuple $(b_1, s'_{15}, b'_1)$ should be removed from the overflow table. We will denote value of the row to be removed from the table as follows.
+
+$$
+u = \alpha_0 + \alpha_1 \cdot b_1 + \alpha_2 \cdot s'_{15} + \alpha_3 \cdot b'_1
+$$
+
+Using the above variables, we can ensure that right and left shifts update the overflow table correctly by enforcing the following constraint:
+
+>$$
+p_1' \cdot (u \cdot f_{shl} \cdot f_{ov} + 1 - f_{shl} \cdot f_{ov}) = p_1 \cdot (v \cdot f_{shr} + 1 - f_{shr}) \text{ | degree} = 9
+$$
+
+The above constraint reduces to the following under various flag conditions:
+
+| Condition                                          | Applied constraint   |
+| -------------------------------------------------- | -------------------- |
+| $f_{shl}=1$, $f_{shr}=0$, $f_{ov}=0$               | $p_1' = p_1$         |
+| $f_{shl}=1$, $f_{shr}=0$, $f_{ov}=1$               | $p_1' \cdot u = p_1$ |
+| $f_{shl}=0$, $f_{shr}=1$, $f_{ov}=1 \text{ or } 0$ | $p_1' = p_1 \cdot v$ |
+| $f_{shl}=0$, $f_{shr}=0$, $f_{ov}=1 \text{ or } 0$ | $p_1' = p_1$         |
+
+Notice that in the case of the left shift, the constraint forces the prover to set the next values of $s_{15}$ and $b_1$ to values $t_1$ and $t_2$ of the row removed from the overflow table.
+
+In case of a right shift, we also need to make sure that the next value of $b_1$ is set to the current value of $k_0$. This can be done with the following constraint:
+
+>$$
+f_{shr} \cdot (b'_1 - k_0) = 0 \text{ | degree} = 7
+$$
+
+In case of a left shift, when the overflow table is empty, we need to make sure that a $0$ is "shifted in" from the right (i.e., $s_{15}$ is set to $0$). This can be done with the following constraint:
+
+>$$
+f_{shl} \cdot (1 - f_{ov}) \cdot s_{15}' = 0 \text{ | degree} = 8
+$$
+
+### Boundary constraints
+
+In addition to the constraints described above, we also need to enforce the following boundary constraints:
+* $b_0 = 16$ at the first and at the last row of execution trace.
+* $b_1 = 0$ at the first and at the last row of execution trace.
+* $p_1 = 1$ at the first and at the last row of execution trace.
diff --git a/docs/miden/vm/design/stack/io-ops.md b/docs/miden/vm/design/stack/io-ops.md
new file mode 100644
index 000000000..d4ec6a572
--- /dev/null
+++ b/docs/miden/vm/design/stack/io-ops.md
@@ -0,0 +1,235 @@
+In this section we describe the AIR constraints for Miden VM input / output operations. These operations move values between the stack and other components of the VM such as program code (i.e., decoder), memory, and advice provider.
+
+## `PUSH`
+
+The `PUSH` operation pushes the provided immediate value onto the stack (i.e., sets the value of $s_0$ register). Currently, it is the only operation in Miden VM which carries an immediate value. The semantics of this operation are explained in the [decoder section](../decoder/index.md#handling-immediate-values).
+
+The effect of this operation on the rest of the stack is:
+
+* **Right shift** starting from position $0$.
+
+## `SDEPTH`
+
+Assume $a$ is the current depth of the stack stored in the stack bookkeeping register $b_0$ (as described [here](./index.md#stack-representation)). The `SDEPTH` operation pushes $a$ onto the stack. The diagram below illustrates this graphically.
+
+![sdepth](../../../../img/miden/vm/design/stack/io-ops/SDEPTH.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - b_0 = 0 \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **Right shift** starting from position $0$.
+
+## `ADVPOP`
+
+Assume $a$ is an element at the top of the advice stack. The `ADVPOP` operation removes $a$ from the advice stack and pushes it onto the operand stack. The diagram below illustrates this graphically.
+
+![advpop](../../../../img/miden/vm/design/stack/io-ops/ADVPOP.png)
+
+The `ADVPOP` operation does not impose any constraints against the first element of the operand stack.
+
+The effect of this operation on the rest of the operand stack is:
+
+* **Right shift** starting from position $0$.
+
+## `ADVPOPW`
+
+Assume $a$, $b$, $c$, and $d$, are the elements at the top of the advice stack (with $a$ being on top). The `ADVPOPW` operation removes these elements from the advice stack and puts them onto the operand stack by overwriting the top $4$ stack elements. The diagram below illustrates this graphically.
+
+![advpopw](../../../../img/miden/vm/design/stack/io-ops/ADVPOPW.png)
+
+The `ADVPOPW` operation does not impose any constraints against the top $4$ elements of the operand stack.
+
+The effect of this operation on the rest of the operand stack is:
+
+* **No change** starting from position $4$.
+
+## Memory access operations
+
+Miden VM exposes several operations for reading from and writing to random access memory. Memory in Miden VM is managed by the [Memory chiplet](../chiplets/memory.md).
+
+Communication between the stack and the memory chiplet is accomplished via the chiplet bus $b_{chip}$. To make requests to the chiplet bus we need to divide its current value by the value representing memory access request. The structure of memory access request value is described [here](../chiplets/memory.md#memory-row-value).
+
+To enforce the correctness of memory access, we can use the following constraint:
+
+>$$
+b_{chip}' \cdot u_{mem} = b_{chip} \text{ | degree} = 2
+$$
+
+In the above, $u_{mem}$ is the value of memory access request. Thus, to describe AIR constraint for memory operations, it is sufficient to describe how $u_{mem}$ is computed. We do this in the following sections.
+
+### `MLOADW`
+
+Assume that the word with elements $v_0, v_1, v_2, v_3$ is located in memory at address $a$. The `MLOADW` operation pops an element off the stack, interprets it as a memory address, and replaces the remaining 4 elements at the top of the stack with values located at the specified address. The diagram below illustrates this graphically.
+
+![mloadw](../../../../img/miden/vm/design/stack/io-ops/MLOADW.png)
+
+To simplify description of the memory access request value, we first define a variable for the value that represents the state of memory after the operation:
+
+$$
+v = \sum_{i=0}^3\alpha_{i+5} \cdot s_{3-i}'
+$$
+
+Using the above variable, we define the value representing the memory access request as follows:
+
+$$
+u_{mem} = \alpha_0 + \alpha_1 \cdot op_{mem\_read} + \alpha_2 \cdot ctx + \alpha_3 \cdot s_0 + \alpha_4 \cdot clk + v
+$$
+
+In the above:
+- $op_{mem\_read}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the memory read operation.
+- $ctx$ is the identifier of the current memory context.
+- $s_0$ is the memory address from which the values are to be loaded onto the stack.
+- $clk$ is the current clock cycle of the VM.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $5$.
+
+### `MLOAD`
+
+Assume that the word with elements $v_0, v_1, v_2, v_3$ is located in memory at address $a$. The `MLOAD` operation pops an element off the stack, interprets it as a memory address, and pushes the first element of the word located at the specified address to the stack. The diagram below illustrates this graphically.
+
+![mload](../../../../img/miden/vm/design/stack/io-ops/MLOAD.png)
+
+To simplify description of the memory access request value, we first define a variable for the value that represents the state of memory after the operation:
+
+$$
+v = \alpha_5 \cdot s_0' + \sum_{i=1}^3\alpha_{i+5} \cdot h_{3-i}'
+$$
+
+!!! note
+    The values in registers $h_0, h_1, h_2$ are set by the prover non-deterministically.
+
+Using the above variable, we define the value representing the memory access request as follows:
+
+$$
+u_{mem} = \alpha_0 + \alpha_1 \cdot op_{mem\_read} + \alpha_2 \cdot ctx + \alpha_3 \cdot s_0 + \alpha_4 \cdot clk + v
+$$
+
+In the above:
+
+- $op_{mem\_read}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the memory read operation.
+- $ctx$ is the identifier of the current memory context.
+- $s_0$ is the memory address from which the value is to be loaded onto the stack.
+- $clk$ is the current clock cycle of the VM.
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $1$.
+
+### `MSTOREW`
+
+The `MSTOREW` operation pops an element off the stack, interprets it as a memory address, and writes the remaining $4$ elements at the top of the stack into memory at the specified address. The stored elements are not removed from the stack. The diagram below illustrates this graphically.
+
+![mstorew](../../../../img/miden/vm/design/stack/io-ops/MSTOREW.png)
+
+After the operation the contents of memory at address $a$ would be set to $v_0, v_1, v_2, v_3$.
+
+To simplify description of the memory access request value, we first define a variable for the value that represents the state of memory after the operation:
+
+$$
+v = \sum_{i=0}^3\alpha_{i+5} \cdot s_{3-i}'
+$$
+
+Using the above variable, we define the value representing the memory access request as follows:
+
+$$
+u_{mem} = \alpha_0 + \alpha_1 \cdot op_{mem\_write} + \alpha_2 \cdot ctx + \alpha_3 \cdot s_0 + \alpha_4 \cdot clk + v
+$$
+
+In the above:
+
+- $op_{mem\_write}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the memory write operation.
+- $ctx$ is the identifier of the current memory context.
+- $s_0$ is the memory address into which the values from the stack are to be saved.
+- $clk$ is the current clock cycle of the VM.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $1$.
+
+### `MSTORE`
+
+The `MSTORE` operation pops an element off the stack, interprets it as a memory address, and writes the remaining element at the top of the stack into the first element of the word located at the specified memory address. The remaining $3$ elements of the word are not affected. The diagram below illustrates this graphically.
+
+![mstore](../../../../img/miden/vm/design/stack/io-ops/MSTORE.png)
+
+After the operation the contents of memory at address $a$ would be set to $b, v_1, v_2, v_3$.
+
+To simplify description of the memory access request value, we first define a variable for the value that represents the state of memory after the operation:
+
+$$
+v = \alpha_5 \cdot s_0' + \sum_{i=1}^3\alpha_{i+5} \cdot h_{3-i}'
+$$
+
+!!! note
+    The values in registers $h_0, h_1, h_2$ are set by the prover non-deterministically.
+
+Using the above variable, we define the value representing the memory access request as follows:
+
+$$
+u_{mem} = \alpha_0 + \alpha_1 \cdot op_{mem\_write} + \alpha_2 \cdot ctx + \alpha_3 \cdot s_0 + \alpha_4 \cdot clk + v
+$$
+
+In the above:
+
+- $op_{mem\_write}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the memory write operation.
+- $ctx$ is the identifier of the current memory context.
+- $s_0$ is the memory address into which the value from the stack is to be saved.
+- $clk$ is the current clock cycle of the VM.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $1$.
+
+### `MSTREAM`
+
+The `MSTREAM` operation loads two words from memory, and replaces the top 8 elements of the stack with them, element-wise, in stack order. The memory address from which the words are loaded is stored in the 13th stack element (position 12). The diagram below illustrates this graphically.
+
+![mstream](../../../../img/miden/vm/design/stack/io-ops/MSTREAM.png)
+
+After the operation, the memory address is incremented by 2.
+
+$$
+s_{12}' = s_{12} + 2
+$$
+
+To simplify description of the memory access request value, we first define variables for the values that represent the state of memory after the operation:
+
+$$
+v_1 = \sum_{i=0}^3\alpha_{i+5} \cdot s_{7-i}'
+$$
+
+$$
+v_2 = \sum_{i=0}^3\alpha_{i+5} \cdot s_{3-i}'
+$$
+
+Using the above variables, we define the values representing the memory access request as follows:
+
+$$
+u_{mem, 1} = \alpha_0 + \alpha_1 \cdot op_{mem\_read} + \alpha_2 \cdot ctx + \alpha_3 \cdot s_{12} + \alpha_4 \cdot clk + v_1
+$$
+
+$$
+u_{mem, 2} = \alpha_0 + \alpha_1 \cdot op_{mem\_read} + \alpha_2 \cdot ctx + \alpha_3 \cdot (s_{12} + 1) + \alpha_4 \cdot clk + v_2
+$$
+
+$$
+u_{mem} = u_{mem, 1} \cdot u_{mem, 2}
+$$
+
+In the above:
+
+- $op_{mem\_read}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the memory read operation.
+- $ctx$ is the identifier of the current memory context.
+- $s_{12}$ and $s_{12} + 1$ are the memory addresses from which the values are to be loaded onto the stack.
+- $clk$ is the current clock cycle of the VM.
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $8$ except position $12$.
\ No newline at end of file
diff --git a/docs/miden/vm/design/stack/op-constraints.md b/docs/miden/vm/design/stack/op-constraints.md
new file mode 100644
index 000000000..87766b3c4
--- /dev/null
+++ b/docs/miden/vm/design/stack/op-constraints.md
@@ -0,0 +1,300 @@
+In addition to the constraints described in the previous section, we need to impose constraints to check that each VM operation is executed correctly.
+
+For this purpose the VM exposes a set of operation-specific flags. These flags are set to $1$ when a given operation is executed, and to $0$ otherwise. The naming convention for these flags is $f_{opname}$. For example, $f_{dup}$ would be set to $1$ when `DUP` operation is executed, and to $0$ otherwise. Operation flags are discussed in detail in the section [below](#operation-flags).
+
+To describe how operation-specific constraints work, let's use an example with the `DUP` operation. This operation pushes a copy of the top stack item onto the stack. The constraints we need to impose for this operation are as follows:
+
+$$
+f_{dup} \cdot (s'_0 - s_0) = 0 \\
+f_{dup} \cdot (s'_{i+1} - s_i) = 0 \ \text{ for } i \in [0, 15)
+$$
+
+The first constraint enforces that the top stack item in the next row is the same as the top stack item in the current row. The second constraint enforces that all stack items (starting from item $0$) are shifted to the right by $1$. We also need to impose all the constraints discussed in the previous section, but we omit them here.
+
+Let's write similar constraints for `DUP1` operation, which pushes a copy of the second stack item onto the stack:
+
+$$
+f_{dup1} \cdot (s'_0 - s_1) = 0 \\
+f_{dup1} \cdot (s'_{i+1} - s_i) = 0 \ \text{ for } i \in [0, 15)
+$$
+
+It is easy to notice that while the first constraint changed, the second constraint remained the same - i.e., we are still just shifting the stack to the right.
+
+In fact, for most operations it makes sense to make a distinction between constraints unique to the operation vs. more general constraints which enforce correct behavior for the stack items not affected by the operation. In the subsequent sections we describe in detail only the former constraints, and provide high-level descriptions of the more general constraints. Specifically, we indicate how the operation affects the rest of the stack (e.g., shifts right starting from position $0$).
+
+## Operation flags
+
+As mentioned above, operation flags are used as selectors to enforce operation-specific constraints. That is, they turn on relevant constraints for a given operation. In total, the VM provides $88$ unique operations, and thus, there are $88$ operation flags (not all of them currently used).
+
+Operation flags are mutually exclusive. That is, if one flag is set to $1$, all other flags are set to $0$. Also, one of the flags is always guaranteed to be set to $1$.
+
+To compute values of operation flags we use _op bits_ registers located in the [decoder](../decoder/index.md#decoder-trace). These registers contain binary representations of operation codes (opcodes). Each opcode consists of $7$ bits, and thus, there are $7$ _op bits_ registers. We denote these registers as $b_0, ..., b_6$. The values are computed by multiplying the op bit registers in various combinations. Notice that the binary encoding below is shown in big-endian order, so the flag bits correspond to the reverse order of the _op bits_ registers, from $b_6$ to $b_0$.
+
+For example, the value of the flag for `NOOP`, which is encoded as `0000000`, is computed as follows:
+
+$$
+f_{noop} = (1 - b_6) \cdot (1 - b_5) \cdot (1 - b_4) \cdot (1 - b_3) \cdot (1 - b_2) \cdot (1 - b_1) \cdot (1 - b_0)
+$$
+
+Similarly, the value of the flag for the `DROP` operation, which is encoded as `0101001`, is computed as follows:
+
+$$
+f_{drop} = (1 - b_6) \cdot b_5 \cdot (1 - b_4) \cdot b_3 \cdot (1 - b_2) \cdot (1 - b_1) \cdot b_0
+$$
+
+As can be seen from above, the degree for both of these flags is $7$. Since degree of constraints in Miden VM can go up to $9$, this means that operation-specific constraints cannot exceed degree $2$. However, there are some operations which require constraints of higher degree (e.g., $3$ or even $5$). To support such constraints, we adopt the following scheme.
+
+We organize the operations into $4$ groups as shown below and also introduce two extra registers $e_0$ and $e_1$ for degree reduction:
+
+| $b_6$ | $b_5$ | $b_4$ | $b_3$ | $b_2$  | $b_1$ | $b_0$ | $e_0$ | $e_1$  |# of ops | degree  |
+| :---: | :---: | :---: | :---: | :----: | :---: | :---: | :---: | :----: | :-----: | :-----: |
+| 0     |  x    | x     | x     | x      | x     | x     | 0     | 0      | 64      | 7       |
+| 1     |  0    | 0     | x     | x      | x     | -     | 0     | 0      | 8       | 6       |
+| 1     |  0    | 1     | x     | x      | x     | x     | 1     | 0      | 16      | 5       |
+| 1     |  1    | x     | x     | x      | -     | -     | 0     | 1      | 8       | 4       |
+
+In the above:
+
+* Operation flags for operations in the first group (with prefix `0`), are computed using all $7$ op bits, and thus their degree is $7$.
+* Operation flags for operations in the second group (with prefix `100`), are computed using only the first $6$ op bits, and thus their degree is $6$.
+* Operation flags for operations in the third group (with prefix `101`), are computed using all $7$ op bits. We use the extra register $e_0$ (which is set to $b_6 \cdot (1-b_5) \cdot b_4$) to reduce the degree by $2$. Thus, the degree of op flags in this group is $5$.
+* Operation flags for operations in the fourth group (with prefix `11`), are computed using only the first $5$ op bits. We use the extra register $e_1$ (which is set to $b_6 \cdot b_5$) to reduce the degree by $1$. Thus, the degree of op flags in this group is $4$.
+
+How operations are distributed between these $4$ groups is described in the sections below.
+
+### No stack shift operations
+
+This group contains $32$ operations which do not shift the stack (this is almost all such operations). Since the op flag degree for these operations is $7$, constraints for these operations cannot exceed degree $2$.
+
+| Operation    | Opcode value | Binary encoding | Operation group               | Flag degree |
+| ------------ | :----------: | :-------------: | :---------------------------: | :---------: |
+| `NOOP`       | $0$          | `000_0000`      | [System ops](system-ops.md) | $7$         |
+| `EQZ`        | $1$          | `000_0001`      | [Field ops](field-ops.md)   | $7$         |
+| `NEG`        | $2$          | `000_0010`      | [Field ops](field-ops.md)   | $7$         |
+| `INV`        | $3$          | `000_0011`      | [Field ops](field-ops.md)   | $7$         |
+| `INCR`       | $4$          | `000_0100`      | [Field ops](field-ops.md)   | $7$         |
+| `NOT`        | $5$          | `000_0101`      | [Field ops](field-ops.md)   | $7$         |
+| `FMPADD`     | $6$          | `000_0110`      | [System ops](system-ops.md) | $7$         |
+| `MLOAD`      | $7$          | `000_0111`      | [I/O ops](io-ops.md)        | $7$         |
+| `SWAP`       | $8$          | `000_1000`      | [Stack ops](stack-ops.md)   | $7$         |
+| `CALLER`     | $9$          | `000_1001`      | [System ops](system-ops.md) | $7$         |
+| `MOVUP2`     | $10$         | `000_1010`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN2`     | $11$         | `000_1011`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVUP3`     | $12$         | `000_1100`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN3`     | $13$         | `000_1101`      | [Stack ops](stack-ops.md)   | $7$         |
+| `ADVPOPW`    | $14$         | `000_1110`      | [I/O ops](io-ops.md)        | $7$         |
+| `EXPACC`     | $15$         | `000_1111`      | [Field ops](field-ops.md)   | $7$         |
+| `MOVUP4`     | $16$         | `001_0000`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN4`     | $17$         | `001_0001`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVUP5`     | $18$         | `001_0010`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN5`     | $19$         | `001_0011`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVUP6`     | $20$         | `001_0100`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN6`     | $21$         | `001_0101`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVUP7`     | $22$         | `001_0110`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN7`     | $23$         | `001_0111`      | [Stack ops](stack-ops.md)   | $7$         |
+| `SWAPW`      | $24$         | `001_1000`      | [Stack ops](stack-ops.md)   | $7$         |
+| `EXT2MUL`    | $25$         | `001_1001`      | [Field ops](field-ops.md)   | $7$         |
+| `MOVUP8`     | $26$         | `001_1010`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MOVDN8`     | $27$         | `001_1011`      | [Stack ops](stack-ops.md)   | $7$         |
+| `SWAPW2`     | $28$         | `001_1100`      | [Stack ops](stack-ops.md)   | $7$         |
+| `SWAPW3`     | $29$         | `001_1101`      | [Stack ops](stack-ops.md)   | $7$         |
+| `SWAPDW`     | $30$         | `001_1110`      | [Stack ops](stack-ops.md)   | $7$         |
+| `<unused>`   | $31$         | `001_1111`      |                               | $7$         |
+
+### Left stack shift operations
+
+This group contains $16$ operations which shift the stack to the left (i.e., remove an item from the stack). Most left-shift operations are contained in this group. Since the op flag degree for these operations is $7$, constraints for these operations cannot exceed degree $2$.
+
+| Operation    | Opcode value | Binary encoding | Operation group               | Flag degree |
+| ------------ | :----------: | :-------------: | :---------------------------: | :---------: |
+| `ASSERT`     | $32$         | `010_0000`      | [System ops](system-ops.md) | $7$         |
+| `EQ`         | $33$         | `010_0001`      | [Field ops](field-ops.md)   | $7$         |
+| `ADD`        | $34$         | `010_0010`      | [Field ops](field-ops.md)   | $7$         |
+| `MUL`        | $35$         | `010_0011`      | [Field ops](field-ops.md)   | $7$         |
+| `AND`        | $36$         | `010_0100`      | [Field ops](field-ops.md)   | $7$         |
+| `OR`         | $37$         | `010_0101`      | [Field ops](field-ops.md)   | $7$         |
+| `U32AND`     | $38$         | `010_0110`      | [u32 ops](u32-ops.md)       | $7$         |
+| `U32XOR`     | $39$         | `010_0111`      | [u32 ops](u32-ops.md)       | $7$         |
+| `FRIE2F4`    | $40$         | `010_1000`      | [Crypto ops](crypto-ops.md) | $7$         |
+| `DROP`       | $41$         | `010_1001`      | [Stack ops](stack-ops.md)   | $7$         |
+| `CSWAP`      | $42$         | `010_1010`      | [Stack ops](stack-ops.md)   | $7$         |
+| `CSWAPW`     | $43$         | `010_1011`      | [Stack ops](stack-ops.md)   | $7$         |
+| `MLOADW`     | $44$         | `010_1100`      | [I/O ops](io-ops.md)        | $7$         |
+| `MSTORE`     | $45$         | `010_1101`      | [I/O ops](io-ops.md)        | $7$         |
+| `MSTOREW`    | $46$         | `010_1110`      | [I/O ops](io-ops.md)        | $7$         |
+| `FMPUPDATE`  | $47$         | `010_1111`      | [System ops](system-ops.md) | $7$         |
+
+### Right stack shift operations
+
+This group contains $16$ operations which shift the stack to the right (i.e., push a new item onto the stack). Most right-shift operations are contained in this group. Since the op flag degree for these operations is $7$, constraints for these operations cannot exceed degree $2$.
+
+| Operation    | Opcode value | Binary encoding | Operation group               | Flag degree |
+| ------------ | :----------: | :-------------: | :---------------------------: | :---------: |
+| `PAD`        | $48$         | `011_0000`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP`        | $49$         | `011_0001`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP1`       | $50$         | `011_0010`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP2`       | $51$         | `011_0011`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP3`       | $52$         | `011_0100`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP4`       | $53$         | `011_0101`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP5`       | $54$         | `011_0110`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP6`       | $55$         | `011_0111`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP7`       | $56$         | `011_1000`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP9`       | $57$         | `011_1001`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP11`      | $58$         | `011_1010`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP13`      | $59$         | `011_1011`      | [Stack ops](stack-ops.md)   | $7$         |
+| `DUP15`      | $60$         | `011_1100`      | [Stack ops](stack-ops.md)   | $7$         |
+| `ADVPOP`     | $61$         | `011_1101`      | [I/O ops](io-ops.md)        | $7$         |
+| `SDEPTH`     | $62$         | `011_1110`      | [I/O ops](io-ops.md)        | $7$         |
+| `CLK`        | $63$         | `011_1111`      | [System ops](system-ops.md) | $7$         |
+
+### u32 operations
+
+This group contains $8$ u32 operations. These operations are grouped together because all of them require range checks. The constraints for range checks are of degree $5$, however, since all these operations require them, we can define a flag with common prefix `100` to serve as a selector for the range check constraints. The value of this flag is computed as follows:
+
+$$
+f_{u32rc} = b_6 \cdot (1 - b_5) \cdot (1 - b_4)
+$$
+
+The degree of this flag is $3$, which is acceptable for a selector for degree $5$ constraints.
+
+| Operation    | Opcode value | Binary encoding | Operation group               | Flag degree |
+| ------------ | :----------: | :-------------: | :---------------------------: | :---------: |
+| `U32ADD`     | $64$         | `100_0000`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32SUB`     | $66$         | `100_0010`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32MUL`     | $68$         | `100_0100`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32DIV`     | $70$         | `100_0110`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32SPLIT`   | $72$         | `100_1000`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32ASSERT2` | $74$         | `100_1010`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32ADD3`    | $76$         | `100_1100`      | [u32 ops](u32-ops.md)       | $6$         |
+| `U32MADD`    | $78$         | `100_1110`      | [u32 ops](u32-ops.md)       | $6$         |
+
+As mentioned previously, the last bit of the opcode is not used in computation of the flag for these operations. We force this bit to always be set to $0$ with the following constraint:
+
+>$$
+b_6 \cdot (1 - b_5) \cdot (1 - b_4) \cdot b_0 = 0 \text{ | degree} = 4
+$$
+
+Putting these operations into a group with flag degree $6$ is important for two other reasons:
+* Constraints for the `U32SPLIT` operation have degree $3$. Thus, the degree of the op flag for this operation cannot exceed $6$.
+* Operations `U32ADD3` and `U32MADD` shift the stack to the left. Thus, having these two operations in this group and putting them under the common prefix `10011` allows us to create a common flag for these operations of degree $5$ (recall that the left-shift flag cannot exceed degree $5$).
+
+### High-degree operations
+This group contains operations which require constraints with degree up to $3$. All $7$ operation bits are used for these flags. The extra $e_0$ column is used for degree reduction of the three high-degree bits.
+
+| Operation    | Opcode value | Binary encoding | Operation group                        | Flag degree |
+| ------------ | :----------: | :-------------: | :-------------------------------------:| :---------: |
+| `HPERM`      | $80$         | `101_0000`      | [Crypto ops](crypto-ops.md)          | $5$         |
+| `MPVERIFY`   | $81$         | `101_0001`      | [Crypto ops](crypto-ops.md)          | $5$         |
+| `PIPE`       | $82$         | `101_0010`      | [I/O ops](io-ops.md)                 | $5$         |
+| `MSTREAM`    | $83$         | `101_0011`      | [I/O ops](io-ops.md)                 | $5$         |
+| `SPLIT`      | $84$         | `101_0100`      | [Flow control ops](../decoder/index.md) | $5$         |
+| `LOOP`       | $85$         | `101_0101`      | [Flow control ops](../decoder/index.md) | $5$         |
+| `SPAN`       | $86$         | `101_0110`      | [Flow control ops](../decoder/index.md) | $5$         |
+| `JOIN`       | $87$         | `101_0111`      | [Flow control ops](../decoder/index.md) | $5$         |
+| `DYN`        | $88$         | `101_1000`      | [Flow control ops](../decoder/index.md) | $5$         |
+| `<unused>`   | $89$         | `101_1001`      |                                        | $5$         |
+| `<unused>`   | $90$         | `101_1010`      |                                        | $5$         |
+| `<unused>`   | $91$         | `101_1011`      |                                        | $5$         |
+| `<unused>`   | $92$         | `101_1100`      |                                        | $5$         |
+| `<unused>`   | $93$         | `101_1101`      |                                        | $5$         |
+| `<unused>`   | $94$         | `101_1110`      |                                        | $5$         |
+| `<unused>`   | $95$         | `101_1111`      |                                        | $5$         |
+
+Note that the `SPLIT` and `LOOP` operations are grouped together under the common prefix `101010`, and thus can have a common flag of degree $4$ (using $e_0$ for degree reduction). This is important because both of these operations shift the stack to the left.
+
+
+Also, we need to make sure that `extra` register $e_0$, which is used to reduce the flag degree by $2$, is set to $1$ when $b_6 = 1$, $b_5 = 0$, and $b_4 = 1$:
+
+>$$
+e_0 - b_6 \cdot (1 - b_5) \cdot b_4 = 0 \text{ | degree} = 3
+$$
+
+### Very high-degree operations
+This group contains operations which require constraints with degree up to $5$.
+
+| Operation    | Opcode value | Binary encoding | Operation group                        | Flag degree |
+| ------------ | :----------: | :-------------: | :-------------------------------------:| :---------: |
+| `MRUPDATE`   | $96$         | `110_0000`      | [Crypto ops](crypto-ops.md)          | $4$         |
+| `PUSH`       | $100$        | `110_0100`      | [I/O ops](io-ops.md)                 | $4$         |
+| `SYSCALL`    | $104$        | `110_1000`      | [Flow control ops](../decoder/index.md) | $4$         |
+| `CALL`       | $108$        | `110_1100`      | [Flow control ops](../decoder/index.md) | $4$         |
+| `END`        | $112$        | `111_0000`      | [Flow control ops](../decoder/index.md) | $4$         |
+| `REPEAT`     | $116$        | `111_0100`      | [Flow control ops](../decoder/index.md) | $4$         |
+| `RESPAN`     | $120$        | `111_1000`      | [Flow control ops](../decoder/index.md) | $4$         |
+| `HALT`       | $124$        | `111_1100`      | [Flow control ops](../decoder/index.md) | $4$         |
+
+As mentioned previously, the last two bits of the opcode are not used in computation of the flag for these operations. We force these bits to always be set to $0$ with the following constraints:
+
+>$$
+b_6 \cdot b_5 \cdot b_0 = 0 \text{ | degree} = 3
+$$
+
+>$$
+b_6 \cdot b_5 \cdot b_1 = 0 \text{ | degree} = 3
+$$
+
+Also, we need to make sure that `extra` register $e_1$, which is used to reduce the flag degree by $1$, is set to $1$ when both $b_6$ and $b_5$ columns are set to $1$:
+
+>$$
+e_1 - b_6 \cdot b_5 = 0 \text{ | degree} = 2
+$$
+
+## Composite flags
+Using the operation flags defined above, we can compute several composite flags which are used by various constraints in the VM.
+
+### Shift right flag
+The right-shift flag indicates that an operation shifts the stack to the right. This flag is computed as follows:
+
+$$
+f_{shr} = (1 - b_6) \cdot b_5 \cdot b_4 + f_{u32split} + f_{push} \text{ | degree} = 6
+$$
+
+In the above, $(1 - b_6) \cdot b_5 \cdot b_4$ evaluates to $1$ for all [right stack shift](#right-stack-shift-operations) operations described previously. This works because all these operations have a common prefix `011`. We also need to add in flags for other operations which shift the stack to the right but are not a part of the above group (e.g., `PUSH` operation).
+
+### Shift left flag
+The left-shift flag indicates that a given operation shifts the stack to the left. To simplify the description of this flag, we will first compute the following intermediate variables:
+
+A flag which is set to $1$ when $f_{u32add3} = 1$ or $f_{u32madd} = 1$:
+
+$$
+f_{add3\_madd} = b_6 \cdot (1 - b_5) \cdot (1 - b_4) \cdot b_3 \cdot b_2 \text{ | degree} = 5
+$$
+
+A flag which is set to $1$ when $f_{split} = 1$ or $f_{loop} = 1$:
+
+$$
+f_{split\_loop} = e_0 \cdot (1 - b_3) \cdot b_2 \cdot (1 - b_1) \text{ | degree} = 4
+$$
+
+Using the above variables, we compute left-shift flag as follows:
+
+$$
+f_{shl} = (1 - b_6) \cdot b_5 \cdot (1 - b_4) + f_{add3\_madd} + f_{split\_loop} + f_{repeat} + f_{end} \cdot h_5 \text{ | degree} = 5
+$$
+
+In the above:
+* $(1 - b_6) \cdot b_5 \cdot (1 - b_4)$ evaluates to $1$ for all [left stack shift](#left-stack-shift-operations) operations described previously. This works because all these operations have a common prefix `010`.
+* $h_5$ is the helper register in the decoder which is set to $1$ when we are exiting a `LOOP` block, and to $0$ otherwise.
+
+Thus, similarly to the right-shift flag, we compute the value of the left-shift flag based on the prefix of the operation group which contains most left shift operations, and add in flag values for other operations which shift the stack to the left but are not a part of this group.
+
+### Control flow flag
+The control flow flag $f_{ctrl}$ is set to $1$ when a control flow operation is being executed by the VM, and to $0$ otherwise. Naively, this flag can be computed as follows:
+
+$$
+f_{ctrl} = f_{join} + f_{split} + f_{loop} + f_{repeat} + f_{span} + f_{respan} + f_{dyn} + f_{call} + f_{syscall} + f_{end} + f_{halt} \text{ | degree} = 6
+$$
+
+However, this can be computed more efficiently via the common operation prefixes for the two groups of control flow operations as follows.
+
+$$
+f_{span,join,split,loop} = e_0 \cdot (1 - b_3) \cdot b_2 \text{ | degree} = 3
+$$
+
+$$
+f_{end,repeat,respan,halt} = e_1 \cdot b_4  \text{ | degree} = 2
+$$
+
+$$
+f_{ctrl} = f_{span,join,split,loop} + f_{end,repeat,respan,halt} + f_{dyn} + f_{call} + f_{syscall} \text{ | degree} = 5
+$$
diff --git a/docs/miden/vm/design/stack/stack-ops.md b/docs/miden/vm/design/stack/stack-ops.md
new file mode 100644
index 000000000..2ef4f4d5e
--- /dev/null
+++ b/docs/miden/vm/design/stack/stack-ops.md
@@ -0,0 +1,250 @@
+In this section we describe the AIR constraints for Miden VM stack manipulation operations.
+
+## `PAD`
+
+The `PAD` operation pushes a $0$ onto the stack. The diagram below illustrates this graphically.
+
+![pad](../../../../img/miden/vm/design/stack/stack-ops/PAD.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_{0}' = 0 \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **Right shift** starting from position $0$.
+
+## `DROP`
+
+The `DROP` operation removes an element from the top of the stack. The diagram below illustrates this graphically.
+
+![drop](../../../../img/miden/vm/design/stack/stack-ops/DROP.png)
+
+The `DROP` operation shifts the stack by $1$ element to the left, but does not impose any additional constraints. The degree of left shift constraints is $1$.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $1$.
+
+## `DUP(n)`
+
+The `DUP(n)` operations push a copy of the $n$-th stack element onto the stack. For example, `DUP` (same as `DUP0`) pushes a copy of the top stack element onto the stack. Similarly, `DUP5` pushes a copy of the $6$-th stack element onto the stack. This operation is valid for $n \in \{0, ..., 7, 9, 11, 13, 15\}$. The diagram below illustrates this graphically.
+
+![dupn](../../../../img/miden/vm/design/stack/stack-ops/DUP(n).png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_{0}' - s_{n} = 0 \text{ for } n \in \{0, ..., 7, 9, 11, 13, 15\} \text{ | degree} = 1
+$$
+
+where $n$ is the depth of the stack from where the element has been copied.
+
+The effect of this operation on the rest of the stack is:
+
+* **Right shift** starting from position $0$.
+
+## `SWAP`
+
+The `SWAP` operation swaps the top two elements of the stack. The diagram below illustrates this graphically.
+
+![swap](../../../../img/miden/vm/design/stack/stack-ops/SWAP.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_{0}' - s_{1} = 0 \text{ | degree} = 1
+$$
+
+>$$
+s_{1}' - s_{0} = 0 \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $2$.
+
+## `SWAPW`
+
+The `SWAPW` operation swaps stack elements $0, 1, 2, 3$ with elements $4, 5, 6, 7$. The diagram below illustrates this graphically.
+
+![swapw](../../../../img/miden/vm/design/stack/stack-ops/SWAPW.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_{i}' - s_{i+4} = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 1
+$$
+
+>$$
+s_{i + 4}' - s_i = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $8$.
+
+## `SWAPW2`
+
+The `SWAPW2` operation swaps stack elements $0, 1, 2, 3$ with elements $8, 9, 10, 11$. The diagram below illustrates this graphically.
+
+![swapw2](../../../../img/miden/vm/design/stack/stack-ops/SWAPW2.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_i' - s_{i+8} = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 1
+$$
+
+>$$
+s_{i + 8}' - s_i = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** for elements $4, 5, 6, 7$.
+* **No change** starting from position $12$.
+
+## `SWAPW3`
+
+The `SWAPW3` operation swaps stack elements $0, 1, 2, 3$ with elements $12, 13, 14, 15$. The diagram below illustrates this graphically.
+
+![swapw3](../../../../img/miden/vm/design/stack/stack-ops/SWAPW3.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_i' - s_{i+12} = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 1
+$$
+
+>$$
+s_{i+12}' - s_i = 0 \text{ for } i \in \{0, 1, 2, 3\} \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** for elements $4, 5, 6, 7, 8, 9, 10, 11$.
+* **No change** starting from position $16$.
+
+## `SWAPDW`
+
+The `SWAPDW` operation swaps stack elements $[0, 8)$ with elements $[8, 16)$. The diagram below illustrates this graphically.
+
+![swapdw](../../../../img/miden/vm/design/stack/stack-ops/SWAPDW.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_i' - s_{i+8} = 0 \text{ for } i \in [0, 8)   \text{ | degree} = 1
+$$
+
+>$$
+s_{i+8}' - s_i = 0 \text{ for } i \in [0, 8)   \text{ | degree} = 1
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $16$.
+
+## `MOVUP(n)`
+
+The `MOVUP(n)` operation moves the $n$-th element of the stack to the top of the stack. For example, `MOVUP2` moves element at depth $2$ to the top of the stack. All elements with depth less than $n$ are shifted to the right by one, while elements with depth greater than $n$ remain in place, and the depth of the stack does not change. This operation is valid for $n \in [2, 9)$. The diagram below illustrates this graphically.
+
+![movup](../../../../img/miden/vm/design/stack/stack-ops/MOVUP(n).png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - s_n = 0 \text{ for } n \in [2, 9) \text{ | degree} = 1
+$$
+
+where $n$ is the depth of the element which is moved to the top of the stack.
+
+The effect of this operation on the rest of the stack is:
+
+* **Right shift** for elements between $0$ and $n-1$.
+* **No change** starting from position $n+1$.
+
+## `MOVDN(n)`
+
+The `MOVDN(n)` operation moves the top element of the stack to the $n$-th position. For example, `MOVDN2` moves the top element of the stack to depth $2$. All the elements with depth less than $n$ are shifted to the left by one, while elements with depth greater than $n$ remain in place, and the depth of the stack does not change. This operation is valid for $n \in [2, 9)$. The diagram below illustrates this graphically.
+
+![movdn](../../../../img/miden/vm/design/stack/stack-ops/MOVDN(n).png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_n' - s_0 = 0 \text{ for } n \in [2, 9) \text{ | degree} = 1
+$$
+
+where $n$ is the depth to which the top stack element is moved.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** for elements between $1$ and $n$.
+* **No change** starting from position $n+1$.
+
+## `CSWAP`
+
+The `CSWAP` operation pops an element off the stack and if the element is $1$, swaps the top two remaining elements. If the popped element is $0$, the rest of the stack remains unchanged. The diagram below illustrates this graphically.
+
+![cswap](../../../../img/miden/vm/design/stack/stack-ops/CSWAP.png)
+
+In the above:
+
+$$
+d = \begin{cases} a, & \text{if}\ c = 0 \\ b, & \text{if}\ c = 1\ \end{cases} e = \begin{cases} b, & \text{if}\ c = 0 \\ a, & \text{if}\ c = 1\ \end{cases}
+$$
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - s_{0} \cdot s_{2} - (1-s_0) \cdot s_1 = 0 \text{ | degree} = 2
+$$
+
+>$$
+s_1' - s_0 \cdot s_{1} - (1-s_0) \cdot s_2 = 0 \text{ | degree} = 2
+$$
+
+We also need to enforce that the value in $s_0$ is binary. This can be done with the following constraint:
+
+>$$
+s_0^2 - s_0 = 0 \text{ | degree} = 2
+$$
+
+The effect of this operation on the rest of the stack is:
+* **Left shift** starting from position $3$.
+
+## `CSWAPW`
+
+The `CSWAPW` operation pops an element off the stack and if the element is $1$, swaps elements $1, 2, 3, 4$ with elements $5, 6, 7, 8$. If the popped element is $0$, the rest of the stack remains unchanged. The diagram below illustrates this graphically.
+
+![cswapw](../../../../img/miden/vm/design/stack/stack-ops/CSWAPW.png)
+
+In the above:
+
+$$
+D = \begin{cases} A, & \text{if}\ c = 0 \\ B, & \text{if}\ c = 1\ \end{cases} E = \begin{cases} B, & \text{if}\ c = 0 \\ A, & \text{if}\ c = 1\ \end{cases}
+$$
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_i' - s_0 \cdot s_{i+5} - (1-s_0) \cdot s_{i+1} = 0 \text{ for } i \in [0, 4)  \text{ | degree} = 2
+$$
+
+>$$
+s_{i+4}' - s_0 \cdot s_{i+1} - (1-s_0) \cdot s_{i+5} = 0 \text{ for } i \in [0, 4) \text{ | degree} = 2
+$$
+
+We also need to enforce that the value in $s_0$ is binary. This can be done with the following constraint:
+
+>$$
+s_0^2 - s_0 = 0 \text{ | degree} = 2
+$$
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $9$.
diff --git a/docs/miden/vm/design/stack/system-ops.md b/docs/miden/vm/design/stack/system-ops.md
new file mode 100644
index 000000000..b73136446
--- /dev/null
+++ b/docs/miden/vm/design/stack/system-ops.md
@@ -0,0 +1,71 @@
+In this section we describe the AIR constraints for Miden VM system operations.
+
+## `NOOP`
+
+The `NOOP` operation advances the cycle counter but does not change the state of the operand stack (i.e., the depth of the stack and the values on the stack remain the same).
+
+The `NOOP` operation does not impose any constraints besides the ones needed to ensure that the entire state of the stack is copied over. This constraint looks like so:
+
+>$$
+s'_i - s_i = 0 \ \text{ for } i \in [0, 16) \text { | degree} = 1
+$$
+
+## `ASSERT`
+
+The `ASSERT` operation pops an element off the stack and checks if the popped element is equal to $1$. If the element is not equal to $1$, program execution fails.
+
+![assert](../../../../img/miden/vm/design/stack/system-ops/ASSERT.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0 - 1 = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $1$.
+
+## `FMPADD`
+
+The `FMPADD` operation pops an element off the stack, adds the current value of the `fmp` register to it, and pushes the result back onto the stack. The diagram below illustrates this graphically.
+
+![fmpadd](../../../../img/miden/vm/design/stack/system-ops/FMPADD.png)
+
+Stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' - (s_0 + fmp) = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **No change** starting from position $1$.
+
+## `FMPUPDATE`
+
+The `FMPUPDATE` operation pops an element off the stack and adds it to the current value of the `fmp` register. The diagram below illustrates this graphically.
+
+![fmpupdate](../../../../img/miden/vm/design/stack/system-ops/FMPUPDATE.png)
+
+The stack transition for this operation must satisfy the following constraint:
+
+>$$
+fmp' - (fmp + s_0) = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **Left shift** starting from position $1$.
+
+## `CLK`
+
+The `CLK` operation pushes the current value of the clock cycle onto the stack. The diagram below illustrates this graphically.
+
+![clk](../../../../img/miden/vm/design/stack/system-ops/CLK.png)
+
+The stack transition for this operation must satisfy the following constraint:
+
+>$$
+s_0' - clk = 0 \text{ | degree} = 1
+$$
+
+The effect on the rest of the stack is:
+* **Right shift** starting from position $0$.
diff --git a/docs/miden/vm/design/stack/u32-ops.md b/docs/miden/vm/design/stack/u32-ops.md
new file mode 100644
index 000000000..00ef6015a
--- /dev/null
+++ b/docs/miden/vm/design/stack/u32-ops.md
@@ -0,0 +1,304 @@
+In this section we describe semantics and AIR constraints of operations over u32 values (i.e., 32-bit unsigned integers) as they are implemented in Miden VM.
+
+## Range checks
+
+Most operations described below require some number of 16-bit range checks (i.e., verifying that the value of a field element is smaller than $2^{16}$). The number of required range checks varies between $2$ and $4$, depending on the operation. However, to simplify the constraint system, we force each relevant operation to consume exactly $4$ range checks.
+
+To perform these range checks, the prover puts the values to be range-checked into helper registers $h_0, ..., h_3$, and then updates the range checker bus column $b_{range}$ according to the LogUp construction described in the [range checker](../range.md) documentation, using multiplicity $1$ for each value.
+
+This operation is enforced via the following constraint. Note that since constraints cannot include divisions, the actual constraint which is enforced will be expressed equivalently with all denominators multiplied through, resulting in a constraint of degree 5.
+
+>$$
+b_{range}' = b_{range} - \frac{1}{(\alpha - h_0)} - \frac{1}{(\alpha - h_1)} - \frac{1}{(\alpha - h_2)} - \frac{1}{(\alpha - h_3)} \text{ | degree} = 5
+$$
+
+The above is just a partial constraint as it does not show the range checker's part of the constraint, which adds the required values into the bus column. It also omits the [selector flag](./op-constraints.md#operation-flags) which is used to turn this constraint on only when executing relevant operations.
+
+### Checking element validity
+
+Another primitive which is required by most of the operations described below is checking whether four 16-bit values form a valid field element. Assume $t_0$, $t_1$, $t_2$, and $t_3$ are known to be 16-bit values, and we want to verify that $2^{48} \cdot t_3 + 2^{32} \cdot t_2 + 2^{16} \cdot t_1 + t_0$ is a valid field element.
+
+For simplicity, let's denote:
+
+$$
+v_{hi} = 2^{16} \cdot t_3 + t_2 \\
+v_{lo} = 2^{16} \cdot t_1 + t_0
+$$
+
+We can then impose the following constraint to verify element validity:
+
+> $$
+\left(1 - m \cdot (2^{32} - 1 - v_{hi})\right) \cdot v_{lo} = 0 \text{ | degree} = 3
+$$
+
+Where $m$ is a value set non-deterministically by the prover.
+
+The above constraint should hold only if either of the following hold:
+
+* $v_{lo} = 0$
+* $v_{hi} \ne 2^{32} - 1$
+
+To satisfy the constraint when $v_{lo} \ne 0$, the prover needs to set $m = (2^{32} - 1 - v_{hi})^{-1}$, which is possible only when $v_{hi} \ne 2^{32} - 1$.
+
+This constraint is sufficient because modulus $2^{64} - 2^{32} + 1$ in binary representation is 32 ones, followed by 31 zeros, followed by a single one:
+
+$$
+1111111111111111111111111111111100000000000000000000000000000001
+$$
+
+This implies that the largest possible 64-bit value encoding a valid field element would be 32 ones, followed by 32 zeros:
+
+$$
+1111111111111111111111111111111100000000000000000000000000000000
+$$
+
+Thus, for a 64-bit value to encode a valid field element, either the lower 32 bits must be all zeros, or the upper 32 bits must not be all ones (which is $2^{32} - 1$).
+
+## `U32SPLIT`
+
+Assume $a$ is the element at the top of the stack. The `U32SPLIT` operation computes $(b,c) \leftarrow a$, where $b$ contains the lower 32 bits of $a$, and $c$ contains the upper 32 bits of $a$. The diagram below illustrates this graphically.
+
+![u32split](../../../../img/miden/vm/design/stack/u32-operations/U32SPLIT.png)
+
+To facilitate this operation, the prover sets values in $h_0, ..., h_3$ to 16-bit limbs of $a$ with $h_0$ being the least significant limb. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_{0} = 2^{48} \cdot h_3 + 2^{32} \cdot h_2 + 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_{1}' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_{0}' = 2^{16} \cdot h_3 + h_2 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks). Also, we need to make sure that values in $h_0, ..., h_3$, when combined, form a valid field element, which we can do by putting a nondeterministic value $m$ into helper register $h_4$ and using the technique described [here](#checking-element-validity).
+
+The effect of this operation on the rest of the stack is:
+
+* **Right shift** starting from position $1$.
+
+## `U32ASSERT2`
+
+Assume $a$ and $b$ are the elements at the top of the stack. The `U32ASSERT2` operation verifies that both $a$ and $b$ are smaller than $2^{32}$. The diagram below illustrates this graphically.
+
+![u32assert2](../../../../img/miden/vm/design/stack/u32-operations/U32ASSERT2.png)
+
+To facilitate this operation, the prover sets values in $h_0$ and $h_1$ to low and high 16-bit limbs of $a$, and values in $h_2$ and $h_3$ to low and high 16-bit limbs of $b$. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0' = 2^{16} \cdot h_3 + h_2 \text{ | degree} = 1
+$$
+
+>$$
+s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks).
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $0$ - i.e., the state of the stack does not change.
+
+## `U32ADD`
+
+Assume $a$ and $b$ are the values at the top of the stack which are known to be smaller than $2^{32}$. The `U32ADD` operation computes $(c,d) \leftarrow a + b$, where $c$ contains the low 32-bits of the result, and $d$ is the carry bit. The diagram below illustrates this graphically.
+
+![u32add](../../../../img/miden/vm/design/stack/u32-operations/U32ADD.png)
+
+To facilitate this operation, the prover sets values in $h_0$, $h_1$, and $h_2$ to 16-bit limbs of $a+b$ with $h_0$ being the least significant limb. Value in $h_3$ is set to $0$. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0 + s_1 = 2^{32} \cdot h_2 + 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_0' = h_2 \text{ | degree} = 1
+$$
+
+>$$
+s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks).
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $2$.
+
+## `U32ADD3`
+
+Assume $a$, $b$, $c$ are the values at the top of the stack which are known to be smaller than $2^{32}$. The `U32ADD3` operation computes $(d, e) \leftarrow a + b + c$, where $d$ and $e$ contain the low and the high 32-bits of the result respectively. The diagram below illustrates this graphically.
+
+![u32add3](../../../../img/miden/vm/design/stack/u32-operations/U32ADD3.png)
+
+To facilitate this operation, the prover sets values in $h_0$, $h_1$, and $h_2$ to 16-bit limbs of $a+b+c$ with $h_0$ being the least significant limb. Value in $h_3$ is set to $0$. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0 + s_1 + s_2 = 2^{32} \cdot h_2 + 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_0' = h_2 \text{ | degree} = 1
+$$
+
+>$$
+s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks).
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $3$.
+
+## `U32SUB`
+
+Assume $a$ and $b$ are the values at the top of the stack which are known to be smaller than $2^{32}$. The `U32SUB` operation computes $(c, d) \leftarrow a - b$, where $c$ contains the 32-bit result in two's complement, and $d$ is the borrow bit. The diagram below illustrates this graphically.
+
+![u32sub](../../../../img/miden/vm/design/stack/u32-operations/U32SUB.png)
+
+To facilitate this operation, the prover sets values in $h_0$ and $h_1$ to the low and the high 16-bit limbs of $a-b$ respectively. Values in $h_2$ and $h_3$ are set to $0$. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_1 = s_0 + s_1' - 2^{32} \cdot s_0' \text{ | degree} = 1
+$$
+
+>$$
+s_0'^2 - s_0' = 0 \text{ | degree} = 2
+$$
+
+>$$
+s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks).
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $2$.
+
+## `U32MUL`
+
+Assume $a$ and $b$ are the values at the top of the stack which are known to be smaller than $2^{32}$. The `U32MUL` operation computes $(c, d) \leftarrow a \cdot b$, where $c$ and $d$ contain the low and the high 32-bits of the result respectively. The diagram below illustrates this graphically.
+
+![u32mul](../../../../img/miden/vm/design/stack/u32-operations/U32MUL.png)
+
+To facilitate this operation, the prover sets values in $h_0, ..., h_3$ to 16-bit limbs of $a \cdot b$ with $h_0$ being the least significant limb. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0 \cdot s_1 = 2^{48} \cdot h_3 + 2^{32} \cdot h_2 + 2^{16} \cdot h_1 + h_0 \text{ | degree} = 2
+$$
+
+>$$
+s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_0' = 2^{16} \cdot h_3 + h_2 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks). Also, we need to make sure that values in $h_0, ..., h_3$, when combined, form a valid field element, which we can do by putting a nondeterministic value $m$ into helper register $h_4$ and using the technique described [here](#checking-element-validity).
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $2$.
+
+## `U32MADD`
+
+Assume $a$, $b$, $c$ are the values at the top of the stack which are known to be smaller than $2^{32}$. The `U32MADD` operation computes $(d, e) \leftarrow a + b \cdot c$, where $d$ and $e$ contain the low and the high 32-bits of $a + b \cdot c$ respectively. The diagram below illustrates this graphically.
+
+![u32madd](../../../../img/miden/vm/design/stack/u32-operations/U32MADD.png)
+
+To facilitate this operation, the prover sets values in $h_0, ..., h_3$ to 16-bit limbs of $a + b \cdot c$ with $h_0$ being the least significant limb. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_0 \cdot s_1 + s_2 = 2^{48} \cdot h_3 + 2^{32} \cdot h_2 + 2^{16} \cdot h_1 + h_0 \text{ | degree} = 2
+$$
+
+>$$
+s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_0' = 2^{16} \cdot h_3 + h_2 \text{ | degree} = 1
+$$
+
+In addition to the above constraints, we also need to verify that values in $h_0, ..., h_3$ are smaller than $2^{16}$, which we can do using 16-bit range checks as described [previously](#range-checks). Also, we need to make sure that values in $h_0, ..., h_3$, when combined, form a valid field element, which we can do by putting a nondeterministic value $m$ into helper register $h_4$ and using the technique described [here](#checking-element-validity).
+
+!!! note
+    The above constraints guarantee the correctness of the operation iff $a + b \cdot c$ cannot overflow the field modulus (which is the case for the field with modulus $2^{64} - 2^{32} + 1$).
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $3$.
+
+## `U32DIV`
+
+Assume $a$ and $b$ are the values at the top of the stack which are known to be smaller than $2^{32}$. The `U32DIV` operation computes $(c, d) \leftarrow a / b$, where $c$ contains the quotient and $d$ contains the remainder. The diagram below illustrates this graphically.
+
+![u32div](../../../../img/miden/vm/design/stack/u32-operations/U32DIV.png)
+
+To facilitate this operation, the prover sets values in $h_0$ and $h_1$ to 16-bit limbs of $a - c$, and values in $h_2$ and $h_3$ to 16-bit limbs of $b - d - 1$. Thus, stack transition for this operation must satisfy the following constraints:
+
+>$$
+s_1 = s_0 \cdot s_1' + s_0' \text{ | degree} = 2
+$$
+
+>$$
+s_1 - s_1' = 2^{16} \cdot h_1 + h_0 \text{ | degree} = 1
+$$
+
+>$$
+s_0 - s_0' - 1 = 2^{16} \cdot h_3 + h_2 \text{ | degree} = 1
+$$
+
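+The second constraint enforces that $s_1' \leq s_1$, while the third constraint enforces that $s_0' < s_0$. Both guarantees rely on the values in $h_0, ..., h_3$ being smaller than $2^{16}$, which we verify using 16-bit range checks as described [previously](#range-checks).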
+
+The effect of this operation on the rest of the stack is:
+
+* **No change** starting from position $2$.
+
+## `U32AND`
+
+Assume $a$ and $b$ are the values at the top of the stack. The `U32AND` operation computes $c \leftarrow (a \land b)$, where $c$ is the result of performing a bitwise AND on $a$ and $b$. The diagram below illustrates this graphically.
+
+![u32and](../../../../img/miden/vm/design/stack/u32-operations/U32AND.png)
+
+To facilitate this operation, we will need to make a request to the chiplet bus $b_{chip}$ by dividing its current value by the value representing bitwise operation request. This can be enforced with the following constraint:
+
+>$$
+b_{chip}' \cdot \left(\alpha_0 + \alpha_1 \cdot op_{u32and} + \alpha_2 \cdot s_0 + \alpha_3 \cdot s_1 +  \alpha_4 \cdot s_0'  \right) = b_{chip} \text{ | degree} = 2
+$$
+
+In the above, $op_{u32and}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the bitwise `AND` operation.
+
+!!! note
+    Unlike for many other u32 operations, bitwise AND operation does not assume that the values at the top of the stack are smaller than $2^{32}$. This is because the lookup will fail for any inputs which are not 32-bit integers.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $2$.
+
+## `U32XOR`
+
+Assume $a$ and $b$ are the values at the top of the stack. The `U32XOR` operation computes $c \leftarrow (a \oplus b)$, where $c$ is the result of performing a bitwise XOR on $a$ and $b$. The diagram below illustrates this graphically.
+
+![u32xor](../../../../img/miden/vm/design/stack/u32-operations/U32XOR.png)
+
+To facilitate this operation, we will need to make a request to the chiplet bus $b_{chip}$ by dividing its current value by the value representing bitwise operation request. This can be enforced with the following constraint:
+
+> $$
+b_{chip}' \cdot \left(\alpha_0 + \alpha_1 \cdot op_{u32xor} + \alpha_2 \cdot s_0 + \alpha_3 \cdot s_1 +  \alpha_4 \cdot s_0'  \right) = b_{chip} \text{ | degree} = 2
+$$
+
+In the above, $op_{u32xor}$ is the unique [operation label](../chiplets/index.md#operation-labels) of the bitwise `XOR` operation.
+
+!!! note
+    Unlike for many other u32 operations, bitwise XOR operation does not assume that the values at the top of the stack are smaller than $2^{32}$. This is because the lookup will fail for any inputs which are not 32-bit integers.
+
+The effect of this operation on the rest of the stack is:
+
+* **Left shift** starting from position $2$.
diff --git a/docs/miden/vm/intro/index.md b/docs/miden/vm/intro/index.md
new file mode 100644
index 000000000..adfa724f9
--- /dev/null
+++ b/docs/miden/vm/intro/index.md
@@ -0,0 +1,48 @@
+## Welcome to Polygon Miden VM
+
+Miden VM is a zero-knowledge virtual machine written in Rust. For any program executed on Miden VM, a STARK-based proof of execution is automatically generated. This proof can then be used by anyone to verify that the program was executed correctly without the need for re-executing the program or even knowing the contents of the program.
+
+!!! warning
+    - Miden is currently in development.
+    - A public testnet for Polygon Miden is expected in Q1 2024.
+
+## Status and features
+
+Miden VM is currently on release v0.7. In this release, most of the core features of the VM have been stabilized, and most of the STARK proof generation has been implemented. While we expect to keep making changes to the VM internals, the external interfaces should remain relatively stable, and we will do our best to minimize the amount of breaking changes going forward.
+
+At this point, Miden VM is good enough for experimentation, and even for real-world applications, but it is not yet ready for production use. The codebase has not been audited and contains known and unknown bugs and security flaws.
+
+### Feature highlights
+
+Miden VM is a fully-featured virtual machine. Despite being optimized for zero-knowledge proof generation, it provides all the features one would expect from a regular VM. To highlight a few:
+
+* **Flow control.** Miden VM is Turing-complete and supports familiar flow control structures such as conditional statements and counter/condition-controlled loops. There are no restrictions on the maximum number of loop iterations or the depth of control flow logic.
+* **Procedures.** Miden assembly programs can be broken into subroutines called *procedures*. This improves code modularity and helps reduce the size of Miden VM programs.
+* **Execution contexts.** Miden VM program execution can span multiple isolated contexts, each with its own dedicated memory space. The contexts are separated into the *root context* and *user contexts*. The root context can be accessed from user contexts via customizable kernel calls.
+* **Memory.** Miden VM supports read-write random-access memory. Procedures can reserve portions of global memory for easier management of local variables.
+* **u32 operations.** Miden VM supports native operations with 32-bit unsigned integers. This includes basic arithmetic, comparison, and bitwise operations.
+* **Cryptographic operations.** Miden assembly provides built-in instructions for computing hashes and verifying Merkle paths. These instructions use Rescue Prime Optimized hash function (which is the native hash function of the VM).
+* **External libraries.** Miden VM supports compiling programs against pre-defined libraries. The VM ships with one such library: Miden `stdlib` which adds support for such things as 64-bit unsigned integers. Developers can build other similar libraries to extend the VM's functionality in ways which fit their use cases.
+* **Nondeterminism**. Unlike traditional virtual machines, Miden VM supports nondeterministic programming. This means a prover may do additional work outside of the VM and then provide execution *hints* to the VM. These hints can be used to dramatically speed up certain types of computations, as well as to supply secret inputs to the VM.
+* **Customizable hosts.** Miden VM can be instantiated with user-defined hosts. These hosts are used to supply external data to the VM during execution/proof generation (via nondeterministic inputs) and can connect the VM to arbitrary data sources (e.g., a database or RPC calls).
+
+### Planned features
+
+In the coming months we plan to finalize the design of the VM and implement support for the following features:
+
+* **Recursive proofs.** Miden VM will soon be able to verify a proof of its own execution. This will enable infinitely recursive proofs, an extremely useful tool for real-world applications.
+* **Better debugging.** Miden VM will provide a better debugging experience including the ability to place breakpoints, better source mapping, and more complete program analysis info.
+* **Faulty execution.** Miden VM will support generating proofs for programs with faulty execution (a notoriously complex task in ZK context). That is, it will be possible to prove that execution of some program resulted in an error.
+
+## Structure of the documentation
+
+This documentation is meant to provide an in-depth description of Miden VM. It is organized as follows:
+
+* In the introduction, we provide a high-level overview of Miden VM and describe how to run simple programs.
+* In the user documentation section, we provide developer-focused documentation useful to those who want to develop on Miden VM or build compilers from higher-level languages to Miden assembly (the native language of Miden VM).
+* In the design section, we provide in-depth descriptions of the VM's internals, including all AIR constraints for the proving system. We also provide the rationale for settling on specific design choices.
+* Finally, in the background material section, we provide references to materials which could be useful for learning more about STARKs - the proving system behind Miden VM.
+
+## License
+
+Licensed under the [MIT license](http://opensource.org/licenses/MIT).
diff --git a/docs/miden/vm/intro/overview.md b/docs/miden/vm/intro/overview.md
new file mode 100644
index 000000000..e2f8f7f45
--- /dev/null
+++ b/docs/miden/vm/intro/overview.md
@@ -0,0 +1,52 @@
+Miden VM is a stack machine. The base data type of the VM is a field element in a 64-bit [prime field](https://en.wikipedia.org/wiki/Finite_field) defined by modulus $p = 2^{64} - 2^{32} + 1$. This means that all values that the VM operates with are field elements in this field (i.e., values between $0$ and $2^{64} - 2^{32}$, both inclusive).
+
+Miden VM consists of four high-level components as illustrated below.
+
+![](../../../img/miden/vm/intro/vm_components.png)
+
+These components are:
+
+* **Stack**: A push-down stack where each item is a field element. Most assembly instructions operate with values located on the stack. The stack can grow up to $2^{32}$ items deep, however, only the top 16 items are directly accessible.
+* **Memory**: A linear random-access read-write memory. The memory is word-addressable, meaning, four elements are located at each address, and we can read and write elements to/from memory in batches of four. Memory addresses can be in the range $[0, 2^{32})$.
+* **Chiplets**: Specialized circuits for accelerating certain types of computations. These include Rescue Prime Optimized (RPO) hash function, 32-bit binary operations, and 16-bit range checks.
+* **Host**: A way for the prover to communicate with the VM during runtime. This includes responding to the VM's requests for non-deterministic inputs and handling messages sent by the VM (e.g., for debugging purposes). The requests for non-deterministic inputs are handled by the host's *advice provider*.
+
+Miden VM comes with a default implementation of the host interface (with an in-memory advice provider). However, the users are able to provide their own implementations which can connect the VM to arbitrary data sources (e.g., a database or RPC calls) and define custom logic for handling events emitted by the VM.
+
+## Writing programs
+
+Our goal is to make Miden VM an easy compilation target for high-level languages such as Rust, Move, Sway, and others. We believe it is important to let people write programs in the languages of their choice. However, compilers to help with this have not been developed yet. Thus, for now, the primary way to write programs for Miden VM is to use [Miden assembly](../user-docs/assembly/index.md).
+
+While writing programs in assembly is far from ideal, Miden assembly does make this task a little bit easier by supporting high-level flow control structures and named procedures.
+
+## Inputs and outputs
+
+External inputs can be provided to Miden VM in two ways:
+
+1. Public inputs can be supplied to the VM by initializing the stack with desired values before a program starts executing. Any number of stack items can be initialized in this way, but providing a large number of public inputs will increase the cost for the verifier.
+2. Secret (or nondeterministic) inputs can be supplied to the VM via the [*advice provider*](#nondeterministic-inputs). There is no limit on how much data the advice provider can hold.
+
+After a program finishes executing, the elements remaining on the stack become the outputs of the program. Since these outputs will be public inputs for the verifier, having a large stack at the end of execution will increase cost to the verifier. Therefore, it's best to drop unneeded output values. We've provided the [`truncate_stack`](../user-docs/stdlib/sys.md) utility function in the standard library for this purpose.
+
+The number of public inputs and outputs of a program can be reduced by making use of the advice stack and Merkle trees. Just 4 elements are sufficient to represent a root of a Merkle tree, which can be expanded into an arbitrary number of values.
+
+For example, if we wanted to provide a thousand public input values to the VM, we could put these values into a Merkle tree, initialize the stack with the root of this tree, initialize the advice provider with the tree itself, and then retrieve values from the tree during program execution using `mtree_get` instruction (described [here](../user-docs/assembly/cryptographic-operations.md#hashing-and-merkle-trees)).
+
+### Stack depth restrictions
+
+For reasons explained [here](../design/stack/index.md), the VM imposes the restriction that the stack depth cannot be smaller than $16$. This has the following effects:
+
+- When initializing a program with fewer than $16$ inputs, the VM will pad the stack with zeros to ensure the depth is $16$ at the beginning of execution.
+- If an operation would result in the stack depth dropping below $16$, the VM will insert a zero at the deep end of the stack to make sure the depth stays at $16$.
+
+### Nondeterministic inputs
+
+The *advice provider* component is responsible for supplying nondeterministic inputs to the VM. These inputs only need to be known to the prover (i.e., they do not need to be shared with the verifier).
+
+The advice provider consists of three components:
+
+* **Advice stack**: A one-dimensional array of field elements. Being a stack, the VM can either push new elements onto the advice stack, or pop the elements from its top.
+* **Advice map**: A key-value map where keys are words and values are vectors of field elements. The VM can copy values from the advice map onto the advice stack as well as insert new values into the advice map (e.g., from a region of memory).
+* **Merkle store**: Contains structured data reducible to Merkle paths. Some examples of such structures are: Merkle tree, Sparse Merkle Tree, and a collection of Merkle paths. The VM can request Merkle paths from the Merkle store, as well as mutate it by updating or merging nodes contained in the store.
+
+The prover initializes the advice provider prior to executing a program, and from that point on the advice provider is manipulated solely by executing operations on the VM.
diff --git a/docs/miden/vm/intro/performance.md b/docs/miden/vm/intro/performance.md
new file mode 100644
index 000000000..df67981b2
--- /dev/null
+++ b/docs/miden/vm/intro/performance.md
@@ -0,0 +1,68 @@
+The benchmarks below should be viewed only as a rough guide for expected future performance. The reasons for this are twofold:
+
+1. Not all constraints have been implemented yet, and we expect that there will be some slowdown once constraint evaluation is completed.
+2. Many optimizations have not been applied yet, and we expect that there will be some speedup once we dedicate some time to performance optimizations.
+
+Overall, we don't expect the benchmarks to change significantly, but there will definitely be some deviation from the below numbers in the future.
+
+A few general notes on performance:
+
+* Execution time is dominated by proof generation time. In fact, the time needed to run the program is usually under 1% of the time needed to generate the proof.
+* Proof verification time is really fast. In most cases it is under 1 ms, but sometimes gets as high as 2 ms or 3 ms.
+* Proof generation process is dynamically adjustable. In general, there is a trade-off between execution time, proof size, and security level (i.e. for a given security level, we can reduce proof size by increasing execution time, up to a point).
+* Both proof generation and proof verification times are greatly influenced by the hash function used in the STARK protocol. In the benchmarks below, we use BLAKE3, which is a really fast hash function.
+
+## Single-core prover performance
+
+When executed on a single CPU core, the current version of Miden VM operates at around 20 - 25 KHz. In the benchmarks below, the VM executes a Fibonacci calculator program on Apple M1 Pro CPU in a single thread. The generated proofs have a target security level of 96 bits.
+
+| VM cycles       | Execution time | Proving time | RAM consumed  | Proof size |
+| :-------------: | :------------: | :----------: | :-----------: | :--------: |
+| 2<sup>10</sup>  |  1 ms          | 60 ms        | 20 MB         | 46 KB      |
+| 2<sup>12</sup>  |  2 ms          | 180 ms       | 52 MB         | 56 KB      |
+| 2<sup>14</sup>  |  8 ms          | 680 ms       | 240 MB        | 65 KB      |
+| 2<sup>16</sup>  |  28 ms         | 2.7 sec      | 950 MB        | 75 KB      |
+| 2<sup>18</sup>  |  81 ms         | 11.4 sec     | 3.7 GB        | 87 KB      |
+| 2<sup>20</sup>  |  310 ms        | 47.5 sec     | 14 GB         | 100 KB     |
+
+As can be seen from the above, proving time roughly doubles with every doubling in the number of cycles, but proof size grows much slower.
+
+We can also generate proofs at a higher security level. The cost of doing so is roughly doubling of proving time and roughly 40% increase in proof size. In the benchmarks below, the same Fibonacci calculator program was executed on Apple M1 Pro CPU at 128-bit target security level:
+
+| VM cycles       | Execution time | Proving time | RAM consumed  | Proof size |
+| :-------------: | :------------: | :----------: | :-----------: | :--------: |
+| 2<sup>10</sup>  | 1 ms           | 120 ms       | 30 MB         | 61 KB      |
+| 2<sup>12</sup>  | 2 ms           | 460 ms       | 106 MB        | 77 KB      |
+| 2<sup>14</sup>  | 8 ms           | 1.4 sec      | 500 MB        | 90 KB      |
+| 2<sup>16</sup>  | 27 ms          | 4.9 sec      | 2.0 GB        | 103 KB     |
+| 2<sup>18</sup>  | 81 ms          | 20.1 sec     | 8.0 GB        | 121 KB     |
+| 2<sup>20</sup>  | 310 ms         | 90.3 sec     | 20.0 GB       | 138 KB     |
+
+## Multi-core prover performance
+STARK proof generation is massively parallelizable. Thus, by taking advantage of multiple CPU cores we can dramatically reduce proof generation time. For example, when executed on an 8-core CPU (Apple M1 Pro), the current version of Miden VM operates at around 100 KHz. And when executed on a 64-core CPU (Amazon Graviton 3), the VM operates at around 250 KHz.
+
+In the benchmarks below, the VM executes the same Fibonacci calculator program for 2<sup>20</sup> cycles at 96-bit target security level:
+
+| Machine                        | Execution time | Proving time | Execution % | Implied Frequency |
+| ------------------------------ | :------------: | :----------: | :---------: | :---------------: |
+| Apple M1 Pro (16 threads)      | 310 ms         | 7.0 sec      | 4.2%        | 140 KHz           |
+| Apple M2 Max (16 threads)      | 280 ms         | 5.8 sec      | 4.5%        | 170 KHz           |
+| AMD Ryzen 9 5950X (16 threads) | 270 ms         | 10.0 sec     | 2.6%        | 100 KHz           |
+| Amazon Graviton 3 (64 threads) | 330 ms         | 3.6 sec      | 8.5%        | 265 KHz           |
+
+### Recursive proofs
+Proofs in the above benchmarks are generated using BLAKE3 hash function. While this hash function is very fast, it is not very efficient to execute in Miden VM. Thus, proofs generated using BLAKE3 are not well-suited for recursive proof verification. To support efficient recursive proofs, we need to use an arithmetization-friendly hash function. Miden VM natively supports Rescue Prime Optimized (RPO), which is one such hash function. One of the downsides of arithmetization-friendly hash functions is that they are considerably slower than regular hash functions.
+
+In the benchmarks below we execute the same Fibonacci calculator program for 2<sup>20</sup> cycles at 96-bit target security level using RPO hash function instead of BLAKE3:
+
+| Machine                        | Execution time | Proving time | Proving time (HW) |
+| ------------------------------ | :------------: | :----------: | :---------------: |
+| Apple M1 Pro (16 threads)      | 310 ms         | 94.3 sec     | 42.0 sec          |
+| Apple M2 Max (16 threads)      | 280 ms         | 75.1 sec     | 20.9 sec          |
+| AMD Ryzen 9 5950X (16 threads) | 270 ms         | 59.3 sec     |                   |
+| Amazon Graviton 3 (64 threads) | 330 ms         | 21.7 sec     | 14.9 sec          |
+
+In the above, proof generation on some platforms can be hardware-accelerated. Specifically:
+
+* On Apple M1/M2 platforms the built-in GPU is used for a part of the proof generation process.
+* On the Graviton platform, SVE vector extension is used to accelerate RPO computations.
diff --git a/docs/miden/vm/intro/usage.md b/docs/miden/vm/intro/usage.md
new file mode 100644
index 000000000..c00ee4f07
--- /dev/null
+++ b/docs/miden/vm/intro/usage.md
@@ -0,0 +1,136 @@
+Before using Miden VM, make sure you have Rust [installed](https://www.rust-lang.org/tools/install). Miden VM v0.7 requires Rust version **1.67** or later.
+
+Miden VM consists of several crates, each of which exposes a small set of functionality. The most notable of these crates are:
+
+* [miden-processor](https://crates.io/crates/miden-processor): Execute Miden VM programs.
+* [miden-prover](https://crates.io/crates/miden-prover): Execute Miden VM programs and generate proofs of their execution.
+* [miden-verifier](https://crates.io/crates/miden-verifier): Verify proofs of program execution generated by Miden VM prover.
+
+The above functionality is also exposed via the single [miden-vm](https://crates.io/crates/miden-vm) crate, which also provides a CLI interface for interacting with Miden VM.
+
+## CLI interface
+
+### Compiling Miden VM
+
+To compile Miden VM into a binary, we have a [Makefile](https://www.gnu.org/software/make/manual/make.html) with the following tasks:
+
+```sh
+make exec
+```
+
+This will place an optimized, multi-threaded `miden` executable into the `./target/optimized` directory. It is equivalent to executing:
+
+```sh
+cargo build --profile optimized --features concurrent,executable
+```
+
+If you would like to enable single-threaded mode, you can compile Miden VM using the following command:
+
+```sh
+cargo build --profile optimized --features executable
+```
+
+For a faster build, you can compile with fewer optimizations, replacing `--profile optimized` with `--release`. Example:
+
+```sh
+cargo build --release --features concurrent,executable
+```
+
+In this case, the `miden` executable will be placed in the `./target/release` directory.
+
+### Controlling parallelism
+
+Internally, Miden VM uses [rayon](https://github.com/rayon-rs/rayon) for parallel computations. To control the number of threads used to generate a STARK proof, you can use the `RAYON_NUM_THREADS` environment variable.
+
+### GPU acceleration
+
+Miden VM proof generation can be accelerated via GPUs. Currently, GPU acceleration is enabled only on Apple silicon hardware (via [Metal](https://en.wikipedia.org/wiki/Metal_(API))). To compile Miden VM with Metal acceleration enabled, you can run the following command:
+
+```
+make exec-metal
+```
+
+Similar to `make exec` command, this will place the resulting `miden` executable into the `./target/optimized` directory.
+
+Currently, GPU acceleration is applicable only to recursive proofs which can be generated using the `-r` flag.
+
+### SIMD acceleration
+
+Miden VM execution and proof generation can be accelerated via vectorized instructions. Currently, SIMD acceleration can be enabled only on platforms supporting [SVE](https://en.wikipedia.org/wiki/AArch64#Scalable_Vector_Extension_(SVE)) instructions (e.g., Graviton 3). To compile Miden VM with SVE acceleration enabled, you can run the following command:
+
+```sh
+make exec-graviton
+```
+
+This will place the resulting `miden` executable into the `./target/optimized` directory.
+
+Similar to Metal acceleration, SVE acceleration is currently applicable only to recursive proofs which can be generated using the `-r` flag.
+
+### Running Miden VM
+
+Once the executable has been compiled, you can run Miden VM like so:
+
+```sh
+./target/optimized/miden [subcommand] [parameters]
+```
+
+Currently, Miden VM can be executed with the following subcommands:
+
+* `run`: Executes a Miden assembly program and outputs the result, but will not generate a proof of execution.
+* `prove`: Executes a Miden assembly program, and will also generate a STARK proof of execution.
+* `verify`: Verifies a previously generated proof of execution for a given program.
+* `compile`: Compiles a Miden assembly program (i.e., builds a program [MAST](../design/programs.md)) and outputs stats about the compilation process.
+* `debug`: Instantiates a [Miden debugger](../tools/debugger.md) against the specified Miden assembly program and inputs.
+* `analyze`: Runs a Miden assembly program against specific inputs and will output stats about its execution.
+* `repl`: Initiates the [Miden REPL](../tools/repl.md) tool.
+
+All of the above subcommands require various parameters to be provided. To get more detailed help on what is needed for a given subcommand, you can run the following:
+
+```sh
+./target/optimized/miden [subcommand] --help
+```
+
+For example:
+
+```sh
+./target/optimized/miden prove --help
+```
+
+To execute a program using the Miden VM there needs to be a `.masm` file containing the Miden Assembly code and a `.inputs` file containing the inputs.
+
+### Inputs
+
+As described [here](overview.md#inputs-and-outputs), the Miden VM can consume public and secret inputs.
+
+* Public inputs:
+      * `operand_stack` - can be supplied to the VM to initialize the stack with the desired values before a program starts executing. There is no limit on the number of stack inputs that can be initialized in this way, although increasing the number of public inputs increases the cost to the verifier.
+* Secret (or nondeterministic) inputs:
+      * `advice_stack` - can be supplied to the VM. There is no limit on how much data the advice provider can hold. This is provided as a string array where each string entry represents a field element.
+      * `advice_map` - is supplied as a map of 64-character hex keys, each mapped to an array of numbers.  The hex keys are interpreted as 4 field elements and the arrays of numbers are interpreted as arrays of field elements.
+      * `merkle_store` - the Merkle store is a container that allows the user to define `merkle_tree`, `sparse_merkle_tree` and `partial_merkle_tree` data structures.
+        * `merkle_tree` - is supplied as an array of 64-character hex values where each value represents a leaf (4 elements) in the tree.
+        * `sparse_merkle_tree` - is supplied as an array of tuples of the form (number, 64-character hex string).  The number represents the leaf index and the hex string represents the leaf value (4 elements).
+        * `partial_merkle_tree` - is supplied as an array of tuples of the form ((number, number), 64-character hex string). The internal tuple represents the leaf depth and index at this depth, and the hex string represents the leaf value (4 elements).
+
+!!! info
+    Check out the [comparison example](https://github.com/0xPolygonMiden/examples/blob/main/examples/comparison.masm) to see how secret inputs work.
+
+After a program finishes executing, the elements that remain on the stack become the outputs of the program, along with the overflow addresses (`overflow_addrs`) that are required to reconstruct the [stack overflow table](../design/stack/index.md#overflow-table).
+
+## Fibonacci example
+
+In the `miden/examples/fib` directory, we provide a very simple Fibonacci calculator example. This example computes the 1001st term of the Fibonacci sequence. You can execute this example on Miden VM like so:
+
+```sh
+./target/optimized/miden run -a miden/examples/fib/fib.masm -n 1
+```
+
+This will run the example code to completion and will output the top element remaining on the stack.
+
+If you want the output of the program in a file, you can use the `--output` or `-o` flag and specify the path to the output file. For example:
+
+```sh
+./target/optimized/miden run -a miden/examples/fib/fib.masm -o fib.out
+```
+
+This will dump the output of the program into the `fib.out` file. The output file will contain the state of the stack at the end of the program execution.
diff --git a/docs/miden/vm/tools/debugger.md b/docs/miden/vm/tools/debugger.md
new file mode 100644
index 000000000..c0134698a
--- /dev/null
+++ b/docs/miden/vm/tools/debugger.md
@@ -0,0 +1,55 @@
+The Miden debugger is a command-line interface (CLI) application, inspired by [GNU gdb](https://sourceware.org/gdb/), which allows debugging of Miden assembly (MASM) programs. The debugger allows the user to step through the execution of the program, both forward and backward, either per clock cycle tick, or via breakpoints.
+
+The Miden debugger supports the following commands:
+
+| Command | Shortcut | Arguments | Description |
+| --- | --- | --- | --- |
+| next | n | count? | Steps `count` clock cycles. Will step `1` cycle if `count` is omitted. |
+| continue | c | - | Executes the program until completion, failure or a breakpoint. |
+| back | b | count? | Backward step `count` clock cycles. Will back-step `1` cycle if `count` is omitted. |
+| rewind | r | - | Executes the program backwards until the beginning, failure or a breakpoint. |
+| print | p | - | Displays the complete state of the virtual machine. |
+| print mem | p m | address? | Displays the memory value at `address`. If `address` is omitted, displays all the memory values. |
+| print stack | p s | index? | Displays the stack value at `index`. If `index` is omitted, displays all the stack values. |
+| clock | c | - | Displays the current clock cycle. |
+| quit | q | - | Quits the debugger. |
+| help | h | - | Displays the help message. |
+
+In order to start debugging, the user should provide a `MASM` program:
+
+```shell
+cargo run --features executable -- debug --assembly miden/examples/nprime/nprime.masm
+```
+
+The expected output is:
+
+```txt
+============================================================
+Debug program
+============================================================
+Reading program file `miden/examples/nprime/nprime.masm`
+Compiling program... done (16 ms)
+Debugging program with hash 11dbbddff27e26e48be3198133df8cbed6c5875d0fb
+606c9f037c7893fde4118...
+Reading input file `miden/examples/nprime/nprime.inputs`
+Welcome! Enter `h` for help.
+>>
+```
+
+In order to add a breakpoint, the user should insert a `breakpoint` instruction into the MASM file. This will generate a `Noop` operation that will be decorated with the debug break configuration. This is a provisional solution until the source mapping is implemented.
+
+The following example will halt on the third instruction of `foo`:
+
+```
+proc.foo
+    dup
+    dup.2
+    breakpoint
+    swap
+    add.1
+end
+
+begin
+    exec.foo
+end
+```
diff --git a/docs/miden/vm/tools/index.md b/docs/miden/vm/tools/index.md
new file mode 100644
index 000000000..cb7b927bf
--- /dev/null
+++ b/docs/miden/vm/tools/index.md
@@ -0,0 +1,13 @@
+The following tools are available for interacting with Miden VM:
+
+* Via the [miden-vm](https://crates.io/crates/miden-vm) crate (or within the Miden VM repo):
+    * [CLI](../intro/usage.md#cli-interface)
+    * [Debugger](./debugger.md)
+    * [REPL](./repl.md)
+* Via your browser:
+    * The interactive [Miden VM Playground](https://0xpolygonmiden.github.io/examples/) for writing, executing, proving, and verifying programs from your browser.
+
+The following resources are available to help you get started programming with Miden VM more quickly:
+
+* The [Miden VM examples repo](https://github.com/0xPolygonMiden/examples) contains examples of programs written in Miden Assembly.
+* Our [Scaffolded repo](https://github.com/0xPolygonMiden/zkhack-scaffold/) can be cloned for starting a new Rust project using Miden VM.
diff --git a/docs/miden/vm/tools/repl.md b/docs/miden/vm/tools/repl.md
new file mode 100644
index 000000000..284782c4f
--- /dev/null
+++ b/docs/miden/vm/tools/repl.md
@@ -0,0 +1,137 @@
+The Miden read–eval–print loop (REPL) is a Miden shell that allows for quick and easy debugging of Miden assembly. After the REPL gets initialized, you can execute any Miden instruction, undo executed instructions, check the state of the stack and memory at a given point, and do many other useful things! When the REPL is exited, a `history.txt` file is saved. One thing to note is that all the REPL native commands start with an `!` to differentiate them from regular assembly instructions.
+
+Miden REPL can be started via the CLI [repl](../intro/usage.md#cli-interface) command like so:
+
+```sh
+./target/optimized/miden repl
+```
+
+## Miden assembly instruction
+
+All Miden instructions mentioned in the [Miden Assembly sections](../user-docs/assembly/index.md) are valid. One can either input instructions one by one or multiple instructions in one input.
+
+For example, the below two commands will result in the same output.
+
+```sh
+>> push.1
+>> push.2
+>> push.3
+```
+
+```sh
+push.1 push.2 push.3
+```
+
+To execute a control flow operation, one must write the entire statement in a single line with spaces between individual operations.
+
+```sh
+repeat.20
+    pow2
+end
+```
+
+The above example should be written as follows in the REPL tool:
+
+```sh
+repeat.20 pow2 end
+```
+
+### !help
+
+The `!help` command prints out all the available commands in the REPL tool.
+
+### !program
+
+The `!program` command prints out the entire Miden program being executed. E.g., in the below scenario:
+
+```sh
+>> push.1.2.3.4
+>> repeat.16 pow2 end
+>> u32checked_add
+
+>> !program
+begin
+    push.1.2.3.4
+    repeat.16 pow2 end
+    u32checked_add
+end
+```
+
+### !stack
+
+The `!stack` command prints out the state of the stack at the last executed instruction. Since the stack always contains at least 16 elements, 16 or more elements will be printed out (even if all of them are zeros).
+
+```sh
+>> push.1 push.2 push.3 push.4 push.5
+>> exp
+>> u32checked_mul
+>> swap
+>> eq.2
+>> assert
+```
+
+The `!stack` command will print out the following state of the stack:
+
+```sh
+>> !stack
+3072 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+```
+
+### !mem
+
+The `!mem` command prints out the contents of all initialized memory locations. For each such location, the address, along with its memory values, is printed. Recall that four elements are stored at each memory address.
+
+If the memory has at least one value that has been initialized:
+
+```sh
+>> !mem
+7: [1, 2, 0, 3]
+8: [5, 7, 3, 32]
+9: [9, 10, 2, 0]
+```
+
+If the memory has not yet been initialized:
+
+```sh
+>> !mem
+The memory has not been initialized yet
+```
+
+### !mem[addr]
+
+The `!mem[addr]` command prints out memory contents at the address specified by `addr`.
+
+If the `addr` has been initialized:
+
+```sh
+>> !mem[9]
+9: [9, 10, 2, 0]
+```
+
+If the `addr` has not been initialized:
+
+```sh
+>> !mem[87]
+Memory at address 87 is empty
+```
+
+### !undo
+
+The `!undo` command reverts to the previous state of the stack and memory by dropping off the last executed assembly instruction from the program. One could use `!undo` as often as they want to restore the state of a stack and memory $n$ instructions ago (provided there are $n$ instructions in the program). The `!undo` command will result in an error if no remaining instructions are left in the Miden program.
+
+```sh
+>> push.1 push.2 push.3
+>> push.4
+>> !stack
+4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0
+
+>> push.5
+>> !stack
+5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0
+
+>> !undo
+4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0
+
+>> !undo
+3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+```
diff --git a/docs/miden/vm/user-docs/assembly/code-organization.md b/docs/miden/vm/user-docs/assembly/code-organization.md
new file mode 100644
index 000000000..dbdcfce1a
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/code-organization.md
@@ -0,0 +1,194 @@
+A Miden assembly program is just a sequence of instructions each describing a specific directive or an operation. You can use any combination of whitespace characters to separate one instruction from another.
+
+In turn, Miden assembly instructions are just keywords which can be parameterized by zero or more parameters. The notation for specifying parameters is *keyword.param1.param2* - i.e., the parameters are separated by periods. For example, `push.123` instruction denotes a `push` operation which is parameterized by value `123`.
+
+Miden assembly programs are organized into procedures. Procedures, in turn, can be grouped into modules.
+
+## Procedures
+
+A *procedure* can be used to encapsulate a frequently-used sequence of instructions which can later be invoked via a label. A procedure must start with a `proc.<label>.<number of locals>` instruction and terminate with an `end` instruction. For example:
+
+```
+proc.foo.2
+    <instructions>
+end
+```
+
+A procedure label must start with a letter and can contain any combination of numbers, ASCII letters, and underscores (`_`). The number of characters in the procedure label cannot exceed 100.
+
+The number of locals specifies the number of memory-based local words a procedure can access (via `loc_load`, `loc_store`, and [other instructions](io-operations.md#random-access-memory)). If a procedure doesn't need any memory-based locals, this parameter can be omitted or set to `0`. A procedure can have at most $2^{16}$ locals, and the total number of locals available to all procedures at runtime is limited to $2^{30}$.
+
+To execute a procedure, the `exec.<label>`, `call.<label>`, and `syscall.<label>` instructions can be used. For example:
+
+```
+exec.foo
+```
+The difference between using each of these instructions is explained in the [next section](execution-contexts.md#procedure-invocation-semantics).
+
+A procedure may execute any other previously defined procedure, but it cannot execute itself or any of the subsequent procedures. Thus, recursive procedure calls are not possible. For example, the following code block defines a program with two procedures:
+
+```
+proc.foo
+    <instructions>
+end
+
+proc.bar
+    <instructions>
+    exec.foo
+    <instructions>
+end
+
+begin
+    <instructions>
+    exec.bar
+    <instructions>
+    exec.foo
+end
+```
+
+### Dynamic procedure invocation
+
+It is also possible to invoke procedures dynamically - i.e., without specifying target procedure labels at compile time. There are two instructions, `dynexec` and `dyncall`, which can be used to execute dynamically-specified code targets. Both instructions expect [MAST root](../../design/programs.md) of the target to be provided via the stack. The difference between `dynexec` and `dyncall` is that `dyncall` will [change context](execution-contexts.md) before executing the dynamic code target, while `dynexec` will cause the code target to be executed in the current context.
+
+Dynamic code execution in the same context is achieved by setting the top $4$ elements of the stack to the hash of the dynamic code block and then executing the following instruction:
+
+```sh
+dynexec
+```
+
+This causes the VM to do the following:
+
+1. Read the top 4 elements of the stack to get the hash of the dynamic target (leaving the stack unchanged).
+2. Execute the code block which hashes to the specified target. The VM must know the specified code block and hash (they must be in the CodeBlockTable of the executing Program).
+
+Dynamic code execution in a new context can be achieved similarly by setting the top $4$ elements of the stack to the hash of the dynamic code block and then executing the following instruction:
+
+```sh
+dyncall
+```
+
+!!! note
+    In both cases, the stack is left unchanged. Therefore, if the dynamic code is intended to manipulate the stack, it should start by either dropping or moving the code block hash from the top of the stack.
+
+## Modules
+
+A *module* consists of one or more procedures. There are two types of modules: *library modules* and *executable modules* (also called *programs*).
+
+### Library modules
+
+Library modules contain zero or more internal procedures and one or more exported procedures. For example, the following module defines one internal procedure (defined with `proc` instruction) and one exported procedure (defined with `export` instruction):
+
+```sh
+proc.foo
+    <instructions>
+end
+
+export.bar
+    <instructions>
+    exec.foo
+    <instructions>
+end
+```
+
+### Programs
+
+Executable modules are used to define programs. A program contains zero or more internal procedures (defined with `proc` instruction) and exactly one main procedure (defined with `begin` instruction). For example, the following module defines one internal procedure and a main procedure:
+
+```sh
+proc.foo
+    <instructions>
+end
+
+begin
+    <instructions>
+    exec.foo
+    <instructions>
+end
+```
+
+A program cannot contain any exported procedures.
+
+When a program is executed, the execution starts at the first instruction following the `begin` instruction. The main procedure is expected to be the last procedure in the program and can be followed only by comments.
+
+### Importing modules
+
+To invoke a procedure from an external module, the module first needs to be imported using a `use` instruction. Once a module is imported, procedures from this module can be invoked via the regular `exec` or `call` instructions as `exec|call.<module>::<label>` where `label` is the name of the procedure. For example:
+
+```sh
+use.std::math::u64
+
+begin
+    push.1.0
+    push.2.0
+    exec.u64::checked_add
+end
+```
+
+In the above example we import `std::math::u64` module from the [standard library](../stdlib/index.md). We then execute a program which pushes two 64-bit integers onto the stack, and then invokes a 64-bit addition procedure from the imported module.
+
+We can also define aliases for imported modules. For example:
+
+```sh
+use.std::math::u64->bigint
+
+begin
+    push.1.0
+    push.2.0
+    exec.bigint::checked_add
+end
+```
+
+The set of modules which can be imported by a program can be specified via a Module Provider when instantiating the [Miden Assembler](https://crates.io/crates/miden-assembly) used to compile the program.
+
+#### Re-exporting procedures
+
+A procedure defined in one module can be re-exported from a different module under the same or a different name. For example:
+
+```sh
+use.std::math::u64
+
+export.u64::add
+export.u64::mul->mul64
+
+export.foo
+    <instructions>
+end
+```
+
+In addition to the locally-defined procedure `foo`, the above module also exports procedures `add` and `mul64`, whose implementations will be identical to the `add` and `mul` procedures from the `std::math::u64` module, respectively.
+
+## Constants
+
+Miden assembly supports constant declarations. These constants are scoped to the module they are defined in and can be used as immediate parameters for Miden assembly instructions. Constants are supported as immediate values for the following instructions: `push`, `assert`, `assertz`, `assert_eq`, `assert_eqw`, `locaddr`, `loc_load`, `loc_loadw`, `loc_store`, `loc_storew`, `mem_load`, `mem_loadw`, `mem_store`, `mem_storew`.
+
+Constants must be declared right after module imports and before any procedures or program bodies. A constant's name must start with an upper-case letter and can contain any combination of numbers, upper-case ASCII letters, and underscores (`_`). The number of characters in a constant name cannot exceed 100. 
+
+A constant's value must be in the range between $0$ and $2^{64} - 2^{32}$ (both inclusive) and can be defined by an arithmetic expression using `+`, `-`, `*`, `/`, `//`, `(`, `)` operators and references to the previously defined constants. Here `/` is a field division and `//` is an integer division. Note that the arithmetic expression cannot contain spaces.
+
+```sh
+use.std::math::u64
+
+const.CONSTANT_1=100
+const.CONSTANT_2=200+(CONSTANT_1-50)
+const.ADDR_1=3
+
+begin
+    push.CONSTANT_1.CONSTANT_2
+    exec.u64::checked_add
+    mem_store.ADDR_1
+end
+```
+
+## Comments
+
+Miden assembly allows annotating code with simple comments. There are two types of comments: single-line comments which start with a `#` (pound) character, and documentation comments which start with `#!` characters. For example:
+
+```sh
+#! This is a documentation comment
+export.foo
+    # this is a comment
+    push.1
+end
+```
+
+Documentation comments must precede a procedure declaration. Using them inside a procedure body is an error.
diff --git a/docs/miden/vm/user-docs/assembly/cryptographic-operations.md b/docs/miden/vm/user-docs/assembly/cryptographic-operations.md
new file mode 100644
index 000000000..71471028d
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/cryptographic-operations.md
@@ -0,0 +1,15 @@
+Miden assembly provides a set of instructions for performing common cryptographic operations. These instructions are listed in the table below.
+
+## Hashing and Merkle trees
+
+[Rescue-Prime Optimized](https://eprint.iacr.org/2022/1577) is the native hash function of Miden VM. The parameters of the hash function were chosen to provide a 128-bit security level against preimage and collision attacks. The function operates over a state of 12 field elements, and requires 7 rounds for a single permutation. However, due to its special status within the VM, computing Rescue Prime Optimized hashes can be done very efficiently. For example, applying a permutation of the hash function can be done in a single VM cycle.
+
+| Instruction                      | Stack_input        | Stack_output      | Notes                                                                                                                                                                                                                                                                                                                                                  |
+| -------------------------------- | ------------------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| hash <br> - *(20 cycles)*        | [A, ...]           | [B, ...]          | $\{B\} \leftarrow hash(A)$ <BR> where, $hash()$ computes a 1-to-1 Rescue Prime Optimized hash.                                                                                                                                                                                                                                                         |
+| hperm  <br> - *(1 cycle)*        | [C, B, A, ...]     | [F, E, D, ...]    | $\{D, E, F\} \leftarrow permute(A, B, C)$ <br> Performs a Rescue Prime Optimized permutation on the top 3 words of the operand stack, where the top 2 words are the rate (words C and B), the deepest word is the capacity (word A), and the digest output is the word E.                                                                              |
+| hmerge  <br> - *(16 cycles)*     | [B, A, ...]        | [C, ...]          | $C \leftarrow hash(A,B)$ <br> where, $hash()$ computes a 2-to-1 Rescue Prime Optimized hash.                                                                                                                                                                                                                                                           |
+| mtree_get  <br> - *(9 cycles)*   | [d, i, R, ...]     | [V, R, ...]       | Fetches the node value from the advice provider and runs a verification equivalent to `mtree_verify`, returning the value if succeeded.                                                                                                                                                                                                                |
+| mtree_set <br> - *(29 cycles)*   | [d, i, R, V', ...] | [V, R', ...]      | Updates a node in the Merkle tree with root $R$ at depth $d$ and index $i$ to value $V'$. $R'$ is the Merkle root of the resulting tree and $V$ is the old value of the node. Merkle tree with root $R$ must be present in the advice provider, otherwise execution fails. At the end of the operation the advice provider will contain both Merkle trees. |
+| mtree_merge <br> - *(16 cycles)* | [R, L, ...]        | [M, ...]          | Merges two Merkle trees with the provided roots R (right), L (left) into a new Merkle tree with root M (merged). The input trees are retained in the advice provider.                                                                                                                                                                                  |
+| mtree_verify  <br> - *(1 cycle)* | [V, d, i, R, ...]  | [V, d, i, R, ...] | Verifies that a Merkle tree with root $R$ opens to node $V$ at depth $d$ and index $i$. Merkle tree with root $R$ must be present in the advice provider, otherwise execution fails.                                                                                                                                                                   |
diff --git a/docs/miden/vm/user-docs/assembly/debugging.md b/docs/miden/vm/user-docs/assembly/debugging.md
new file mode 100644
index 000000000..0f0f2c153
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/debugging.md
@@ -0,0 +1,8 @@
+To support basic debugging capabilities, Miden assembly provides a `debug` instruction. This instruction prints out the state of the VM at the time when the `debug` instruction is executed. The instruction can be parameterized as follows:
+
+- `debug.stack` prints out the entire contents of the stack.
+- `debug.stack.<n>` prints out the top $n$ items of the stack. $n$ must be an integer greater than $0$ and smaller than $256$.
+
+Debug instructions do not affect the VM state and do not change the program hash.
+
+To make use of the `debug` instruction, programs must be compiled with an assembler instantiated in the debug mode. Otherwise, the assembler will simply ignore the `debug` instructions.
\ No newline at end of file
diff --git a/docs/miden/vm/user-docs/assembly/execution-contexts.md b/docs/miden/vm/user-docs/assembly/execution-contexts.md
new file mode 100644
index 000000000..4f5e0ed4f
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/execution-contexts.md
@@ -0,0 +1,115 @@
+Miden assembly program execution can span multiple isolated contexts. An execution context defines its own memory space which is not accessible from other execution contexts.
+
+All programs start executing in a *root* context. Thus, the main procedure of a program is always executed in the root context. To move execution into a different context, we can invoke a procedure using the `call` instruction. In fact, any time we invoke a procedure using the `call` instruction, the procedure is executed in a new context. We refer to all non-root contexts as *user contexts*.
+
+While executing in a user context, we can request to execute some procedures in the root context. This can be done via the `syscall` instruction. The set of procedures which can be invoked via the `syscall` instruction is limited by the [kernel](#kernels) against which a program is compiled. Once the procedure called via `syscall` returns, the execution moves back to the user context from which it was invoked. The diagram below illustrates this graphically:
+
+<center>
+![context transitions](../../../../img/miden/vm/user-docs/assembly/execution_contexts/context_transitions.png)
+</center>
+
+## Procedure invocation semantics
+
+As mentioned in the [previous section](./code-organization.md), procedures in Miden assembly can be invoked via three different instructions: `exec`, `call`, and `syscall`. Invocation semantics of `call` and `syscall` instructions are basically the same, the only difference being that the `syscall` instruction can be used only with procedures which are defined in the program's kernel. The `exec` instruction is different, and we explain these differences below.
+
+### Invoking via `call` and `syscall` instructions
+
+When a procedure is invoked via a `call` or a `syscall` instruction, the following happens:
+* Execution moves into a different context. In case of a `call` instruction, a new user context is created. In case of a `syscall` instruction, the execution moves back into the root context.
+* All stack items beyond the 16th item get "hidden" from the invoked procedure. That is, from the standpoint of the invoked procedure, the initial stack depth is set to 16.
+
+When a procedure returns from a `call` or a `syscall`, the following happens:
+* Execution moves back to the context from which the procedure was invoked.
+* Stack depth is set to its original depth. Before the stack depth is reset, the VM checks if the current stack depth is exactly 16, and fails otherwise.
+
+The manipulations of the stack depth described above have the following implications:
+- The top 16 elements of the stack can be used to pass parameters and return values between the caller and the callee.
+- Caller's stack beyond the top 16 elements is inaccessible to the callee, and thus, is guaranteed not to change as the result of the call.
+- At the end of its execution, the callee must ensure that stack depth is exactly 16. If this is difficult to ensure manually, the [`truncate_stack`](../stdlib/sys.md) procedure can be used to drop all elements from the stack except for the top 16.
+
+### Invoking via `exec` instruction
+
+Procedures invoked via the `exec` instruction are inlined at their call sites during compilation. Thus, from the standpoint of the final program, executing procedures this way is indistinguishable from manually including procedure code in place of the `exec` instruction. This also means that procedures invoked via the `exec` instruction are executed in the same context as the caller.
+
+## Kernels
+
+A *kernel* defines a set of procedures which can be invoked from user contexts to be executed in the root context. Miden assembly programs are always compiled against some kernel. The default kernel is empty - i.e., it does not contain any procedures. To compile a program against a non-empty kernel, the kernel needs to be specified when instantiating the [Miden Assembler](https://crates.io/crates/miden-assembly).
+
+A kernel can be defined similarly to a regular [library module](code-organization.md#library-modules) - i.e., it can have internal and exported procedures. However, there are some small differences between what procedures can do in a kernel module vs. what they can do in a regular library module. Specifically:
+
+- Procedures in a kernel module cannot use `call` or `syscall` instructions. This means that creating a new context from within a `syscall` is not possible.
+- Unlike procedures in regular library modules, procedures in a kernel module can use the `caller` instruction. This instruction puts the hash of the procedure which initiated the parent context onto the stack.
+
+## Memory layout
+
+As mentioned earlier, procedures executed within a given context can access memory only of that context. This is true for both memory reads and memory writes.
+
+Address space of every context is the same: the smallest accessible address is $0$ and the largest accessible address is $2^{32} - 1$. Any code executed in a given context has access to its entire address space. However, by convention, we assign different meanings to different regions of the address space.
+
+For user contexts we have the following:
+
+- The first $2^{30}$ words (each word is 4 field elements) are assumed to be global memory.
+- The next $2^{30}$ words are reserved for memory locals of procedures executed in the same context (i.e., via the `exec` instruction).
+- The remaining address space has no special meaning.
+
+![user memory layout](../../../../img/miden/vm/user-docs/assembly/execution_contexts/user_mem_layout.png)
+
+For the root context we have the following:
+
+- The first $2^{30}$ words are assumed to be global memory.
+- The next $2^{30}$ words are reserved for memory locals of procedures executed in the root context.
+- The next $2^{30}$ words are reserved for memory locals of procedures executed from within a `syscall`.
+- The remaining address space has no special meaning.
+
+![root memory layout](../../../../img/miden/vm/user-docs/assembly/execution_contexts/root_mem_layout.png)
+
+For both types of contexts, writing directly into regions of memory reserved for procedure locals is not advisable. Instead, `loc_load`, `loc_store` and other similar dedicated instructions should be used to access procedure locals.
+
+## Example
+
+To better illustrate what happens as we execute procedures in different contexts, let's go over the following example.
+
+```sh
+kernel
+--------------------
+export.baz.2
+    <instructions>
+    caller
+    <instructions>
+end
+
+program
+--------------------
+proc.bar.1
+    <instructions>
+    syscall.baz
+    <instructions>
+end
+
+proc.foo.3
+    <instructions>
+    call.bar
+    <instructions>
+    exec.bar
+    <instructions>
+end
+
+begin
+    <instructions>
+    call.foo
+    <instructions>
+end
+```
+
+Execution of the above program proceeds as follows:
+
+1. The VM starts executing instructions immediately following the `begin` statement. These instructions are executed in the *root* context (let's call this context `ctx0`).
+2. When `call.foo` is executed, a new context is created (`ctx1`). Memory in this context is isolated from `ctx0`. Additionally, any elements on the stack beyond the top 16 are hidden from `foo`.
+3. Instructions executed inside `foo` can access memory of `ctx1` only. The address of the first procedure local in `foo` (e.g., accessed via `loc_load.0`) is $2^{30}$.
+4. When `call.bar` is executed, a new context is created (`ctx2`). The stack depth is set to 16 again, and any instruction executed in this context can access memory of `ctx2` only. The first procedure local of `bar` is also located at address $2^{30}$.
+5. When `syscall.baz` is executed, the execution moves back into the root context. That is, instructions executed inside `baz` have access to the memory of `ctx0`. The first procedure local of `baz` is located at address $2^{31}$. When `baz` starts executing, the stack depth is again set to 16.
+6. When `caller` is executed inside `baz`, the first 4 elements of the stack are populated with the hash of `bar` since `baz` was invoked from `bar`'s context.
+7. Once `baz` returns, execution moves back to `ctx2`, and then, when `bar` returns, execution moves back to `ctx1`. We assume that instructions executed right before each procedure returns ensure that the stack depth is exactly 16 right before procedure's end.
+8. Next, when `exec.bar` is executed, `bar` is executed again, but this time it is executed in the same context as `foo`. Thus, it can access memory of `ctx1`. Moreover, the stack depth is not changed, and thus, `bar` can access the entire stack of `foo`. Lastly, the first procedure local of `bar` will now be at address $2^{30} + 3$ (since the first 3 locals in this context are reserved for `foo`).
+9. When `syscall.baz` is executed the second time, execution moves into the root context again. However, now, when `caller` is executed inside `baz`, the first 4 elements of the stack are populated with the hash of `foo` (not `bar`). This happens because this time around `bar` does not have its own context and `baz` is invoked from `foo`'s context.
+10. Finally, when `baz` returns, execution moves back to `ctx1`, and then as `bar` and `foo` return, back to `ctx0`, and the program terminates.
\ No newline at end of file
diff --git a/docs/miden/vm/user-docs/assembly/field-operations.md b/docs/miden/vm/user-docs/assembly/field-operations.md
new file mode 100644
index 000000000..bf4debcbc
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/field-operations.md
@@ -0,0 +1,64 @@
+Miden assembly provides a set of instructions which can perform operations with raw field elements. These instructions are described in the tables below.
+
+While most operations place no restrictions on inputs, some operations expect inputs to be binary values, and fail if executed with non-binary inputs.
+
+For instructions where one or more operands can be provided as immediate parameters (e.g., `add` and `add.b`), we provide stack transition diagrams only for the non-immediate version. For the immediate version, it can be assumed that the operand with the specified name is not present on the stack.
+
+## Assertions and tests
+
+| Instruction                     | Stack_input | Stack_output  | Notes                                                            |
+| ------------------------------- | ----------- | ------------- | ---------------------------------------------------------------- |
+| assert <br> - *(1 cycle)*       | [a, ...]    | [...]         | If $a = 1$, removes it from the stack. <br> Fails if $a \ne 1$   |
+| assertz <br> - *(2 cycles)*     | [a, ...]    | [...]         | If $a = 0$, removes it from the stack. <br> Fails if $a \ne 0$   |
+| assert_eq <br> - *(2 cycles)*   | [b, a, ...] | [...]         | If $a = b$, removes them from the stack. <br> Fails if $a \ne b$ |
+| assert_eqw <br> - *(11 cycles)* | [B, A, ...] | [...]         | If $A = B$, removes them from the stack. <br> Fails if $A \ne B$ |
+
+The above instructions can also be parametrized with an error code which can be any 32-bit value specified either directly or via a [named constant](code-organization.md#constants). For example:
+
+```sh
+assert.err=123
+assert.err=MY_CONSTANT
+```
+
+If the error code is omitted, the default value of $0$ is assumed.
+
+## Arithmetic and boolean operations
+
+| Instruction                                                                    | Stack_input | Stack_output  | Notes                                                                                                        |
+| ------------------------------------------------------------------------------ | ----------- | ------------- | ------------------------------------------------------------------------------------------------------------ |
+| add <br> - *(1 cycle)*  <br> add.*b* <br> - *(1-2 cycles)*                     | [b, a, ...] | [c, ...]      | $c \leftarrow (a + b) \mod p$                                                                                |
+| sub <br> - *(2 cycles)*  <br> sub.*b* <br> - *(2 cycles)*                      | [b, a, ...] | [c, ...]      | $c \leftarrow (a - b) \mod p$                                                                                |
+| mul <br> - *(1 cycle)*  <br> mul.*b* <br> - *(2 cycles)*                       | [b, a, ...] | [c, ...]      | $c \leftarrow (a \cdot b) \mod p$                                                                            |
+| div <br> - *(2 cycles)*  <br> div.*b* <br> - *(2 cycles)*                      | [b, a, ...] | [c, ...]      | $c \leftarrow (a \cdot b^{-1}) \mod p$ <br> Fails if $b = 0$                                                 |
+| neg <br> - *(1 cycle)*                                                         | [a, ...]    | [b, ...]      | $b \leftarrow -a \mod p$                                                                                     |
+| inv <br> - *(1 cycle)*                                                         | [a, ...]    | [b, ...]      | $b \leftarrow a^{-1} \mod p$ <br> Fails if $a = 0$                                                           |
+| pow2 <br> - *(16 cycles)*                                                      | [a, ...]    | [b, ...]      | $b \leftarrow 2^a$ <br> Fails if $a > 63$                                                                    |
+| exp.*uxx* <br> - *(9 + xx cycles)*  <br> exp.*b* <br> - *(9 + log2(b) cycles)* | [b, a, ...] | [c, ...]      | $c \leftarrow a^b$ <br> Fails if xx is outside [0, 63) <br> exp is equivalent to exp.u64 and needs 73 cycles |
+| not <br> - *(1 cycle)*                                                         | [a, ...]    | [b, ...]      | $b \leftarrow 1 - a$ <br> Fails if $a > 1$                                                                   |
+| and <br> - *(1 cycle)*                                                         | [b, a, ...] | [c, ...]      | $c \leftarrow a \cdot b$ <br> Fails if $max(a, b) > 1$                                                       |
+| or <br> - *(1 cycle)*                                                          | [b, a, ...] | [c, ...]      | $c \leftarrow a + b - a \cdot b$ <br> Fails if $max(a, b) > 1$                                               |
+| xor <br> - *(7 cycles)*                                                        | [b, a, ...] | [c, ...]      | $c \leftarrow a + b - 2 \cdot a \cdot b$ <br> Fails if $max(a, b) > 1$                                       |
+
+## Comparison operations
+
+| Instruction                                                | Stack_input | Stack_output   | Notes                                                                                                                        |
+| ---------------------------------------------------------- | ----------- | -------------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| eq <br> - *(1 cycle)*  <br> eq.*b* <br> - *(1-2 cycles)*   | [b, a, ...] | [c, ...]       | $c \leftarrow \begin{cases} 1, & \text{if}\ a=b \\ 0, & \text{otherwise}\ \end{cases}$                                       |
+| neq <br> - *(2 cycles)* <br> neq.*b* <br> - *(2-3 cycles)* | [b, a, ...] | [c, ...]       | $c \leftarrow \begin{cases} 1, & \text{if}\ a \ne b \\ 0, & \text{otherwise}\ \end{cases}$                                   |
+| lt <br> - *(17 cycles)*                                    | [b, a, ...] | [c, ...]       | $c \leftarrow \begin{cases} 1, & \text{if}\ a < b \\ 0, & \text{otherwise}\ \end{cases}$                                     |
+| lte <br> - *(18 cycles)*                                   | [b, a, ...] | [c, ...]       | $c \leftarrow \begin{cases} 1, & \text{if}\ a \le b \\ 0, & \text{otherwise}\ \end{cases}$                                   |
+| gt <br> - *(18 cycles)*                                    | [b, a, ...] | [c, ...]       | $c \leftarrow \begin{cases} 1, & \text{if}\ a > b \\ 0, & \text{otherwise}\ \end{cases}$                                     |
+| gte <br> - *(19 cycles)*                                   | [b, a, ...] | [c, ...]       | $c \leftarrow \begin{cases} 1, & \text{if}\ a \ge b \\ 0, & \text{otherwise}\ \end{cases}$                                   |
+| is_odd <br> - *(5 cycles)*                                 | [a, ...]    | [b, ...]       | $b \leftarrow \begin{cases} 1, & \text{if}\ a \text{ is odd} \\ 0, & \text{otherwise}\ \end{cases}$                          |
+| eqw <br> - *(15 cycles)*                                   | [A, B, ...] | [c, A, B, ...] | $c \leftarrow \begin{cases} 1, & \text{if}\ a_i = b_i \; \forall i \in \{0, 1, 2, 3\} \\ 0, & \text{otherwise}\ \end{cases}$ |
+
+## Extension field operations
+
+| Instruction                        | Stack Input           | Stack Output    | Notes                                                                                                               |
+| ---------------------------------- | --------------------- | --------------- | ------------------------------------------------------------------------------------------------------------------- |
+| ext2add <br> - *(5 cycles)*   <br> | [b1, b0, a1, a0, ...] | [c1, c0, ...]   | $c1 \leftarrow (a1 + b1) \mod p$ and <br> $c0 \leftarrow (a0 + b0) \mod p$                                          |
+| ext2sub <br> - *(7 cycles)*   <br> | [b1, b0, a1, a0, ...] | [c1, c0, ...]   | $c1 \leftarrow (a1 - b1) \mod p$ and <br> $c0 \leftarrow (a0 - b0) \mod p$                                          |
+| ext2mul <br> - *(3 cycles)*   <br> | [b1, b0, a1, a0, ...] | [c1, c0, ...]   | $c1 \leftarrow (a0 + a1) * (b0 + b1) \mod p$ and <br> $c0 \leftarrow (a0 * b0) - 2 * (a1 * b1) \mod p$              |
+| ext2neg <br> - *(4 cycles)*   <br> | [a1, a0, ...]         | [a1', a0', ...] | $a1' \leftarrow -a1$ and $a0' \leftarrow -a0$                                                                       |
+| ext2inv <br> - *(8 cycles)*   <br> | [a1, a0, ...]         | [a1', a0', ...] | $a' \leftarrow a^{-1} \mod q$ <br> Fails if $a = 0$                                                                 |
+| ext2div <br> - *(11 cycles)*  <br> | [b1, b0, a1, a0, ...] | [c1, c0, ...]   | $c \leftarrow a * b^{-1}$ fails if $b=0$, where multiplication and inversion are as defined by the operations above |
diff --git a/docs/miden/vm/user-docs/assembly/flow-control.md b/docs/miden/vm/user-docs/assembly/flow-control.md
new file mode 100644
index 000000000..6f8313cfd
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/flow-control.md
@@ -0,0 +1,75 @@
+As mentioned above, Miden assembly provides high-level constructs to facilitate flow control. These constructs are:
+
+- *if-else* expressions for conditional execution.
+- *repeat* expressions for bounded counter-controlled loops.
+- *while* expressions for unbounded condition-controlled loops.
+
+## Conditional execution
+
+Conditional execution in Miden VM can be accomplished with *if-else* statements. These statements look like so:
+
+```sh
+if.true
+    <instructions>
+else
+    <instructions>
+end
+```
+
+where `instructions` can be a sequence of any instructions, including nested control structures; the `else` clause is optional. The above does the following:
+
+1. Pops the top item from the stack.
+2. If the value of the item is $1$, instructions in the `if.true` branch are executed.
+3. If the value of the item is $0$, instructions in the `else` branch are executed.
+4. If the value is not binary, the execution fails.
+
+A note on performance: using *if-else* statements incurs a small, but non-negligible overhead. Thus, for simple conditional statements, it may be more efficient to compute the result of both branches, and then select the result using [conditional drop](stack-manipulation.md#conditional-manipulation) instructions.
+
+## Counter-controlled loops
+
+Executing a sequence of instructions a predefined number of times can be accomplished with *repeat* statements. These statements look like so:
+```
+repeat.<count>
+    <instructions>
+end
+```
+where:
+
+* `instructions` can be a sequence of any instructions, including nested control structures.
+* `count` is the number of times the `instructions` sequence should be repeated (e.g. `repeat.10`). `count` must be an integer greater than $0$.
+
+!!! note
+    During compilation the `repeat.<count>` blocks are unrolled and expanded into `<count>` copies of their inner block, so there is no additional cost for counting variables in this case.
+
+## Condition-controlled loops
+
+Executing a sequence of instructions zero or more times based on some condition can be accomplished with *while loop* expressions. These expressions look like so:
+
+```sh
+while.true
+    <instructions>
+end
+```
+
+where `instructions` can be a sequence of any instructions, including nested control structures. The above does the following:
+
+1. Pops the top item from the stack.
+2. If the value of the item is $1$, `instructions` in the loop body are executed.
+    a. After the body is executed, the stack is popped again, and if the popped value is $1$, the body is executed again.
+    b. If the popped value is $0$, the loop is exited.
+    c. If the popped value is not binary, the execution fails.
+3. If the value of the item is $0$, execution of loop body is skipped.
+4. If the value is not binary, the execution fails.
+
+Example:
+
+```sh
+# push the boolean true to the stack
+push.1
+
+# pop the top element of the stack and loop while it is true
+while.true
+    # push the boolean false to the stack, finishing the loop for the next iteration
+    push.0
+end
+```
\ No newline at end of file
diff --git a/docs/miden/vm/user-docs/assembly/index.md b/docs/miden/vm/user-docs/assembly/index.md
new file mode 100644
index 000000000..81c44d4c9
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/index.md
@@ -0,0 +1,42 @@
+Miden assembly is a simple, low-level language for writing programs for Miden VM. It stands just above raw Miden VM instruction set, and in fact, many instructions of Miden assembly map directly to raw instructions of Miden VM.
+
+Before Miden assembly can be executed on Miden VM, it needs to be compiled into a [Program MAST](../../design/programs.md) (Merkelized Abstract Syntax Tree) which is a binary tree of code blocks each containing raw Miden VM instructions.
+
+![assembly_to_VM](../../../../img/miden/vm/user-docs/assembly/assembly_to_VM.png)
+
+As compared to raw Miden VM instructions, Miden assembly has several advantages:
+
+1. Miden assembly is intended to be a more stable external interface for the VM. That is, while we plan to make significant changes to the underlying VM to optimize it for stability, performance etc., we intend to make very few breaking changes to Miden assembly.
+2. Miden assembly natively supports control flow expressions which the assembler automatically transforms into a program MAST. This greatly simplifies writing programs with complex execution logic.
+3. Miden assembly supports *macro instructions*. These instructions expand into short sequences of raw Miden VM instructions making it easier to encode common operations.
+4. Miden assembly supports *procedures*. These are stand-alone blocks of code which the assembler inlines into program MAST at compile time. This improves program modularity and code organization.
+
+The last two points also make Miden assembly much more concise as compared to the raw program MAST. This may be important in the blockchain context where public programs need to be stored on chain.
+
+## Terms and notations
+
+In this document we use the following terms and notations:
+
+- $p$ is the modulus of the VM's base field which is equal to $2^{64} - 2^{32} + 1$.
+- A *binary* value means a field element which is either $0$ or $1$.
+- Inequality comparisons are assumed to be performed on integer representations of field elements in the range $[0, p)$.
+
+Throughout this document, we use lower-case letters to refer to individual field elements (e.g., $a$). Sometimes it is convenient to describe operations over groups of elements. For these purposes we define a *word* to be a group of four elements. We use upper-case letters to refer to words (e.g., $A$). To refer to individual elements within a word, we use numerical subscripts. For example, $a_0$ is the first element of word $A$, $b_3$ is the last element of word $B$, etc.
+
+## Design goals
+
+The design of Miden assembly tries to achieve the following goals:
+
+1. Miden assembly should be an easy compilation target for high-level languages.
+2. Programs written in Miden assembly should be readable, even if the code is generated by a compiler from a high-level language.
+3. Control flow should be easy to understand to help in manual inspection, formal verification, and optimization.
+4. Compilation of Miden assembly into Miden program MAST should be as straight-forward as possible.
+5. Serialization of Miden assembly into a binary representation should be as compact and as straight-forward as possible.
+
+In order to achieve the first goal, Miden assembly exposes a set of native operations over 32-bit integers and supports linear read-write memory. Thus, from the stand-point of a higher-level language compiler, Miden VM can be viewed as a regular 32-bit stack machine with linear read-write memory.
+
+In order to achieve the second and third goals, Miden assembly facilitates flow control via high-level constructs like `while` loops, `if-else` statements, and function calls with statically defined targets. Thus, for example, there are no explicit `jump` instructions.
+
+In order to achieve the fourth goal, Miden assembly retains direct access to the VM stack rather than abstracting it away with higher-level constructs and named variables.
+
+Lastly, in order to achieve the fifth goal, each instruction of Miden assembly can be encoded using a single byte. The resulting byte-code is simply a one-to-one mapping of instructions to their binary values.
diff --git a/docs/miden/vm/user-docs/assembly/io-operations.md b/docs/miden/vm/user-docs/assembly/io-operations.md
new file mode 100644
index 000000000..d000f2746
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/io-operations.md
@@ -0,0 +1,88 @@
+Miden assembly provides a set of instructions for moving data between the operand stack and several other sources. These sources include:
+
+* **Program code**: values to be moved onto the operand stack can be hard-coded in a program's source code.
+* **Environment**: values can be moved onto the operand stack from environment variables. These include current clock cycle, current stack depth, and a few others.
+* **Advice provider**: values can be moved onto the operand stack from the advice provider by popping them from the advice stack (see more about the advice provider [here](../../intro/overview.md#nondeterministic-inputs)). The VM can also inject new data into the advice provider via *advice injector* instructions.
+* **Memory**: values can be moved between the stack and random-access memory. The memory is word-addressable, meaning, four elements are located at each address, and we can read and write elements to/from memory in batches of four. Memory can be accessed via absolute memory references (i.e., via memory addresses) as well as via local procedure references (i.e., local index). The latter approach ensures that a procedure does not access locals of another procedure.
+
+## Constant inputs
+
+| Instruction                                                               | Stack_input | Stack_output                                     | Notes                                                                                                                                                                                               |
+| ------------------------------------------------------------------------- | ----------- | ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| push.*a* <br> - *(1-2 cycles)* <br> push.*a*.*b* <br> push.*a*.*b*.*c*... | [ ... ]     | [a, ... ] <br> [b, a, ... ] <br> [c, b, a, ... ] | Pushes values $a$, $b$, $c$ etc. onto the stack. Up to $16$ values can be specified. All values must be valid field elements in decimal (e.g., $123$) or hexadecimal (e.g., $0x7b$) representation. |
+
+The value can be specified in hexadecimal form without periods between individual values as long as it describes a full word ($4$ field elements or $32$ bytes). Note that hexadecimal values separated by periods (short hexadecimal strings) are assumed to be in big-endian order, while the strings specifying whole words (long hexadecimal strings) are assumed to be in little-endian order. That is, the following are semantically equivalent:
+
+```
+push.0x00001234.0x00005678.0x00009012.0x0000abcd
+push.0x341200000000000078560000000000001290000000000000cdab000000000000
+push.4660.22136.36882.43981
+```
+In both cases the values must still encode valid field elements.
+
+## Environment inputs
+
+| Instruction                     | Stack_input  | Stack_output | Notes                                                                                                                                                                                                             |
+| ------------------------------- | ------------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| clk <br> - *(1 cycle)*          | [ ... ]      | [t, ... ]    | $t \leftarrow clock\_value()$ <br> Pushes the current value of the clock cycle counter onto the stack.                                                                                                            |
+| sdepth <br> - *(1 cycle)*       | [ ... ]      | [d, ... ]    | $d \leftarrow stack.depth()$ <br> Pushes the current depth of the stack onto the stack.                                                                                                                           |
+| caller <br> - *(1 cycle)*       | [A, b, ... ] | [H, b, ... ] | $H \leftarrow context.fn\_hash()$ <br> Overwrites the top four stack items with the hash of a function which initiated the current SYSCALL. <br> Executing this instruction outside of SYSCALL context will fail. |
+| locaddr.*i* <br> - *(2 cycles)* | [ ... ]      | [a, ... ]    | $a \leftarrow address\_of(i)$ <br> Pushes the absolute memory address of local memory at index $i$ onto the stack.                                                                                                |
+
+## Nondeterministic inputs
+
+As mentioned above, nondeterministic inputs are provided to the VM via the advice provider. Instructions which access the advice provider fall into two categories. The first category consists of instructions which move data from the advice stack onto the operand stack and/or memory.
+
+| Instruction                      | Stack_input        | Stack_output        | Notes                                                                                                                                                                                                                                                                                                                    |
+| -------------------------------- | ------------------ | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| adv_push.*n* <br> - *(n cycles)* | [ ... ]            | [a, ... ]           | $a \leftarrow stack.pop()$ <br> Pops $n$ values from the advice stack and pushes them onto the operand stack. Valid for $n \in \{1, ..., 16\}$. <br> Fails if the advice stack has fewer than $n$ values.                                                                                                                |
+| adv_loadw <br> - *(1 cycle)*     | [0, 0, 0, 0, ... ] | [A, ... ]           | $A \leftarrow stack.pop(4)$ <br> Pops the next word (4 elements) from the advice stack and overwrites the first word of the operand stack (4 elements) with them. <br> Fails if the advice stack has fewer than $4$ values.                                                                                              |
+| adv_pipe <br> - *(1 cycle)*      | [C, B, A, a, ... ] | [E, D, A, a', ... ] | $[D, E] \leftarrow [adv\_stack.pop(4), adv\_stack.pop(4)]$ <br> $a' \leftarrow a + 2$ <br> Pops the next two words from the advice stack, overwrites the top of the operand stack with them and also writes these words into memory at address $a$ and $a + 1$.<br> Fails if the advice stack has fewer than $8$ values. |
+
+> **Note**: The opcodes above always push data onto the operand stack so that the first element is placed deepest in the stack. For example, if the data on the stack is `a,b,c,d` and you use the opcode `adv_push.4`, the data will be `d,c,b,a` on your stack. This is also the behavior of the other opcodes.
+
+The second category injects new data into the advice provider. These operations are called *advice injectors* and they affect only the advice provider state. That is, the state of all other VM components (e.g., stack, memory) is unaffected. Executing advice injectors does not consume any VM cycles (i.e., these instructions are executed in $0$ cycles).
+
+Advice injectors fall into two categories: (1) injectors which push new data onto the advice stack, and (2) injectors which insert new data into the advice map.
+
+| Instruction                                  | Stack_input                | Stack_output               | Notes                                                                                                                                                                                                                                           |
+| -------------------------------------------- | -------------------------- | -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| adv.push_mapval <br> adv.push_mapval.*s*     | [K, ... ]                  | [K, ... ]                  | Pushes a list of field elements onto the advice stack. The list is looked up in the advice map using word $K$ as the key. If offset $s$ is provided, the key is taken starting from item $s$ on the stack. |
+| adv.push_mapvaln <br> adv.push_mapvaln.*s*   | [K, ... ]                  | [K, ... ]                  | Pushes a list of field elements together with the number of elements onto the advice stack. The list is looked up in the advice map using word $K$ as the key. If offset $s$ is provided, the key is taken starting from item $s$ on the stack. |
+| adv.push_mtnode                              | [d, i, R, ... ]            | [d, i, R, ... ]            | Pushes a node of a Merkle tree with root $R$ at depth $d$ and index $i$ from Merkle store onto the advice stack. |
+| adv.push_u64div                              | [b1, b0, a1, a0, ...]      | [b1, b0, a1, a0, ...]      | Pushes the result of `u64` division $a / b$ onto the advice stack. Both $a$ and $b$ are represented using 32-bit limbs. The result consists of both the quotient and the remainder. |
+| adv.push_ext2intt                            | [osize, isize, iptr, ... ] | [osize, isize, iptr, ... ] | Given evaluations of a polynomial over some specified domain, interpolates the evaluations into a polynomial in coefficient form and pushes the result into the advice stack. |
+| adv.push_sig.*kind*                          | [K, M, ...]                | [K, M, ...]                | Pushes values onto the advice stack which are required for verification of a DSA with scheme specified by *kind* against the public key commitment $K$ and message $M$. |
+| adv.smt_get                                  | [K, R, ... ]               | [K, R, ... ]               | Pushes values onto the advice stack which are required for successful retrieval of a value under the key $K$ from a Sparse Merkle Tree with root $R$. |
+| adv.smt_set                                  | [V, K, R, ...]             | [V, K, R, ...]             | Pushes values onto the advice stack which are required for successful insertion of a key-value pair $(K, V)$ into a Sparse Merkle Tree with root $R$. |
+| adv.smt_peek                                 | [K, R, ... ]               | [K, R, ... ]               | Pushes value onto the advice stack which is associated with key $K$ in a Sparse Merkle Tree with root $R$. |
+| adv.insert_mem                               | [K, a, b, ... ]            | [K, a, b, ... ]            | Reads words $data \leftarrow mem[a] .. mem[b]$ from memory, and saves the data into $advice\_map[K] \leftarrow data$. |
+| adv.insert_hdword <br> adv.insert_hdword.*d* | [B, A, ... ]               | [B, A, ... ]               | Reads top two words from the stack, computes a key as $K \leftarrow hash(A || B, d)$, and saves the data into $advice\_map[K] \leftarrow [A, B]$. $d$ is an optional domain value which can be between $0$ and $255$, default value $0$. |
+| adv.insert_hperm                             | [B, A, C, ...]             | [B, A, C, ...]             | Reads top three words from the stack, computes a key as $K \leftarrow permute(C, A, B).digest$, and saves data into $advice\_map[K] \leftarrow [A, B]$. |
+
+## Random access memory
+
+As mentioned above, there are two ways to access memory in Miden VM. The first way is via memory addresses using the instructions listed below. The addresses are absolute - i.e., they don't depend on the procedure context. Memory addresses can be in the range $[0, 2^{32})$.
+
+Memory is guaranteed to be initialized to zeros. Thus, when reading from a memory address which hasn't been written to previously, zero elements will be returned.
+
+| Instruction                                                              | Stack_input           | Stack_output        | Notes                                                                                                                                                                                                                                                                                          |
+| ------------------------------------------------------------------------ | --------------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| mem_load <br> - *(1 cycle)*  <br> mem_load.*a* <br> - *(2 cycles)*       | [a, ... ]             | [v, ... ]           | $v \leftarrow mem[a][0]$ <br> Reads a word (4 elements) from memory at address *a*, and pushes the first element of the word onto the stack. If $a$ is provided via the stack, it is removed from the stack first. <br> Fails if $a \ge 2^{32}$                                                |
+| mem_loadw <br> - *(1 cycle)*  <br> mem_loadw.*a* <br> - *(2 cycles)*     | [a, 0, 0, 0, 0, ... ] | [A, ... ]           | $A \leftarrow mem[a]$ <br> Reads a word from memory at address $a$ and overwrites top four stack elements with it. If $a$ is provided via the stack, it is removed from the stack first. <br> Fails if $a \ge 2^{32}$                                                                          |
+| mem_store <br> - *(2 cycles)*  <br> mem_store.*a*  <br> - *(3-4 cycles)* | [a, v, ... ]          | [ ... ]             | $v \rightarrow mem[a][0]$ <br> Pops the top element off the stack and stores it as the first element of the word in memory at address $a$. All other elements of the word are not affected. If $a$ is provided via the stack, it is removed from the stack first. <br> Fails if $a \ge 2^{32}$ |
+| mem_storew <br> - *(1 cycle)*  <br> mem_storew.*a* <br> - *(2-3 cycles)* | [a, A, ... ]          | [A, ... ]           | $A \rightarrow mem[a]$ <br> Stores the top four elements of the stack in memory at address $a$. If $a$ is provided via the stack, it is removed from the stack first. <br> Fails if $a \ge 2^{32}$                                                                                             |
+| mem_stream <br> - *(1 cycle)*                                            | [C, B, A, a, ... ]    | [E, D, A, a', ... ] | $[E, D] \leftarrow [mem[a], mem[a+1]]$ <br> $a' \leftarrow a + 2$ <br> Reads two sequential words from memory starting at address $a$ and overwrites the first two words in the operand stack.                                                                                                 |
+
+The second way to access memory is via procedure locals using the instructions listed below. These instructions are available only in procedure context. The number of locals available to a given procedure must be specified at [procedure declaration](./code-organization.md#procedures) time, and trying to access more locals than were declared will result in a compile-time error. The number of locals per procedure is not limited, but the total number of locals available to all procedures at runtime must be smaller than $2^{32}$.
+
+| Instruction                          | Stack_input        | Stack_output | Notes                                                                                                                                                                                             |
+| ------------------------------------ | ------------------ | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| loc_load.*i* <br> - *(3-4 cycles)*   | [ ... ]            | [v, ... ]    | $v \leftarrow local[i][0]$ <br> Reads a word (4 elements) from local memory at index *i*, and pushes the first element of the word onto the stack.                                                |
+| loc_loadw.*i*  <br> - *(3-4 cycles)* | [0, 0, 0, 0, ... ] | [A, ... ]    | $A \leftarrow local[i]$ <br> Reads a word from local memory at index $i$ and overwrites top four stack elements with it.                                                                          |
+| loc_store.*i* <br> - *(4-5 cycles)*  | [v, ... ]          | [ ... ]      | $v \rightarrow local[i][0]$ <br> Pops the top element off the stack and stores it as the first element of the word in local memory at index $i$. All other elements of the word are not affected. |
+| loc_storew.*i* <br> - *(3-4 cycles)* | [A, ... ]          | [A, ... ]    | $A \rightarrow local[i]$ <br> Stores the top four elements of the stack in local memory at index $i$.                                                                                             |
+
+Unlike regular memory, procedure locals are not guaranteed to be initialized to zeros. Thus, when working with locals, one must assume that before a local memory address has been written to, it contains "garbage".
+
+Internally in the VM, procedure locals are stored at memory offset starting at $2^{30}$. Thus, every procedure local has an absolute address in regular memory. The `locaddr.i` instruction is provided specifically to map an index of a procedure's local to an absolute address so that it can be passed to downstream procedures, when needed.
\ No newline at end of file
diff --git a/docs/miden/vm/user-docs/assembly/stack-manipulation.md b/docs/miden/vm/user-docs/assembly/stack-manipulation.md
new file mode 100644
index 000000000..1e846ca98
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/stack-manipulation.md
@@ -0,0 +1,29 @@
+Miden VM stack is a push-down stack of field elements. The stack has a maximum depth of $2^{32}$, but only the top $16$ elements are directly accessible via the instructions listed below.
+
+In addition to the typical stack manipulation instructions such as `drop`, `dup`, `swap` etc., Miden assembly provides several conditional instructions which can be used to manipulate the stack based on some condition - e.g., conditional swap `cswap` or conditional drop `cdrop`.
+
+## Stack manipulation
+
+| Instruction                      | Stack_input        | Stack_output       | Notes                                                                                                                                |
+| -------------------------------- | ------------------ | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------ |
+| drop <br> - *(1 cycle)*          | [a, ... ]          | [ ... ]            | Deletes the top stack item.                                                                                                          |
+| dropw <br> - *(4 cycles)*        | [A, ... ]          | [ ... ]            | Deletes a word (4 elements) from the top of the stack.                                                                               |
+| padw  <br> - *(4 cycles)*        | [ ... ]            | [0, 0, 0, 0, ... ] | Pushes four $0$ values onto the stack. <br> Note: simple `pad` is not provided because `push.0` does the same thing.                 |
+| dup.*n* <br> - *(1-3 cycles)*    | [ ..., a, ... ]    | [a, ..., a, ... ]  | Pushes a copy of the $n$th stack item onto the stack. `dup` and `dup.0` are the same instruction. Valid for $n \in \{0, ..., 15\}$   |
+| dupw.*n* <br> - *(4 cycles)*     | [ ..., A, ... ]    | [A, ..., A, ... ]  | Pushes a copy of the $n$th stack word onto the stack. `dupw` and `dupw.0` are the same instruction. Valid for $n \in \{0, 1, 2, 3\}$ |
+| swap.*n* <br> - *(1-6 cycles)*   | [a, ..., b, ... ]  | [b, ..., a, ... ]  | Swaps the top stack item with the $n$th stack item. `swap` and `swap.1` are the same instruction. Valid for $n \in \{1, ..., 15\}$   |
+| swapw.*n* <br> - *(1 cycle)*     | [A, ..., B, ... ]  | [B, ..., A, ... ]  | Swaps the top stack word with the $n$th stack word. `swapw` and `swapw.1` are the same instruction. Valid for $n \in \{1, 2, 3\}$    |
+| swapdw <br> - *(1 cycle)*        | [D, C, B, A, ... ] | [B, A, D, C, ... ] | Swaps words on the top of the stack. The 1st with the 3rd, and the 2nd with the 4th.                                                 |
+| movup.*n* <br> - *(1-4 cycles)*  | [ ..., a, ... ]    | [a, ... ]          | Moves the $n$th stack item to the top of the stack. Valid for $n \in \{2, ..., 15\}$                                                 |
+| movupw.*n* <br> - *(2-3 cycles)* | [ ..., A, ... ]    | [A, ... ]          | Moves the $n$th stack word to the top of the stack. Valid for $n \in \{2, 3\}$                                                       |
+| movdn.*n* <br> - *(1-4 cycles)*  | [a, ... ]          | [ ..., a, ... ]    | Moves the top stack item to the $n$th position of the stack. Valid for $n \in \{2, ..., 15\}$                                        |
+| movdnw.*n* <br> - *(2-3 cycles)* | [A, ... ]          | [ ..., A, ... ]    | Moves the top stack word to the $n$th word position of the stack. Valid for $n \in \{2, 3\}$                                         |
+
+## Conditional manipulation
+
+| Instruction                 | Stack_input       | Stack_output       | Notes                                                                                                                                                                                       |
+| --------------------------- | ----------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| cswap  <br> - *(1 cycle)*   | [c, b, a, ... ]   | [e, d, ... ]       | $d = \begin{cases} a, & \text{if}\ c = 0 \\ b, & \text{if}\ c = 1\ \end{cases}$ <br> $e = \begin{cases} b, & \text{if}\ c = 0 \\ a, & \text{if}\ c = 1\ \end{cases}$  <br> Fails if $c > 1$ |
+| cswapw  <br> - *(1 cycle)*  | [c, B, A, ... ]   | [E, D, ... ]       | $D = \begin{cases} A, & \text{if}\ c = 0 \\ B, & \text{if}\ c = 1\ \end{cases}$ <br> $E = \begin{cases} B, & \text{if}\ c = 0 \\ A, & \text{if}\ c = 1\ \end{cases}$  <br> Fails if $c > 1$ |
+| cdrop   <br> - *(2 cycles)* | [c, b, a, ... ]   | [d, ... ]          | $d = \begin{cases} a, & \text{if}\ c = 0 \\ b, & \text{if}\ c = 1\ \end{cases}$ <br> Fails if $c > 1$                                                                                       |
+| cdropw  <br> - *(5 cycles)* | [c, B, A, ... ]   | [D, ... ]          | $D = \begin{cases} A, & \text{if}\ c = 0 \\ B, & \text{if}\ c = 1\ \end{cases}$ <br> Fails if $c > 1$                                                                                       |
diff --git a/docs/miden/vm/user-docs/assembly/u32-operations.md b/docs/miden/vm/user-docs/assembly/u32-operations.md
new file mode 100644
index 000000000..dd45ac4df
--- /dev/null
+++ b/docs/miden/vm/user-docs/assembly/u32-operations.md
@@ -0,0 +1,92 @@
+Miden assembly provides a set of instructions which can perform operations on regular two's complement 32-bit integers. These instructions are described in the tables below.
+
+Most instructions have _checked_ variants. These variants ensure that input values are 32-bit integers, and fail if that's not the case. All other variants do not perform these checks, and thus, should be used only if the inputs are known to be 32-bit integers. Supplying inputs which are greater than or equal to $2^{32}$ to unchecked operations results in undefined behavior.
+
+The primary benefit of using unchecked operations is performance: they can frequently be executed $2$ or $3$ times faster than their checked counterparts. In general, the vast majority of the unchecked operations listed below can be executed in a single VM cycle.
+
+For instructions where one or more operands can be provided as immediate parameters (e.g., `u32checked_add` and `u32checked_add.b`), we provide stack transition diagrams only for the non-immediate version. For the immediate version, it can be assumed that the operand with the specified name is not present on the stack.
+
+In all the tables below, the number of cycles it takes for the VM to execute each instruction is listed beneath the instruction.
+
+## Conversions and tests
+
+| Instruction                                    | Stack input | Stack output  | Notes                                                                                                                          |
+| ---------------------------------------------- | ----------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| u32test <br> - *(5 cycles)*                    | [a, ...]    | [b, a, ...]   | $b \leftarrow \begin{cases} 1, & \text{if}\ a < 2^{32} \\ 0, & \text{otherwise}\ \end{cases}$                                  |
+| u32testw <br> - *(23 cycles)*                  | [A, ...]    | [b, A, ...]   | $b \leftarrow \begin{cases} 1, & \text{if}\ \forall\ i \in \{0, 1, 2, 3\}\ a_i < 2^{32} \\ 0, & \text{otherwise}\ \end{cases}$ |
+| u32assert <br> - *(3 cycles)* | [a, ...]    | [a, ...]      | Fails if $a \ge 2^{32}$                                                                                                        |
+| u32assert2 <br> - *(1 cycle)*                 | [b, a,...]  | [b, a,...]    | Fails if $a \ge 2^{32}$ or $b \ge 2^{32}$                                                                                      |
+| u32assertw <br> - *(6 cycles)*                 | [A, ...]    | [A, ...]      | Fails if $\exists\ i \in \{0, 1, 2, 3\} : a_i \ge 2^{32}$                                                                    |
+| u32cast <br> - *(2 cycles)*                    | [a, ...]    | [b, ...]      | $b \leftarrow a \mod 2^{32}$                                                                                                   |
+| u32split <br> - *(1 cycle)*                    | [a, ...]    | [c, b, ...]   | $b \leftarrow a \mod 2^{32}$, $c \leftarrow \lfloor{a / 2^{32}}\rfloor$                                                        |
+
+The instructions `u32assert`, `u32assert2` and `u32assertw` can also be parametrized with an error code which can be any 32-bit value specified either directly or via a [named constant](./code-organization.md#constants). For example:
+
+```sh
+u32assert.err=123
+u32assert.err=MY_CONSTANT
+```
+
+If the error code is omitted, the default value of $0$ is assumed.
+
+## Arithmetic operations
+
+| Instruction                                                                               | Stack input    | Stack output  | Notes                                                                                                                                                                                  |
+| ----------------------------------------------------------------------------------------- | -------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| u32checked_add <br> - *(4 cycles)* <br> u32checked_add.*b* <br> - *(5-6 cycles)*          | [b, a, ...]    | [c, ...]      | $c \leftarrow a + b$ <br> Fails if $max(a, b, c) \ge 2^{32}$                                                                                                                           |
+| u32overflowing_add <br> - *(1 cycle)* <br> u32overflowing_add.*b* <br> - *(2-3 cycles)*   | [b, a, ...]    | [d, c, ...]   | $c \leftarrow (a + b) \mod 2^{32}$ <br> $d \leftarrow \begin{cases} 1, & \text{if}\ (a + b) \ge 2^{32} \\ 0, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$ |
+| u32wrapping_add <br> - *(2 cycles)* <br> u32wrapping_add.*b* <br> - *(3-4 cycles)*        | [b, a, ...]    | [c, ...]      | $c \leftarrow (a + b) \mod 2^{32}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                                            |
+| u32overflowing_add3 <br> - *(1 cycle)*                                                    | [c, b, a, ...] | [e, d, ...]   | $d \leftarrow (a + b + c) \mod 2^{32}$, <br> $e \leftarrow \lfloor (a + b + c) / 2^{32}\rfloor$ <br> Undefined if $max(a, b, c) \ge 2^{32}$ <br>                                       |
+| u32wrapping_add3 <br> - *(2 cycles)*                                                      | [c, b, a, ...] | [d, ...]      | $d \leftarrow (a + b + c) \mod 2^{32}$, <br> Undefined if $max(a, b, c) \ge 2^{32}$ <br>                                                                                               |
+| u32checked_sub <br> - *(4 cycles)* <br> u32checked_sub.*b*  <br> - *(5-6 cycles)*         | [b, a, ...]    | [c, ...]      | $c \leftarrow (a - b)$ <br> Fails if $max(a, b) \ge 2^{32}$ or $a < b$                                                                                                                 |
+| u32overflowing_sub <br> - *(1 cycle)* <br> u32overflowing_sub.*b* <br> - *(2-3 cycles)*   | [b, a, ...]    | [d, c, ...]   | $c \leftarrow (a - b) \mod 2^{32}$ <br> $d \leftarrow \begin{cases} 1, & \text{if}\ a < b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$              |
+| u32wrapping_sub <br> - *(2 cycles)* <br> u32wrapping_sub.*b* <br> - *(3-4 cycles)*        | [b, a, ...]    | [c, ...]      | $c \leftarrow (a - b) \mod 2^{32}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                                            |
+| u32checked_mul <br> - *(4 cycles)* <br> u32checked_mul.*b* <br> - *(5-6 cycles)*          | [b, a, ...]    | [c, ...]      | $c \leftarrow a \cdot b$ <br> Fails if $max(a, b, c) \ge 2^{32}$                                                                                                                       |
+| u32overflowing_mul <br> - *(1 cycle)* <br> u32overflowing_mul.*b* <br> - *(2-3 cycles)*   | [b, a, ...]    | [d, c, ...]   | $c \leftarrow (a \cdot b) \mod 2^{32}$ <br> $d \leftarrow \lfloor(a \cdot b) / 2^{32}\rfloor$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                 |
+| u32wrapping_mul <br> - *(2 cycles)* <br> u32wrapping_mul.*b* <br> - *(3-4 cycles)*        | [b, a, ...]    | [c, ...]      | $c \leftarrow (a \cdot b) \mod 2^{32}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                                        |
+| u32overflowing_madd <br> - *(1 cycle)*                                                    | [b, a, c, ...] | [e, d, ...]   | $d \leftarrow (a \cdot b + c) \mod 2^{32}$ <br> $e \leftarrow \lfloor(a \cdot b + c) / 2^{32}\rfloor$ <br> Undefined if $max(a, b, c) \ge 2^{32}$                                      |
+| u32wrapping_madd <br> - *(2 cycles)*                                                      | [b, a, c, ...] | [d, ...]      | $d \leftarrow (a \cdot b + c) \mod 2^{32}$ <br> Undefined if $max(a, b, c) \ge 2^{32}$                                                                                                 |
+| u32checked_div <br> - *(3 cycles)* <br> u32checked_div.*b* <br> - *(4-5 cycles)*          | [b, a, ...]    | [c, ...]      | $c \leftarrow \lfloor a / b\rfloor$ <br> Fails if $max(a, b) \ge 2^{32}$ or $b = 0$                                                                                                    |
+| u32unchecked_div <br> - *(2 cycles)* <br> u32unchecked_div.*b* <br> - *(3-4 cycles)*      | [b, a, ...]    | [c, ...]      | $c \leftarrow \lfloor a / b\rfloor$ <br> Fails if $b = 0$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                     |
+| u32checked_mod <br> - *(4 cycles)* <br> u32checked_mod.*b* <br> - *(5-6 cycles)*          | [b, a, ...]    | [c, ...]      | $c \leftarrow a \mod b$ <br> Fails if $max(a, b) \ge 2^{32}$ or $b = 0$                                                                                                                |
+| u32unchecked_mod <br> - *(3 cycles)* <br> u32unchecked_mod.*b* <br> - *(4-5 cycles)*      | [b, a, ...]    | [c, ...]      | $c \leftarrow a \mod b$ <br> Fails if $b = 0$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                                 |
+| u32checked_divmod <br> - *(2 cycles)* <br> u32checked_divmod.*b* <br> - *(3-4 cycles)*    | [b, a, ...]    | [d, c, ...]   | $c \leftarrow \lfloor a / b\rfloor$ <br> $d \leftarrow a \mod b$ <br> Fails if $max(a, b) \ge 2^{32}$ or $b = 0$                                                                       |
+| u32unchecked_divmod <br> - *(1 cycle)* <br> u32unchecked_divmod.*b* <br> - *(2-3 cycles)* | [b, a, ...]    | [d, c, ...]   | $c \leftarrow \lfloor a / b\rfloor$ <br> $d \leftarrow a \mod b$ <br> Fails if $b = 0$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                        |
+
+## Bitwise operations
+
+| Instruction                                                                           | Stack input    | Stack output  | Notes                                                                                                                          |
+| ------------------------------------------------------------------------------------- | -------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| u32checked_and <br> - *(1 cycle)*                                                     | [b, a, ...]    | [c, ...]      | Computes $c$ as a bitwise `AND` of binary representations of $a$ and $b$. <br> Fails if $max(a,b) \ge 2^{32}$                  |
+| u32checked_or <br> - *(6 cycles)*                                                     | [b, a, ...]    | [c, ...]      | Computes $c$ as a bitwise `OR` of binary representations of $a$ and $b$. <br> Fails if $max(a,b) \ge 2^{32}$                   |
+| u32checked_xor <br> - *(1 cycle)*                                                     | [b, a, ...]    | [c, ...]      | Computes $c$ as a bitwise `XOR` of binary representations of $a$ and $b$. <br> Fails if $max(a,b) \ge 2^{32}$                  |
+| u32checked_not <br> - *(5 cycles)*                                                    | [a, ...]       | [b, ...]      | Computes $b$ as a bitwise `NOT` of binary representation of $a$. <br> Fails if $a \ge 2^{32}$                                  |
+| u32checked_shl <br> - *(47 cycles)* <br> u32checked_shl.*b*  <br> - *(4 cycles)*      | [b, a, ...]    | [c, ...]      | $c \leftarrow (a \cdot 2^b) \mod 2^{32}$ <br> Fails if $a \ge 2^{32}$ or $b > 31$                                              |
+| u32unchecked_shl <br> - *(40 cycles)* <br> u32unchecked_shl.*b* <br> - *(3 cycles)*   | [b, a, ...]    | [c, ...]      | $c \leftarrow (a \cdot 2^b) \mod 2^{32}$ <br> Undefined if $a \ge 2^{32}$ or $b > 31$                                          |
+| u32checked_shr <br> - *(47 cycles)*<br> u32checked_shr.*b* <br> - *(4 cycles)*        | [b, a, ...]    | [c, ...]      | $c \leftarrow \lfloor a/2^b \rfloor$ <br> Fails if $a \ge 2^{32}$ or $b > 31$                                                  |
+| u32unchecked_shr <br> - *(40 cycles)* <br> u32unchecked_shr.*b* <br> - *(3 cycles)*   | [b, a, ...]    | [c, ...]      | $c \leftarrow \lfloor a/2^b \rfloor$ <br> Undefined if $a \ge 2^{32}$ or $b > 31$                                              |
+| u32checked_rotl <br> - *(47 cycles)* <br> u32checked_rotl.*b* <br> - *(4 cycles)*     | [b, a, ...]    | [c, ...]      | Computes $c$ by rotating a 32-bit representation of $a$ to the left by $b$ bits. <br> Fails if $a \ge 2^{32}$ or $b > 31$      |
+| u32unchecked_rotl <br> - *(40 cycles)* <br> u32unchecked_rotl.*b* <br> - *(3 cycles)* | [b, a, ...]    | [c, ...]      | Computes $c$ by rotating a 32-bit representation of $a$ to the left by $b$ bits. <br> Undefined if $a \ge 2^{32}$ or $b > 31$  |
+| u32checked_rotr <br> - *(59 cycles)* <br> u32checked_rotr.*b* <br> - *(6 cycles)*     | [b, a, ...]    | [c, ...]      | Computes $c$ by rotating a 32-bit representation of $a$ to the right by $b$ bits. <br> Fails if $a \ge 2^{32}$ or $b > 31$     |
+| u32unchecked_rotr <br> - *(44 cycles)* <br> u32unchecked_rotr.*b* <br> - *(3 cycles)* | [b, a, ...]    | [c, ...]      | Computes $c$ by rotating a 32-bit representation of $a$ to the right by $b$ bits. <br> Undefined if $a \ge 2^{32}$ or $b > 31$ |
+| u32checked_popcnt <br> - *(36 cycles)*                                                | [a, ...]       | [b, ...]      | Computes $b$ by counting the number of set bits in $a$ (hamming weight of $a$). <br> Fails if $a \ge 2^{32}$                   |
+| u32unchecked_popcnt <br> - *(33 cycles)*                                              | [a, ...]       | [b, ...]      | Computes $b$ by counting the number of set bits in $a$ (hamming weight of $a$). <br> Undefined if $a \ge 2^{32}$               |
+
+## Comparison operations
+
+| Instruction                                                                      | Stack input  | Stack output    | Notes                                                                                                                                                                                                                  |
+| -------------------------------------------------------------------------------- | ------------ | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| u32checked_eq <br> - *(2 cycles)* <br> u32checked_eq.*b*  <br> - *(3-4 cycles)*  | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a=b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$ <br> Note: unchecked version is not provided because it is equivalent to simple `eq`.      |
+| u32checked_neq <br> - *(3 cycles)* <br> u32checked_neq.*b* <br> - *(4-5 cycles)* | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a \ne b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$ <br> Note: unchecked version is not provided because it is equivalent to simple `neq`. |
+| u32checked_lt <br> - *(6 cycles)*                                                | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a < b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$                                                                                          |
+| u32unchecked_lt <br> - *(5 cycles)*                                              | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a < b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                      |
+| u32checked_lte <br> - *(8 cycles)*                                               | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a \le b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$                                                                                        |
+| u32unchecked_lte <br> - *(7 cycles)*                                             | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a \le b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                    |
+| u32checked_gt <br> - *(7 cycles)*                                                | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a > b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$                                                                                          |
+| u32unchecked_gt <br> - *(6 cycles)*                                              | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a > b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                      |
+| u32checked_gte <br> - *(7 cycles)*                                               | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a \ge b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$                                                                                        |
+| u32unchecked_gte <br> - *(6 cycles)*                                             | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} 1, & \text{if}\ a \ge b \\ 0, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                    |
+| u32checked_min <br> - *(9 cycles)*                                               | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} a, & \text{if}\ a < b \\ b, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$                                                                                          |
+| u32unchecked_min <br> - *(8 cycles)*                                             | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} a, & \text{if}\ a < b \\ b, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                      |
+| u32checked_max <br> - *(10 cycles)*                                              | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} a, & \text{if}\ a > b \\ b, & \text{otherwise}\ \end{cases}$ <br> Fails if $max(a, b) \ge 2^{32}$                                                                                          |
+| u32unchecked_max <br> - *(9 cycles)*                                             | [b, a, ...]  | [c, ...]        | $c \leftarrow \begin{cases} a, & \text{if}\ a > b \\ b, & \text{otherwise}\ \end{cases}$ <br> Undefined if $max(a, b) \ge 2^{32}$                                                                                      |
diff --git a/docs/miden/vm/user-docs/index.md b/docs/miden/vm/user-docs/index.md
new file mode 100644
index 000000000..7a71f4cc8
--- /dev/null
+++ b/docs/miden/vm/user-docs/index.md
@@ -0,0 +1,8 @@
+In the following sections, we provide developer-focused documentation useful to those who want to develop on Miden VM or build compilers from higher-level languages to Miden VM.
+
+This documentation consists of two high-level sections:
+
+- [Miden assembly](./assembly/index.md) which provides a detailed description of Miden assembly language, which is the native language of Miden VM.
+- [Miden standard library](./stdlib/index.md) which provides descriptions of all procedures available in Miden standard library.
+
+For info on how to run programs on Miden VM, please refer to the [usage](../intro/usage.md) section in the introduction.
diff --git a/docs/miden/vm/user-docs/stdlib/collections.md b/docs/miden/vm/user-docs/stdlib/collections.md
new file mode 100644
index 000000000..a3c567c67
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/collections.md
@@ -0,0 +1,42 @@
+Namespace `std::collections` contains modules for commonly-used authenticated data structures. This includes:
+
+- A Merkle Mountain range.
+- A Sparse Merkle Tree with 64-bit keys.
+- A Sparse Merkle Tree with 256-bit keys.
+
+## Merkle mountain range
+
+Module `std::collections::mmr` contains procedures for manipulating [Merkle mountain range](https://github.com/opentimestamps/opentimestamps-server/blob/master/doc/merkle-mountain-range.md) data structure which can be used as an append-only log.
+
+The following procedures are available to read data from and make updates to a Merkle Mountain Range.
+
+| Procedure   | Description   |
+| ----------- | ------------- |
+| get         | Loads the leaf at the absolute position `pos` in the MMR onto the stack.<br /><br />Valid range for `pos` is between $0$ and $2^{32} - 1$ (both inclusive).<br /><br />Inputs: `[pos, mmr_ptr, ...]`<br />Output: `[N, ...]`<br /><br />Where `N` is the leaf loaded from the MMR whose memory location starts at `mmr_ptr`. |
+| add         | Adds a new leaf to the MMR.<br /><br />This will update the MMR peaks in the VM's memory and the advice provider with any merged nodes.<br /><br />Inputs: `[N, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where `N` is the leaf added to the MMR whose memory location starts at `mmr_ptr`. |
+| pack        | Computes a commitment to the given MMR and copies the MMR to the Advice Map using the commitment as a key.<br /><br />Inputs: `[mmr_ptr, ...]`<br />Outputs: `[HASH, ...]`<br /><br /> |
+| unpack      | Load the MMR peak data based on its hash.<br /><br />Inputs: `[HASH, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where:<br />- `HASH`: is the MMR peak hash, the hash is expected to be padded to an even length and to have a minimum size of 16 elements.<br />- The advice map must contain a key with `HASH`, and its value is `num_leaves \|\| hash_data`, and hash_data is the data used to compute `HASH`<br />- `mmr_ptr`: the memory location where the MMR data will be written, starting with the MMR forest (the total count of its leaves) followed by its peaks. |
+
+## Sparse Merkle tree (64)
+
+Module `std::collections::smt64` contains procedures for manipulating key-value maps with single-element keys and 4-element values. The current implementation is a thin wrapper over a simple Sparse Merkle Tree of depth 64. In the future, this will be replaced with a compact Sparse Merkle Tree implementation.
+
+The following procedures are available to read data from and make updates to a Sparse Merkle Tree.
+
+| Procedure   | Description |
+| ----------- | ------------- |
+| get         | Returns the value located under the specified key in the Sparse Merkle Tree defined by the specified root.<br /><br />If no values had been previously inserted under the specified key, an empty word is returned.<br /><br />Inputs: `[key, ROOT, ...]`<br />Outputs: `[VALUE, ROOT, ...]`<br /><br />Fails if the tree with the specified root does not exist in the VM's advice provider. |
+| set         | Inserts the specified value under the specified key in a Sparse Merkle Tree defined by the specified root. If the insert is successful, the old value located under the specified key is returned via the stack.<br /><br />If `VALUE` is an empty word, the new state of the tree is guaranteed to be equivalent to the state as if the updated value was never inserted.<br /><br />Inputs: `[VALUE, key, ROOT, ...]`<br />Outputs: `[OLD_VALUE, NEW_ROOT, ...]`<br /><br />Fails if the tree with the specified root does not exist in the VM's advice provider. |
+| insert      | Inserts the specified value under the specified key in a Sparse Merkle Tree defined by the specified root. If the insert is successful, the old value located under the specified key is returned via the stack.<br /><br />This procedure requires that `VALUE` be a non-empty word.<br /><br />Inputs: `[VALUE, key, ROOT, ...]`<br />Outputs: `[OLD_VALUE, NEW_ROOT, ...]`<br /><br />Fails if:<br />- The tree with the specified root does not exist in the VM's advice provider.<br />- The provided value is an empty word. |
+
+## Sparse Merkle tree (256)
+
+Module `std::collections::smt` contains procedures for manipulating key-value maps with 4-element keys and 4-element values. The underlying implementation is a Tiered (compacted) Sparse Merkle Tree where leaves can exist only at specific depths called "tiers". These depths are: 16, 32, 48, and 64. Initially, when a tree is empty, it is equivalent to an empty Sparse Merkle Tree of depth 64 (i.e., leaves at depth 64 are set to [ZERO; 4]). As non-empty values are inserted into the tree, they are added to the first available tier.
+
+The following procedures are available to read data from and make updates to a Sparse Merkle Tree.
+
+| Procedure   | Description   |
+| ----------- | ------------- |
+| get         | Returns the value located under the specified key in the Sparse Merkle Tree defined by the specified root.<br /><br />If no values had been previously inserted under the specified key, an empty word is returned.<br /><br />Inputs: `[KEY, ROOT, ...]`<br />Outputs: `[VALUE, ROOT, ...]`<br /><br />Fails if the tree with the specified root does not exist in the VM's advice provider. |
+| set         | Inserts the specified value under the specified key in a Sparse Merkle Tree defined by the specified root. If the insert is successful, the old value located under the specified key is returned via the stack.<br /><br />If `VALUE` is an empty word, the new state of the tree is guaranteed to be equivalent to the state as if the updated value was never inserted.<br /><br />Inputs: `[VALUE, KEY, ROOT, ...]`<br />Outputs: `[OLD_VALUE, NEW_ROOT, ...]`<br /><br />Fails if the tree with the specified root does not exist in the VM's advice provider. |
+| insert      | Inserts the specified value under the specified key in a Sparse Merkle Tree defined by the specified root. If the insert is successful, the old value located under the specified key is returned via the stack.<br /><br />This procedure requires that `VALUE` be a non-empty word.<br /><br />Inputs: `[VALUE, KEY, ROOT, ...]`<br />Outputs: `[OLD_VALUE, NEW_ROOT, ...]`<br /><br />Fails if:<br />- The tree with the specified root does not exist in the VM's advice provider.<br />- The provided value is an empty word. |
diff --git a/docs/miden/vm/user-docs/stdlib/crypto/dsa.md b/docs/miden/vm/user-docs/stdlib/crypto/dsa.md
new file mode 100644
index 000000000..6f1c6d4f9
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/crypto/dsa.md
@@ -0,0 +1,14 @@
+Namespace `std::crypto::dsa` contains a set of digital signature schemes supported by default in the Miden VM. Currently, these schemes are:
+
+* `RPO Falcon512`: a variant of the [Falcon](https://falcon-sign.info/) signature scheme.
+
+## RPO Falcon512
+
+Module `std::crypto::dsa::rpo_falcon512` contains procedures for verifying `RPO Falcon512` signatures. These signatures differ from the standard Falcon signatures in that instead of using `SHAKE256` hash function in the *hash-to-point* algorithm we use `RPO256`. This makes the signature more efficient to verify in the Miden VM.
+
+The module exposes the following procedures:
+
+| Procedure   | Description |
+| ----------- | ------------- |
+| verify      | Verifies a signature against a public key and a message. The procedure gets as inputs the hash of the public key and the hash of the message via the operand stack. The signature is expected to be provided via the advice provider.<br /><br />The signature is valid if and only if the procedure returns.<br /><br />Inputs: `[PK, MSG, ...]`<br />Outputs: `[...]`<br /><br />Where `PK` is the hash of the public key and `MSG` is the hash of the message. Both hashes are expected to be computed using `RPO` hash function.<br /><br /> The procedure relies on the `adv.push_sig` [decorator](../../assembly/io-operations.md#nondeterministic-inputs) to retrieve the signature from the host. The default host implementation assumes that the private-public key pair is loaded into the advice provider, and uses it to generate the signature. However, for production grade implementations, this functionality should be overridden to ensure more secure handling of private keys.|
+
diff --git a/docs/miden/vm/user-docs/stdlib/crypto/fri.md b/docs/miden/vm/user-docs/stdlib/crypto/fri.md
new file mode 100644
index 000000000..43d34239c
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/crypto/fri.md
@@ -0,0 +1,9 @@
+Namespace `std::crypto::fri` contains modules for verifying [FRI](https://eccc.weizmann.ac.il/report/2017/134/) proofs.
+
+## FRI Extension 2, Fold 4
+
+Module `std::crypto::fri::frie2f4` contains procedures for verifying FRI proofs generated over the quadratic extension of the Miden VM's base field. Moreover, the procedures assume that layer folding during the commit phase of FRI protocol was performed using folding factor 4.
+
+| Procedure | Description |
+| ----------- | ------------- |
+| verify | Verifies a FRI proof where the proof was generated over the quadratic extension of the base field and layer folding was performed using folding factor 4.<br /><br />Input:  `[query_start_ptr, query_end_ptr, layer_ptr, rem_ptr, g, ...]`<br />Output: `[...]`<br /><br />- `query_start_ptr` is a pointer to a list of tuples of the form `(e0, e1, p, 0)` where `p` is a query index at the first layer and `(e0, e1)` is an extension field element corresponding to the value of the first layer at index p.<br />- `query_end_ptr` is a pointer to the first empty memory address after the last `(e0, e1, p, 0)` tuple.<br />- `layer_ptr` is a pointer to the first layer commitment denoted throughout the code by C. `layer_ptr + 1` points to the first `(alpha0, alpha1, t_depth, d_size)` where `d_size` is the size of initial domain divided by 4, `t_depth` is the depth of the Merkle tree commitment to the first layer and `(alpha0, alpha1)` is the first challenge used in folding the first layer. Both `t_depth` and `d_size` are expected to be smaller than 2^32. Otherwise, the result of this procedure is undefined.<br />- `rem_ptr` is a pointer to the first tuple of two consecutive degree 2 extension field elements making up the remainder codeword. This codeword can be of length either 32 or 64.<br /><br />The memory referenced above is used contiguously, as follows:<br />`[layer_ptr ... rem_ptr ... query_start_ptr ... query_end_ptr]`<br /><br />This means for example that:<br />1. `rem_ptr - 1` points to the last `(alpha0, alpha1, t_depth, d_size)` tuple.<br />2. The length of the remainder codeword is `2 * (rem_ptr - query_start_ptr)`.<br /><br />Cycles: for domains of size `2^n` where:<br />- `n` is even: 12 + 6 + num_queries * (40 + num_layers * 76 + 69) + 2626<br />- `n` is odd:  12 + 6 + num_queries * (40 + num_layers * 76 + 69) + 1356 |
diff --git a/docs/miden/vm/user-docs/stdlib/crypto/hashes.md b/docs/miden/vm/user-docs/stdlib/crypto/hashes.md
new file mode 100644
index 000000000..47b07a403
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/crypto/hashes.md
@@ -0,0 +1,19 @@
+Namespace `std::crypto` contains modules for commonly used cryptographic hash functions.
+
+## BLAKE3
+
+Module `std::crypto::hashes::blake3` contains procedures for computing hashes using [BLAKE3](https://blake3.io/) hash function. The input and output elements are assumed to contain one 32-bit value per element.
+
+| Procedure   | Description                                                                                                                                                                                                                 |
+| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| hash_1to1   | Computes BLAKE3 1-to-1 hash.<br/><br/>Input: 32-bytes stored in the first 8 elements of the stack (32 bits per element).<br /> <br/>Output: A 32-byte digest stored in the first 8 elements of stack (32 bits per element). |
+| hash_2to1   | Computes BLAKE3 2-to-1 hash.<br/><br/>Input: 64-bytes stored in the first 16 elements of the stack (32 bits per element).<br /> <br/>Output: A 32-byte digest stored in the first 8 elements of stack (32 bits per element). |
+
+## SHA256
+
+Module `std::crypto::hashes::sha256` contains procedures for computing hashes using [SHA256](https://en.wikipedia.org/wiki/SHA-2) hash function. The input and output elements are assumed to contain one 32-bit value per element.
+
+| Procedure   | Description                                                                                                                                                                                                                  |
+| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| hash_1to1   | Computes SHA256 1-to-1 hash.<br/><br/>Input: 32-bytes stored in the first 8 elements of the stack (32 bits per element).<br /> <br/>Output: A 32-byte digest stored in the first 8 elements of stack (32 bits per element).  |
+| hash_2to1   | Computes SHA256 2-to-1 hash.<br/><br/>Input: 64-bytes stored in the first 16 elements of the stack (32 bits per element).<br /> <br/>Output: A 32-byte digest stored in the first 8 elements of stack (32 bits per element). |
diff --git a/docs/miden/vm/user-docs/stdlib/index.md b/docs/miden/vm/user-docs/stdlib/index.md
new file mode 100644
index 000000000..44ee0117b
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/index.md
@@ -0,0 +1,39 @@
+The Miden standard library provides a set of procedures which can be used by any Miden program. These procedures build on the core instruction set of [Miden assembly](../assembly/index.md) expanding the functionality immediately available to the user.
+
+The goals of Miden standard library are:
+
+* Provide highly-optimized and battle-tested implementations of commonly-used primitives.
+* Reduce the amount of code that needs to be shared between parties for proving and verifying program execution.
+
+The second goal can be achieved because calls to procedures in the standard library can always be serialized as 32 bytes, regardless of how large the procedure is.
+
+## Terms and notations
+
+In this document we use the following terms and notations:
+
+- A *field element* is an element in a prime field of size $p = 2^{64} - 2^{32} + 1$.
+- A *binary* value means a field element which is either $0$ or $1$.
+- Inequality comparisons are assumed to be performed on integer representations of field elements in the range $[0, p)$.
+
+Throughout this document, we use lower-case letters to refer to individual field elements (e.g., $a$). Sometimes it is convenient to describe operations over groups of elements. For these purposes we define a *word* to be a group of four elements. We use upper-case letters to refer to words (e.g., $A$). To refer to individual elements within a word, we use numerical subscripts. For example, $a_0$ is the first element of word $A$, $b_3$ is the last element of word $B$, etc.
+
+## Organization and usage
+
+Procedures in the Miden Standard Library are organized into modules, each targeting a narrow set of functionality. Modules are grouped into higher-level namespaces. However, higher-level namespaces do not expose any procedures themselves. For example, `std::math::u64` is a module containing procedures for working with 64-bit unsigned integers. This module is a part of the `std::math` namespace. However, the `std::math` namespace does not expose any procedures.
+
+For an example of how to invoke procedures from imported modules see [this section](../assembly/code-organization.md#importing-modules).
+
+## Available modules
+
+Currently, Miden standard library contains just a few modules, which are listed below. Over time, we plan to add many more modules which will include various cryptographic primitives, additional numeric data types and operations, and many others.
+
+| Module | Description |
+| ------ | ----------- |
+| [std::collections::mmr](./collections.md#merkle-mountain-range) | Contains procedures for manipulating [Merkle Mountain Ranges](https://github.com/opentimestamps/opentimestamps-server/blob/master/doc/merkle-mountain-range.md). |
+| [std::collections::smt64](./collections.md#sparse-merkle-tree-64) | Contains procedures for manipulating key-value maps with single-element keys and 4-element values. |
+| [std::crypto::fri::frie2f4](./crypto/fri.md#fri-extension-2-fold-4) | Contains procedures for verifying FRI proofs (field extension = 2, folding factor = 4). |
+| [std::crypto::hashes::blake3](./crypto/hashes.md#blake3) | Contains procedures for computing hashes using BLAKE3 hash function. |
+| [std::crypto::hashes::sha256](./crypto/hashes.md#sha256) | Contains procedures for computing hashes using SHA256 hash function. |
+| [std::math::u64](./math/u64.md) | Contains procedures for working with 64-bit unsigned integers. |
+| [std::mem](./mem.md)            | Contains procedures for working with random access memory. |
+| [std::sys](./sys.md)            | Contains system-level utility procedures. |
diff --git a/docs/miden/vm/user-docs/stdlib/math/u64.md b/docs/miden/vm/user-docs/stdlib/math/u64.md
new file mode 100644
index 000000000..265bb4c12
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/math/u64.md
@@ -0,0 +1,70 @@
+Module `std::math::u64` contains a set of procedures which can be used to perform unsigned 64-bit integer operations. These operations fall into the following categories:
+
+* **Arithmetic operations**: Addition, multiplication, division, etc.
+* **Comparison operations**: Equality, less than, greater than etc.
+* **Bitwise operations**: Binary AND, OR, XOR, bit shifts etc.
+
+All procedures assume that an unsigned 64-bit integer (u64) is encoded using two elements, each containing an unsigned 32-bit integer (u32). When placed on the stack, the least-significant limb is assumed to be deeper in the stack. For example, a u64 value `a` consisting of limbs `a_hi` and `a_lo` would be positioned on the stack like so:
+
+```sh
+[a_hi, a_lo, ... ]
+```
+
+Procedures which check whether the input values are encoded correctly are designated with the `checked` prefix. For example, `checked_add` would fail if any of the top 4 elements on the stack contains a value greater than $2^{32} - 1$. In contrast, `wrapping_add` and `overflowing_add` would not perform these checks, and therefore, if any of the top 4 stack elements is greater than $2^{32} - 1$, the operation will not fail but rather will produce an undefined result. Thus, when using versions of procedures which are not checked, it is important to be certain that input values are 32-bit limbs encoding valid u64 values.
+
+## Arithmetic operations
+
+| Procedure          | Description   |
+| ------------------ | ------------- |
+| checked_add        | Performs addition of two unsigned 64-bit integers and fails if the result would overflow.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a + b) % 2^64 |
+| overflowing_add    | Performs addition of two unsigned 64-bit integers preserving the overflow.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [overflow_flag, c_hi, c_lo, ...], where c = (a + b) % 2^64 |
+| wrapping_add       | Performs addition of two unsigned 64-bit integers discarding the overflow.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a + b) % 2^64 |
+| checked_sub        | Performs subtraction of two unsigned 64-bit integers and fails if the result would underflow.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a - b) % 2^64 |
+| overflowing_sub    | Performs subtraction of two unsigned 64-bit integers preserving the overflow.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [underflow_flag, c_hi, c_lo, ...], where c = (a - b) % 2^64 |
+| wrapping_sub       | Performs subtraction of two unsigned 64-bit integers discarding the overflow.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a - b) % 2^64 |
+| checked_mul        | Performs multiplication of two unsigned 64-bit integers and fails if the result would overflow.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a * b) % 2^64 |
+| overflowing_mul    | Performs multiplication of two unsigned 64-bit integers preserving the overflow.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi_hi, c_hi_lo, c_lo_hi, c_lo_lo, ...], where c = a * b is the full 128-bit product represented using four 32-bit limbs. |
+| wrapping_mul       | Performs multiplication of two unsigned 64-bit integers discarding the overflow.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a * b) % 2^64 |
+| checked_div        | Performs division of two unsigned 64-bit integers discarding the remainder.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a // b |
+| unchecked_div      | Performs division of two unsigned 64-bit integers discarding the remainder.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a // b |
+| checked_mod        | Performs modulo operation of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a % b |
+| unchecked_mod      | Performs modulo operation of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a % b |
+| checked_divmod     | Performs divmod operation of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [r_hi, r_lo, q_hi, q_lo ...], where r = a % b, q = a // b |
+| unchecked_divmod |  Performs divmod operation of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [r_hi, r_lo, q_hi, q_lo ...], where r = a % b, q = a // b |
+
+## Comparison operations
+
+| Procedure          | Description   |
+| ------------------ | ------------- |
+| checked_lt         | Performs less-than comparison of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a < b, and 0 otherwise. |
+| unchecked_lt       | Performs less-than comparison of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a < b, and 0 otherwise. |
+| checked_gt         | Performs greater-than comparison of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a > b, and 0 otherwise. |
+| unchecked_gt       | Performs greater-than comparison of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a > b, and 0 otherwise.<br /> This takes 11 cycles. |
+| checked_lte        | Performs less-than-or-equal comparison of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a <= b, and 0 otherwise. |
+| unchecked_lte      | Performs less-than-or-equal comparison of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a <= b, and 0 otherwise. |
+| checked_gte        | Performs greater-than-or-equal comparison of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a >= b, and 0 otherwise. |
+| unchecked_gte      | Performs greater-than-or-equal comparison of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a >= b, and 0 otherwise. |
+| checked_eq         | Performs equality comparison of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a == b, and 0 otherwise. |
+| unchecked_eq       | Performs equality comparison of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a == b, and 0 otherwise. |
+| checked_neq        | Performs inequality comparison of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a != b, and 0 otherwise. |
+| unchecked_neq      | Performs inequality comparison of two unsigned 64-bit integers.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a != b, and 0 otherwise. |
+| checked_eqz        | Performs comparison to zero of an unsigned 64-bit integer.<br /> The input value is expected to be represented using 32-bit limbs, and the procedure will fail if it is not.<br /> The stack transition looks as follows:<br /> [a_hi, a_lo, ...] -> [c, ...], where c = 1 when a == 0, and 0 otherwise. |
+| unchecked_eqz      | Performs comparison to zero of an unsigned 64-bit integer.<br /> The input value is assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [a_hi, a_lo, ...] -> [c, ...], where c = 1 when a == 0, and 0 otherwise. |
+| checked_min        | Compares two unsigned 64-bit integers and drops the larger one from the stack.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a when a < b, and b otherwise. |
+| unchecked_min      | Compares two unsigned 64-bit integers and drops the larger one from the stack.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a when a < b, and b otherwise. |
+| checked_max        | Compares two unsigned 64-bit integers and drops the smaller one from the stack.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a when a > b, and b otherwise. |
+| unchecked_max      | Compares two unsigned 64-bit integers and drops the smaller one from the stack.<br /> The input values are assumed to be represented using 32-bit limbs, but this is not checked.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a when a > b, and b otherwise. |
+
+## Bitwise operations
+
+| Procedure   | Description   |
+| ----------- | ------------- |
+| checked_and | Performs bitwise AND of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a AND b. |
+| checked_or |  Performs bitwise OR of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a OR b. |
+| checked_xor |  Performs bitwise XOR of two unsigned 64-bit integers.<br /> The input values are expected to be represented using 32-bit limbs, and the procedure will fail if they are not.<br /> The stack transition looks as follows:<br /> [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a XOR b. |
+| overflowing_shl |  Performs left shift of one unsigned 64-bit integer preserving the overflow and<br /> using the pow2 operation.<br /> The input value to be shifted is assumed to be represented using 32-bit limbs.<br /> The shift value should be in the range [0, 64), otherwise it will result in an error.<br /> The stack transition looks as follows:<br /> [b, a_hi, a_lo, ...] -> [d_hi, d_lo, c_hi, c_lo, ...], where (d,c) = a << b, <br /> where d contains the bits shifted out.<br /> This takes 35 cycles. |
+| unchecked_shl |  Performs left shift of one unsigned 64-bit integer using the pow2 operation.<br /> The input value to be shifted is assumed to be represented using 32-bit limbs.<br /> The shift value should be in the range [0, 64), otherwise it will result in an error.<br /> The stack transition looks as follows:<br /> [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a << b mod 2^64.<br /> This takes 28 cycles. |
+| overflowing_shr |  Performs right shift of one unsigned 64-bit integer preserving the overflow and<br /> using the pow2 operation.<br /> The input value to be shifted is assumed to be represented using 32-bit limbs.<br /> The shift value should be in the range [0, 64), otherwise it will result in an error.<br /> The stack transition looks as follows:<br /> [b, a_hi, a_lo, ...] -> [d_hi, d_lo, c_hi, c_lo, ...], where c = a >> b, d = a << (64 - b).<br /> This takes 94 cycles. |
+| unchecked_shr |  Performs right shift of one unsigned 64-bit integer using the pow2 operation.<br /> The input value to be shifted is assumed to be represented using 32-bit limbs.<br /> The shift value should be in the range [0, 64), otherwise it will result in an error.<br /> The stack transition looks as follows:<br /> [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a >> b.<br /> This takes 44 cycles. |
+| unchecked_rotl |  Performs left rotation of one unsigned 64-bit integer using the pow2 operation.<br /> The input value to be rotated is assumed to be represented using 32-bit limbs.<br /> The shift value should be in the range [0, 64), otherwise it will result in an error.<br /> The stack transition looks as follows:<br /> [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c is a rotated left by b bits.<br /> This takes 35 cycles. |
+| unchecked_rotr |  Performs right rotation of one unsigned 64-bit integer using the pow2 operation.<br /> The input value to be rotated is assumed to be represented using 32-bit limbs.<br /> The shift value should be in the range [0, 64), otherwise it will result in an error.<br /> The stack transition looks as follows:<br /> [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c is a rotated right by b bits.<br /> This takes 40 cycles. |
diff --git a/docs/miden/vm/user-docs/stdlib/mem.md b/docs/miden/vm/user-docs/stdlib/mem.md
new file mode 100644
index 000000000..a6563fed9
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/mem.md
@@ -0,0 +1,8 @@
+Module `std::mem` contains a set of utility procedures for working with random access memory.
+
+| Procedure   | Description   |
+| ----------- | ------------- |
+| memcopy | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />Stack transition looks as follows:<br /><br />[n, read_ptr, write_ptr, ...] -> [...]<br /><br />Cycles: 15 + 16n |
+| pipe_double_words_to_memory | Moves an even number of words from the advice stack to memory.<br /><br />Input: [C, B, A, write_ptr, end_ptr, ...]<br />Output: [C, B, A, write_ptr, ...]<br /><br />Where:<br />- The words C, B, and A are the RPO hasher state<br />- A is the capacity<br />- C, B are the rate portion of the state<br />- The value `num_words = end_ptr - write_ptr` must be positive and even<br /><br />Cycles: 10 + 9 * num_words / 2 |
+| pipe_words_to_memory | Moves an arbitrary number of words from the advice stack to memory.<br /><br />Input: [num_words, write_ptr, ...]<br />Output: [HASH, write_ptr', ...]<br /><br />Where `HASH` is the sequential RPO hash of all copied words.<br /><br />Cycles:<br />- Even num_words: 48 + 9 * num_words / 2<br />- Odd num_words: 65 + 9 * round_down(num_words / 2) |
+| pipe_preimage_to_memory | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the commitment.<br /><br />Input: [num_words, write_ptr, COM, ...]<br />Output: [write_ptr', ...]<br /><br />Cycles:<br />- Even num_words: 58 + 9 * num_words / 2<br /> - Odd num_words: 75 + 9 * round_down(num_words / 2) |
diff --git a/docs/miden/vm/user-docs/stdlib/sys.md b/docs/miden/vm/user-docs/stdlib/sys.md
new file mode 100644
index 000000000..f7be395ef
--- /dev/null
+++ b/docs/miden/vm/user-docs/stdlib/sys.md
@@ -0,0 +1,5 @@
+Module `std::sys` contains a set of system-level utility procedures.
+
+| Procedure      | Description   |
+| -------------- | ------------- |
+| truncate_stack | Removes elements deep in the stack until the depth of the stack is exactly 16. The elements are removed in such a way that the top 16 elements of the stack remain unchanged. If the stack would otherwise contain more than 16 elements at the end of execution, then adding a call to this function at the end will reduce the size of the public inputs that are shared with the verifier.<br/>Input: Stack with 16 or more elements.<br/> Output: Stack with only the original top 16 elements. |
diff --git a/mkdocs.yml b/mkdocs.yml
index 29aad239f..4f2a623ae 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -360,7 +360,67 @@ nav:
           - Concepts:
               - Crypto primitives:
                   - Tiered sparce Merkle tree (TSMT):  miden/concepts/crypto-primitives/tsmt.md
-      - Miden VM docs: https://0xpolygonmiden.github.io/miden-vm/
+      - Miden VM:
+          - Introduction:
+              - Introduction: miden/vm/intro/index.md
+              - Overview: miden/vm/intro/overview.md
+              - Usage: miden/vm/intro/usage.md
+              - Performance: miden/vm/intro/performance.md
+          - Development tooling:
+              - Overview: miden/vm/tools/index.md
+              - Debugger: miden/vm/tools/debugger.md
+              - REPL: miden/vm/tools/repl.md
+          - Miden programming reference:
+              - Miden programming reference: miden/vm/user-docs/index.md
+              - Assembly:
+                  - Miden assembly:  miden/vm/user-docs/assembly/index.md
+                  - Code organization:  miden/vm/user-docs/assembly/code-organization.md
+                  - Execution contexts:  miden/vm/user-docs/assembly/execution-contexts.md
+                  - Flow control:  miden/vm/user-docs/assembly/flow-control.md
+                  - Field operations:  miden/vm/user-docs/assembly/field-operations.md
+                  - u32 operations:  miden/vm/user-docs/assembly/u32-operations.md
+                  - Stack manipulation:  miden/vm/user-docs/assembly/stack-manipulation.md
+                  - IO operations:  miden/vm/user-docs/assembly/io-operations.md
+                  - Cryptographic operations:  miden/vm/user-docs/assembly/cryptographic-operations.md
+                  - Debugging:  miden/vm/user-docs/assembly/debugging.md
+              - Standard library:
+                  - Miden standard library: miden/vm/user-docs/stdlib/index.md
+                  - Collections:  miden/vm/user-docs/stdlib/collections.md
+                  - Crypto:
+                      - Digital signatures:  miden/vm/user-docs/stdlib/crypto/dsa.md
+                      - FRI verification procedures:  miden/vm/user-docs/stdlib/crypto/fri.md
+                      - Cryptographic hashes:  miden/vm/user-docs/stdlib/crypto/hashes.md
+                  - Math:
+                      - Unsigned 64-bit integer operations:  miden/vm/user-docs/stdlib/math/u64.md
+                  - Memory procedures:  miden/vm/user-docs/stdlib/mem.md
+                  - System procedures:  miden/vm/user-docs/stdlib/sys.md
+          - Design:
+              - Design: miden/vm/design/index.md
+              - Programs:  miden/vm/design/programs.md
+              - Decoder:
+                  - Decoder: miden/vm/design/decoder/index.md
+                  - Constraints:  miden/vm/design/decoder/constraints.md
+              - Operand stack:
+                  - Operand stack: miden/vm/design/stack/index.md
+                  - Operation constraints:  miden/vm/design/stack/op-constraints.md
+                  - System operations:  miden/vm/design/stack/system-ops.md
+                  - Field operations:  miden/vm/design/stack/field-ops.md
+                  - u32 operations:  miden/vm/design/stack/u32-ops.md
+                  - Stack manipulation:  miden/vm/design/stack/stack-ops.md
+                  - Input/output operations:  miden/vm/design/stack/io-ops.md
+                  - Cryptographic operations:  miden/vm/design/stack/crypto-ops.md
+              - Range checker:  miden/vm/design/range.md
+              - Chiplets:
+                  - miden/vm/design/chiplets/index.md
+                  - Hash chiplet:  miden/vm/design/chiplets/hasher.md
+                  - Bitwise chiplet:  miden/vm/design/chiplets/bitwise.md
+                  - Memory chiplet:  miden/vm/design/chiplets/memory.md
+                  - Kernel ROM chiplet:  miden/vm/design/chiplets/kernel_rom.md 
+              - Lookups:
+                  - miden/vm/design/lookups/index.md
+                  - Multiset checks:  miden/vm/design/lookups/multiset.md
+                  - LogUp: miden/vm/design/lookups/logup.md
+          - Background material: miden/vm/background.md
   - Developer tools: 
       - Developer tools: tools/index.md
       - Smart contract development: 
@@ -428,22 +488,22 @@ nav:
                   - withdrawStartWithMetaData:  tools/matic-js/pos/erc721/withdraw-start-with-meta-data.md
                   - withdrawStart:  tools/matic-js/pos/erc721/withdraw-start.md
               - ERC1155:
-                  - approve-all-for-mintable:  tools/matic-js/pos/erc1155/approve-all-for-mintable.md
-                  - approve-all:  tools/matic-js/pos/erc1155/approve-all.md
-                  - deposit-many:  tools/matic-js/pos/erc1155/deposit-many.md
+                  - approveAllForMintable:  tools/matic-js/pos/erc1155/approve-all-for-mintable.md
+                  - approveAll:  tools/matic-js/pos/erc1155/approve-all.md
+                  - depositMany:  tools/matic-js/pos/erc1155/deposit-many.md
                   - deposit:  tools/matic-js/pos/erc1155/deposit.md
-                  - get-balance:  tools/matic-js/pos/erc1155/get-balance.md
+                  - getBalance:  tools/matic-js/pos/erc1155/get-balance.md
                   - index:  tools/matic-js/pos/erc1155/index.md
-                  - is-approved-all:  tools/matic-js/pos/erc1155/is-approved-all.md
-                  - is-withdraw-exited-many:  tools/matic-js/pos/erc1155/is-withdraw-exited-many.md
-                  - is-withdraw-exited:  tools/matic-js/pos/erc1155/is-withdraw-exited.md
+                  - isApprovedAll:  tools/matic-js/pos/erc1155/is-approved-all.md
+                  - isWithdrawExitedMany:  tools/matic-js/pos/erc1155/is-withdraw-exited-many.md
+                  - isWithdrawExited:  tools/matic-js/pos/erc1155/is-withdraw-exited.md
                   - transfer:  tools/matic-js/pos/erc1155/transfer.md
-                  - withdraw-exit-faster-many:  tools/matic-js/pos/erc1155/withdraw-exit-faster-many.md
-                  - withdraw-exit-faster:  tools/matic-js/pos/erc1155/withdraw-exit-faster.md
-                  - withdraw-exit-many:  tools/matic-js/pos/erc1155/withdraw-exit-many.md
-                  - withdraw-exit:  tools/matic-js/pos/erc1155/withdraw-exit.md
-                  - withdraw-start-many:  tools/matic-js/pos/erc1155/withdraw-start-many.md
-                  - withdraw-start:  tools/matic-js/pos/erc1155/withdraw-start.md
+                  - withdrawExitFasterMany:  tools/matic-js/pos/erc1155/withdraw-exit-faster-many.md
+                  - withdrawExitFaster:  tools/matic-js/pos/erc1155/withdraw-exit-faster.md
+                  - withdrawExitMany:  tools/matic-js/pos/erc1155/withdraw-exit-many.md
+                  - withdrawExit:  tools/matic-js/pos/erc1155/withdraw-exit.md
+                  - withdrawStartMany:  tools/matic-js/pos/erc1155/withdraw-start-many.md
+                  - withdrawStart:  tools/matic-js/pos/erc1155/withdraw-start.md
               - Common methods:
                   - Deposit ETH:  tools/matic-js/pos/deposit-ether.md
                   - isCheckPointed:  tools/matic-js/pos/is-check-pointed.md