diff --git a/backends/tofino/CMakeLists.txt b/backends/tofino/CMakeLists.txt index 8ceaf2e1eb2..64d52bf83f1 100644 --- a/backends/tofino/CMakeLists.txt +++ b/backends/tofino/CMakeLists.txt @@ -23,9 +23,6 @@ if (CMAKE_BUILD_TYPE STREQUAL Release OR CMAKE_BUILD_TYPE STREQUAL RelWithDebInf add_definitions("-DRELEASE_BUILD=1") endif() -# JBay is always enabled, the preprocessor guard is deprecated -add_definitions("-DHAVE_JBAY=1") - list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") if (ENABLE_STATIC_LIBS) @@ -85,8 +82,7 @@ else() set (BFN_P4C_GIT_SHA $ENV{BFN_P4C_GIT_SHA}) endif() endif() -set (ENV{P4C_VERSION} "${BFN_P4C_VERSION} (SHA: ${BFN_P4C_GIT_SHA})") -MESSAGE(STATUS "p4c-barefoot version: $ENV{P4C_VERSION}") +MESSAGE(STATUS "p4c-barefoot version: ${BFN_P4C_VERSION}") # Generate the sha specific version file. It includes the GIT SHA. # Because this version changes frequently, we include it separately from the normal version files. @@ -229,6 +225,7 @@ set (BF_P4C_IR_SRCS bf-p4c/parde/match_register.cpp bf-p4c/parde/clot/clot.cpp bf-p4c/phv/phv.cpp + # FIXME: This should be a library. bf-utils/dynamic_hash/dynamic_hash.cpp bf-utils/dynamic_hash/bfn_hash_algorithm.cpp ) @@ -240,3 +237,6 @@ endforeach() set(EXTENSION_IR_SOURCES ${EXTENSION_IR_SOURCES} ${QUAL_BF_P4C_IR_SRCS} PARENT_SCOPE) add_subdirectory(bf-p4c) + +# Initialize bf-asm after bf-p4c. +add_subdirectory(bf-asm) diff --git a/backends/tofino/LICENSE b/backends/tofino/LICENSE index a24a1c32224..bc47beb02fe 100644 --- a/backends/tofino/LICENSE +++ b/backends/tofino/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2024 Intel Corporation +Copyright (C) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy diff --git a/backends/tofino/bf-asm/.gdbinit b/backends/tofino/bf-asm/.gdbinit new file mode 100644 index 00000000000..90702882411 --- /dev/null +++ b/backends/tofino/bf-asm/.gdbinit @@ -0,0 +1,402 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 + +# vim: ft=python +set print object +set unwindonsignal on +set unwind-on-terminating-exception on + +if $_isvoid($bpnum) + break __assert_fail + break error + break bug +end + +define d + call ::dump($arg0) +end + + +python +def template_split(s): + parts = [] + bracket_level = 0 + current = [] + for c in (s): + if c == "," and bracket_level == 1: + parts.append("".join(current)) + current = [] + else: + if c == '>': + bracket_level -= 1 + if bracket_level > 0: + current.append(c) + if c == '<': + bracket_level += 1 + parts.append("".join(current)) + return parts + +def vec_begin(vec): + return vec['_M_impl']['_M_start'] +def vec_end(vec): + return vec['_M_impl']['_M_finish'] +def vec_size(vec): + return int(vec_end(vec) - vec_begin(vec)) +def vec_at(vec, i): + return (vec_begin(vec) + i).dereference() + +class bitvecPrinter(object): + "Print a bitvec" + def __init__(self, val): + self.val = val + def to_string(self): + data = self.val['data'] + rv = "" + size = self.val['size'] + ptr = self.val['ptr'] + unitsize = ptr.type.target().sizeof * 8 + while size > 1: + data = ptr.dereference() + i = 0 + while i < unitsize: 
+ if (rv.__len__() % 120 == 119): rv += ':' + elif (rv.__len__() % 30 == 29): rv += ' ' + elif (rv.__len__() % 6 == 5): rv += '_' + if (data & 1) == 0: + rv += "0" + else: + rv += "1" + data >>= 1 + i += 1 + ptr += 1 + size -= 1 + data = ptr.dereference() + while rv == "" or data > 0: + if (rv.__len__() % 120 == 119): rv += ':' + elif (rv.__len__() % 30 == 29): rv += ' ' + elif (rv.__len__() % 6 == 5): rv += '_' + if (data & 1) == 0: + rv += "0" + else: + rv += "1" + data >>= 1 + return rv +class value_t_Printer(object): + "Print a value_t" + def __init__(self, val): + self.val = val + def to_string(self): + typ = self.val['type'] + if typ == 0: # tINT + return str(self.val['i']) + elif typ == 1: # tBIGINT + v = self.val['bigi'] + data = v['data'] + size = v['size'] + val = 0 + while size > 0: + val <<= 64 + val += data.dereference() + size -= 1 + data += 1 + return str(val) + elif typ == 2: # tRANGE + return str(self.val['lo']) + '..' + str(self.val['hi']) + elif typ == 3: # tSTR + return self.val['s'] + elif typ == 4: # tMATCH + return self.val['m'] + elif typ == 5: # tBIGMATCH + return self.val['bigm'] + elif typ == 6: # tVEC + return "vector of %d elements" % self.val['vec']['size'] + elif typ == 7: # tMAP + return "map of %d elements" % self.val['map']['size'] + elif typ == 8: # tCMD + cmd = self.val['vec']['data'] + count = self.val['vec']['size'] + rv = str(cmd.dereference()) + rv += "(" + while count > 1: + count -= 1 + cmd += 1 + rv += str(cmd.dereference()) + if count > 1: + rv += ", " + rv += ")" + return rv; + else: + return "" + class _vec_iter: + def __init__(self, data, size): + self.data = data + self.size = size + self.counter = -1 + def __iter__(self): + return self + def __next__(self): + self.counter += 1 + if self.counter >= self.size: + raise StopIteration + item = self.data.dereference() + self.data += 1 + return ("[%d]" % self.counter, item) + def next(self): return self.__next__() + class _map_iter: + def __init__(self, data, size): + 
self.data = data + self.size = size + def __iter__(self): + return self + def __next__(self): + self.size -= 1 + if self.size < 0: + raise StopIteration + item = self.data.dereference() + self.data += 1 + return ("[" + str(item['key']) + "]", item['value']) + def next(self): return self.__next__() + + class _not_iter: + def __init__(self): + pass + def __iter__(self): + return self + def __next__(self): + raise StopIteration + def next(self): return self.__next__() + def children(self): + typ = self.val['type'] + if typ == 6: + vec = self.val['vec'] + return self._vec_iter(vec['data'], vec['size']) + elif typ == 7: + map = self.val['map'] + return self._map_iter(map['data'], map['size']) + else: + return self._not_iter() +class value_t_VECTOR_Printer(object): + "Print a VECTOR(value_t)" + def __init__(self, val): + self.val = val + def to_string(self): + return "vector of %d elements" % self.val['size'] + class _iter: + def __init__(self, data, size): + self.data = data + self.size = size + self.counter = -1 + def __iter__(self): + return self + def __next__(self): + self.counter += 1 + if self.counter >= self.size: + raise StopIteration + item = self.data.dereference() + self.data += 1 + return ("[%d]" % self.counter, item) + def next(self): return self.__next__() + def children(self): + return self._iter(self.val['data'], self.val['size']) +class pair_t_VECTOR_Printer(object): + "Print a VECTOR(pair_t)" + def __init__(self, val): + self.val = val + def to_string(self): + return "map of %d elements" % self.val['size'] + class _iter: + def __init__(self, data, size): + self.data = data + self.size = size + def __iter__(self): + return self + def __next__(self): + self.size -= 1 + if self.size < 0: + raise StopIteration + item = self.data.dereference() + self.data += 1 + return ("[" + str(item['key']) + "]", item['value']) + def next(self): return self.__next__() + def children(self): + return self._iter(self.val['data'], self.val['size']) +class 
ordered_map_Printer: + "Print an ordered_map<>" + def __init__(self, val): + self.val = val + self.args = template_split(val.type.tag) + self.eltype = gdb.lookup_type('std::pair<' + self.args[0] + ' const,' + self.args[1] + '>') + def to_string(self): + it = self.val['data']['_M_impl']['_M_node']['_M_next'] + e = self.val['data']['_M_impl']['_M_node'].address + if it == e: # empty map + return "{}" + else: + return None + class _iter: + def __init__(self, eltype, it, e): + self.eltype = eltype + self.it = it + self.e = e + def __iter__(self): + return self + def __next__(self): + if self.it == self.e: + raise StopIteration + el = (self.it + 1).cast(self.eltype.pointer()).dereference() + self.it = self.it.dereference()['_M_next'] + return ("[" + str(el['first']) + "]", el['second']); + def next(self): return self.__next__() + def children(self): + return self._iter(self.eltype, self.val['data']['_M_impl']['_M_node']['_M_next'], + self.val['data']['_M_impl']['_M_node'].address) +class InputXbar_Group_Printer: + "Print an InputXbar::Group" + def __init__(self, val): + self.val = val + def to_string(self): + types = [ 'invalid', 'exact', 'ternary', 'byte', 'gateway', 'xcmp' ] + t = int(self.val['type']) + if t >= 0 and t < len(types): + rv = types[t] + else: + rv = '' % int(self.val['type']) + rv += ' group ' + str(self.val['index']) + return rv +class ActionBusSource_Printer: + "Print an ActionBusSource" + def __init__(self, val): + self.val = val + def to_string(self): + try: + types = [ "None", "Field", "HashDist", "HashDistPair", "RandomGen", + "TableOutput", "TableColor", "TableAddress", "Ealu", "XCmp", + "NameRef", "ColorRef", "AddressRef" ] + t = int(self.val['type']) + if t >= 0 and t < len(types): + rv = types[t] + else: + rv = '' % int(self.val['type']) + if t == 9: # XCMP on one line without children + rv += "[" + str(self.val['xcmp_group']) + ":" + str(self.val['xcmp_byte']) + "]" + except Exception as e: + rv += "{crash: "+str(e)+"}" + return rv + class 
_iter: + def __init__(self, val, type): + self.val = val + self.type = type + self.count = 0 + def __iter__(self): + return self + def __next__(self): + self.count = self.count + 1 + if self.type == 3: + if self.count == 1: + return ("hd1", self.val['hd1']) + elif self.count == 2: + return ("hd2", self.val['hd2']) + else: + raise StopIteration + #elif self.type == 9: + # XCmp on one line without children + # if self.count == 1: + # return ("group", self.val['xcmp_group']) + # elif self.count == 2: + # return ("byte", self.val['xcmp_byte']) + elif self.count > 1: + raise StopIteration + elif self.type == 1: + return ("field", self.val['field'].dereference()) + elif self.type == 2: + return ("hd", self.val['hd']) + elif self.type == 4: + return ("rng", self.val['rng']) + elif self.type == 5 or self.type == 6 or self.type == 7: + return ("table", self.val['table']) + elif self.type == 10 or self.type == 11 or self.type == 12: + return ("name_ref", self.val['name_ref']) + raise StopIteration + def next(self): return self.__next__() + def children(self): + return self._iter(self.val, int(self.val['type'])) + +class PhvRef_Printer: + "Print a Phv::Ref" + def __init__(self, val): + self.val = val + def to_string(self): + threads = [ "ig::", "eg::", "gh::" ] + rv = threads[self.val['gress_']] + str(self.val['name_']) + if self.val['lo'] >= 0: + rv += '(' + str(self.val['lo']) + if self.val['hi'] >= 0: + rv += '..' 
+ str(self.val['hi']) + rv += ')' + return rv + +class Mem_Printer: + "Print a MemUnit or subclass" + def __init__(self, val, big, small): + self.val = val + self.big = big + self.small = small + def to_string(self): + if self.val['stage'] > -32768: + return "%s(%d,%d,%d)" % (self.big, self.val['stage'], self.val['row'], self.val['col']) + if self.val['row'] >= 0: + return "%s(%d,%d)" % (self.big, self.val['row'], self.val['col']) + return "%s(%d)" % (self.small, self.val['col']) + +def bfas_pp(val): + if val.type.tag == 'bitvec': + return bitvecPrinter(val) + if val.type.tag == 'value_t': + return value_t_Printer(val) + if val.type.tag == 'value_t_VECTOR': + return value_t_VECTOR_Printer(val) + if val.type.tag == 'pair_t_VECTOR': + return pair_t_VECTOR_Printer(val) + if str(val.type.tag).startswith('ordered_map<'): + return ordered_map_Printer(val) + if val.type.tag == 'InputXbar::Group': + return InputXbar_Group_Printer(val) + if val.type.tag == 'ActionBusSource': + return ActionBusSource_Printer(val) + if val.type.tag == 'Phv::Ref': + return PhvRef_Printer(val) + if val.type.tag == 'SRamMatchTable::Ram': + return Mem_Printer(val, 'Ram', 'Lamb') + if val.type.tag == 'MemUnit': + return Mem_Printer(val, 'Mem', 'Mem') + return None + +try: + found = False + for i in range(len(gdb.pretty_printers)): + try: + if gdb.pretty_printers[i].__name__ == "bfas_pp": + gdb.pretty_printers[i] = bfas_pp + found = True + except: + pass + if not found: + gdb.pretty_printers.append(bfas_pp) +except: + pass + +end diff --git a/backends/tofino/bf-asm/.gitignore b/backends/tofino/bf-asm/.gitignore new file mode 100644 index 00000000000..0651a09e2dc --- /dev/null +++ b/backends/tofino/bf-asm/.gitignore @@ -0,0 +1,27 @@ +Makefile.in +aclocal.m4 +autom4te.cache +build +compile +configure +depcomp +install-sh +missing +ylwrap +*.o +*.d +*.out +*.tofino +*.pyc +gen +templates +asm-parse.c +lex-yaml.c +json2cpp +json_diff +mksizes +reflow +tags +tfas +y.output +faillog.txt diff --git 
a/backends/tofino/bf-asm/CMakeLists.txt b/backends/tofino/bf-asm/CMakeLists.txt new file mode 100644 index 00000000000..5afc518db18 --- /dev/null +++ b/backends/tofino/bf-asm/CMakeLists.txt @@ -0,0 +1,310 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 + +# # # #### Tofino assembler +project(BFASM) + +MESSAGE("-- Adding bf-asm") + +OPTION(ASAN_ENABLED "Enable ASAN checks" OFF) + +set (BFASM_LIB_DEPS p4ctoolkit ${P4C_LIB_DEPS}) +set (BFASM_GEN_DIR ${BFASM_BINARY_DIR}/gen) + +# other required libraries +include (CheckLibraryExists) +# check includes +include (CheckIncludeFile) +check_include_file (execinfo.h HAVE_EXECINFO_H) +check_include_file (ucontext.h HAVE_UCONTEXT_H) + +set(CMAKE_CXX_FLAGS "") # clear CXX_FLAGS +# TODO: Fix build warnings with -Wall and enable it. 
+# add_cxx_compiler_option ("-Wall") +# add_cxx_compiler_option ("-Wextra") +# add_cxx_compiler_option ("-Wno-unused") +# add_cxx_compiler_option ("-Wno-unused-parameter") +# add_cxx_compiler_option ("-Wno-pragmas") +# add_cxx_compiler_option ("-Wno-unknown-pragmas") +add_cxx_compiler_option ("-Wno-overloaded-virtual") +add_cxx_compiler_option ("-Wno-deprecated") +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES i386|i586|i686) + # on 32-bit platforms we get a lot of warnings when using the error macros + add_cxx_compiler_option("-Wno-write-strings") +endif() +if (ENABLE_BAREFOOT_INTERNAL) + add_definitions("-DBAREFOOT_INTERNAL=1") +endif() + +message(STATUS "P4C ${P4C_SOURCE_DIR}") +macro(get_schema_version schema_file schema_var) + execute_process( + COMMAND python3 -c "from ${schema_file} import get_schema_version;print(get_schema_version(), end='', flush=True)" + OUTPUT_VARIABLE __schema_version + RESULT_VARIABLE __schema_errcode + ERROR_VARIABLE __schema_errstr + WORKING_DIRECTORY ${BFN_P4C_SOURCE_DIR}/compiler_interfaces/schemas) + if (${__schema_errcode}) + MESSAGE(FATAL_ERROR "Error retrieving ${schema_file} version ${__schema_errstr}") + endif() + set(${schema_var} ${__schema_version}) +endmacro(get_schema_version) +# Now force cmake to rerun if any of the files that we depend on versions for +# change: context and manifest for now +# We generate a pair of dummy dependency files will be ignored +set(SCHEMA_FILES + ${BFN_P4C_SOURCE_DIR}/compiler_interfaces/schemas/context_schema.py + ${BFN_P4C_SOURCE_DIR}/compiler_interfaces/schemas/manifest_schema.py + ${BFN_P4C_SOURCE_DIR}/compiler_interfaces/schemas/phv_schema.py + ${BFN_P4C_SOURCE_DIR}/compiler_interfaces/schemas/power_schema.py + ${BFN_P4C_SOURCE_DIR}/compiler_interfaces/schemas/resources_schema.py + ) +foreach (f ${SCHEMA_FILES}) + configure_file(${f} ${CMAKE_BINARY_DIR}/${f}.dep) +endforeach() + +get_schema_version(context_schema CONTEXT_SCHEMA_VERSION) +MESSAGE(STATUS "Found context schema version 
${CONTEXT_SCHEMA_VERSION}") +add_definitions("-DCONTEXT_SCHEMA_VERSION=\"${CONTEXT_SCHEMA_VERSION}\"") + +# ASAN CHECKS +if (ASAN_ENABLED) + # force this set of flags only + set (CMAKE_CXX_FLAGS "-fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer -fno-optimize-sibling-calls -g -O1") +endif() + +# json_diff +set (JSONDIFF_SOURCES json_diff.cpp json.cpp fdstream.cpp) + +# bfdumpbin +set (BFDUMPBIN_SOURCES bfdumpbin.cpp fdstream.cpp json.cpp bson.cpp) + +# bfdis +set (BFDIS_SOURCES bfdis.cpp disasm.cpp fdstream.cpp) + +# reflow +set (REFLOW_SOURCES reflow.cpp) + +# b2j +set (B2J_SOURCES b2j.cpp json.cpp bson.cpp) + +# j2b +set (J2B_SOURCES j2b.cpp json.cpp bson.cpp) + +# mksizes +set (MKSIZES_SOURCES mksizes.cpp) + +# json_diff +add_executable (json_diff ${JSONDIFF_SOURCES}) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(json_diff PUBLIC "-Wno-error") + +# bfdumpbin +add_executable (bfdumpbin ${BFDUMPBIN_SOURCES}) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(bfdumpbin PUBLIC "-Wno-error") + +# reflow +add_executable (reflow ${REFLOW_SOURCES}) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(reflow PUBLIC "-Wno-error") + +# b2j +add_executable (b2j ${B2J_SOURCES}) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(b2j PUBLIC "-Wno-error") + +# j2b +add_executable (j2b ${J2B_SOURCES}) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(j2b PUBLIC "-Wno-error") + +# mksizes +add_executable (mksizes ${MKSIZES_SOURCES}) +# Disable errors for warnings. FIXME: Get rid of this. 
+target_compile_options(mksizes PUBLIC "-Wno-error") + +set (BFAS_COMMON_SOURCES + action_bus.cpp + action_table.cpp + asm-types.cpp + atcam_match.cpp + attached_table.cpp + bfas.cpp + bson.cpp + counter.cpp + crash.cpp + deparser.cpp + depositfield.cpp + dynhash.cpp + error_mode.cpp + exact_match.cpp + exename.cpp + flexible_headers.cpp + gateway.cpp + hash_action.cpp + hash_dist.cpp + hashexpr.cpp + idletime.cpp + input_xbar.cpp + instruction.cpp + json.cpp + match_table.cpp + meter.cpp + misc.cpp + p4_table.cpp + parser-tofino-jbay.cpp + phase0.cpp + phv.cpp + primitives.cpp + proxy_hash.cpp + salu_inst.cpp + selection.cpp + sram_match.cpp + stage.cpp + stateful.cpp + synth2port.cpp + tables.cpp + target.cpp + ternary_match.cpp + top_level.cpp + ubits.cpp + vector.c + widereg.cpp + # FIXME: This should be a library. + ${BFN_P4C_SOURCE_DIR}/bf-utils/dynamic_hash/dynamic_hash.cpp + ${BFN_P4C_SOURCE_DIR}/bf-utils/dynamic_hash/bfn_hash_algorithm.cpp + ) + + +BISON_TARGET (asm-parse asm-parse.ypp ${BFASM_GEN_DIR}/asm-parse.cpp VERBOSE) + +add_custom_command(OUTPUT ${BFASM_GEN_DIR}/uptr_sizes.h + COMMAND ${CMAKE_COMMAND} -E make_directory ${BFASM_GEN_DIR} + COMMAND ${BFASM_BINARY_DIR}/mksizes > ${BFASM_GEN_DIR}/uptr_sizes.h) +add_custom_target(bfasm_uptr DEPENDS mksizes ${BFASM_GEN_DIR}/uptr_sizes.h) + +add_custom_command(OUTPUT ${BFASM_GEN_DIR}/lex-yaml.c + COMMAND ${FLEX_EXECUTABLE} -t ${BFASM_SOURCE_DIR}/lex-yaml.l > ${BFASM_GEN_DIR}/lex-yaml.c + DEPENDS ${BFASM_SOURCE_DIR}/lex-yaml.l + COMMENT "Generating lex-yaml.cpp") +add_custom_target(bfasm_yaml DEPENDS ${BFASM_GEN_DIR}/lex-yaml.c) +add_dependencies(bfasm_yaml bfasm_uptr) + +set (BFAS_GEN_SOURCES + ${BFASM_GEN_DIR}/asm-parse.cpp + ${BFASM_GEN_DIR}/uptr_sizes.h +) + +set (BFASM_WALLE ${BFASM_SOURCE_DIR}/walle/walle.py) +set (WALLE_SOURCES + ${BFASM_SOURCE_DIR}/walle/chip.py + ${BFASM_SOURCE_DIR}/walle/csr.py + ${BFASM_SOURCE_DIR}/walle/walle.py) + +add_subdirectory (tofino) +add_subdirectory (jbay) +set 
(BFASM_LIBS ${BFASM_LIBS} regs_jbay regs_tofino) + +# Other configuration files that need to be generated +configure_file ("${BFASM_SOURCE_DIR}/cmake/config.h.cmake" "${BFASM_BINARY_DIR}/config.h") + +set_source_files_properties (${BFAS_GEN_SOURCES} ${BFASM_GEN_DIR}/lex-yaml.c PROPERTIES GENERATED TRUE) + +set (BFAS_SOURCES ${BFAS_COMMON_SOURCES} ${BFAS_GEN_SOURCES} + ${BFAS_TOFINO_SRCS} + ${BFAS_JBAY_SRCS} +) + +# bfdis +if (ENABLE_GTESTS) + # FIXME -- bfdis depends on bfas_lib which is only built if GTESTS are enabled. So for + # now we only enable bfdis with ENABLE_GTESTS. Should fix to use bfas_lib for bfas + # rather than building separately, so it will always be enabled. + add_executable (bfdis ${BFDIS_SOURCES}) + target_link_libraries (bfdis bfas_lib ${BFASM_LIBS} ${BFASM_LIB_DEPS}) +endif() + + +set_source_files_properties(${BFAS_SOURCES} PROPERTIES COMPILE_FLAGS ${BFASM_CXX_FLAGS}) +# Remove compiler flag that is C++ only for vector.c +string(REPLACE "-Wno-overloaded-virtual" "" vector_c_flags ${BFASM_CXX_FLAGS}) +set_source_files_properties(vector.c PROPERTIES COMPILE_FLAGS ${vector_c_flags}) +add_executable (bfas ${BFAS_SOURCES}) +# Enable extensions for bfas. FIXME: Do we need this? +target_compile_options(bfas PRIVATE -std=gnu++17) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(bfas PUBLIC "-Wno-error") +target_link_libraries (bfas ${BFASM_LIBS} ${BFASM_LIB_DEPS}) +add_dependencies(bfas bfasm_yaml) + +install (TARGETS bfas + RUNTIME DESTINATION bin) +# Link bfas into the p4c binary folder. 
+add_custom_target(linkbfas + COMMAND ${CMAKE_COMMAND} -E create_symlink ${BFASM_BINARY_DIR}/bfas ${P4C_BINARY_DIR}/bfas + ) +add_dependencies(linkbfas bfas) +add_dependencies(p4c_driver linkbfas) + + +string(CONFIGURE "/^DECLARE_(ABSTRACT_)?TABLE_TYPE\(([a-zA-Z0-9_]+)/2/c/" CTAGS_CXX_REGEXP @ONLY) +add_custom_target(ctags-asm + COMMAND ctags -R -I VECTOR "--regex-C++=${CTAGS_CXX_REGEXP}" + COMMENT "Generating ctags") + + +if (ENABLE_GTESTS) + # TODO Components need to be built, once, into intermediate libraries. + # These lib would then be linked to unit-tests and also linked into larger components/executable. + # The exact shape, size, hierarchy of components is to be decided. + # For now we will allow the source to be built twice, once for 'bfas', and once for + # gtest/CMakeList.txt as a single monolithic component 'bfas_lib'. + # TODO ASAP refactor bfas.cpp, moving main() into its own file. + # add_executable (bfas asm_main.cpp) + # target_link_libraries (bfas bfas_lib ${BFASM_LIBS} ${BFASM_LIB_DEPS}) + + add_library (bfas_lib ${BFAS_SOURCES}) + target_compile_definitions(bfas_lib PRIVATE BUILDING_FOR_GTEST) # removes main() + target_link_libraries (bfas_lib PRIVATE ${BFASM_LIBS} ${BFASM_LIB_DEPS}) + + set(BFAS_GTEST_SOURCES + gtest/gtestasm.cpp + gtest/asm-types.cpp + gtest/depositfield.cpp + gtest/gateway.cpp + gtest/hashexpr.cpp + gtest/mirror.cpp + gtest/parser-test.cpp + gtest/register-matcher.h + gtest/register-matcher.cpp + ) + + # Do not use a unity build for gtestasm (for now). + set_source_files_properties (${BFAS_GTEST_SOURCES} PROPERTIES SKIP_UNITY_BUILD_INCLUSION TRUE) + + add_executable (gtestasm ${BFAS_GTEST_SOURCES} ${BFP4C_SOURCES}) + target_link_libraries (gtestasm PRIVATE bfas_lib gtest ${BFASM_LIB_DEPS}) + target_compile_options (gtestasm PRIVATE -Wall -Wextra -ggdb -O3 + -Wno-unused-parameter -Wno-sign-compare) + # Disable errors for warnings. FIXME: Get rid of this. 
+ target_compile_options(gtestasm PUBLIC "-Wno-error") + + # Add to CTests - but this is in the BFASM project viz build/bf-asm, not build/p4c + add_test (NAME gtestasm COMMAND gtestasm WORKING_DIRECTORY ${P4C_BINARY_DIR}) + set_tests_properties (gtestasm PROPERTIES LABELS "gtest") +endif () diff --git a/backends/tofino/bf-asm/Options.md b/backends/tofino/bf-asm/Options.md new file mode 100644 index 00000000000..05dba981d2a --- /dev/null +++ b/backends/tofino/bf-asm/Options.md @@ -0,0 +1,98 @@ +# bfas command line options + +usage: bfas [ options ] file.bfa + +### general options + +* -h + help + +* --target *target* + + specify the target (obsolete as target is generally specified in the .bfa file) + +* -Werror + + treat warnings as errors + +### options for controlling output + +* -a +* --allpipes + + Generate a binary that has explicit writes for all pipes, rather than just one + +* -s +* --singlepipe +* --pipe*N* + +* -G +* --gen\_json + + Generate .cfg.json files instead of binary + +* --no-bin +* --num-stages-override*N* + +* -M + + Attempt to match glass bit-for-bit + +* -o *directory* + + Generate output in the specified directory rather than in the current working dir + +### options for controlling cfg details + +* -C + condense json by stripping out unset subtrees (default) + +* --disable-egress-latency-padding + + Disable the padding of egress latency to avoid tofino1 TM overrun bus + +* --disable-longbranch +* --enable-longbranch + + Disable or enable support for long branches + +* --disable-tof2lab44-workaround + +* --high\_availability\_disabled +* --multi-parsers +* --no-condense +* --noop-fill-instruction *opcode* + + Insert instructions (of the form *opcode* R, R, R) for noop slots in VLIW instructions + where the slot is not used by any action in the stage. 
*opcode* must be one that is an + identity function when applied to two copies of the same value (and, or, alu\_a, alu\_b, + mins, maxs, minu, or maxu) + +* -p + Disable power gating + +* --singlewrite +* --stage\_dependency\_pattern *pattern* +* --table-handle-offset*N* + +### options for logging/debugging + +* -l *file* + + redirect logging output to file + +* --log-hashes + +* -q + + disable all logging output + +* --no-warn + +* -T *debug spec* + + enable logging of specific source files and specific levels + +* -v + + increase logging verbosity diff --git a/backends/tofino/bf-asm/README.md b/backends/tofino/bf-asm/README.md new file mode 100644 index 00000000000..ed0f50f1ae0 --- /dev/null +++ b/backends/tofino/bf-asm/README.md @@ -0,0 +1,375 @@ +# Tofino Assembler + +## Documentation + +## Setup + +The repository contains code for the Barefoot assembler (bfas) and linker (walle). +More info on walle can be found in walle/README.md. + +Assembler takes assembly files (.bfa or .tfa) as input to generate output json which is +then fed to walle to produce binary for tofino. + +## Dependencies + +- GNU make +- A C++ compiler supporting C++11 (the Makefile uses g++ by default) +- bison +- flex + +Running the test suite requires access to the Glass p4c\_tofino compiler. +Running stf tests requires access to the simple test harness. The +`tests/runtests` script will look in various places for these tools (see the top +of the script) + +## Building Assembler + +The assembler is built automatically as part of the full bf-p4c-tofino build; there +is currently no supported standalone method of building the assembler by itself. + +## Address Sanitizer checks + +(obsolete) +To enable address sanitizer checks in the assembler use, + +``` +user@box$ ./bootstrap.sh --enable-asan-checks +``` + +Or alternatively, + +``` +user@box$ ./configure --enable-asan-checks +``` + +This configures the Makefile to add -fsanitizer=address & -fsanitizer=undefined. 
+By default the leak sanitizer is also enabled along with the address sanitizer. +You can disable it by setting environment variable ASAN\_OPTIONS with +"detect\_leaks=0". + +## Testing + +### Make Targets + +``` +user@box$ make check +``` + +Runs tests/runtests script on all .p4 files in the tests and tests/mau +directories and .bfa files in tests/asm directory. This script can run one or +more tests specified on the command line, or will run all .p4 files in the +current directory if run with no arguments. Stf tests can be run if specified +explicitly on the command line; they will not run by default. + +``` +user@box$ make check-sanity +``` + +This is similar to `make check` but will only run on .p4 files in the tests +directory which is a small subset for a quick sanity check. + +### Runtests Script + +The ./tests/runtests script will first run glass compiler (p4c-tofino) on +input .p4 file and then run the assembler (bfas) on generated assembly (.tfa) +file. Glass also generates output json which is then compared (by the script) +to the json generated from assembler. + +To skip running glass use -f option on the runtests script + +Use -j to run parallel threads. If invoking through Make targets set +MAKEFLAGS to "-j " + +### Expected Failures + +expected\_failures.txt files are under tests & tests/mau directory which outline +failing tests with cause (compile, bfas, mismatch). These files must be updated +to reflect any new or fixed fails. + +| FAIL | TYPE | CAUSE | |----------|--------------|---------------------------------------------------------| | compile | Glass | Glass cannot compile input .p4 file | | bfas | Assembler | Assembler error while running input assembly file (.bfa)| | mismatch | Json output | Difference in json outputs for glass and assembler | + +### Context Json Ignore +Context Json output from Glass compiler is verbose and may or may not be +consumed entirely by the drivers unlike the assembler Json output. 
The +tests/runtests script ignores the keys placed in the tests/ctxt\_json\_ignore file +while creating json diff to only display relevant mismatches + +### Json Diff +Each test after running will have its own .out dir with following +items: +E.g. TEST = exact\_match0.p4 +exact\_match0.p4.out +##### Glass Json output +``` +├── cfg +│   ├── memories.all.parser.egress.cfg.json.gz +│   ├── memories.all.parser.ingress.cfg.json.gz +│   ├── memories.pipe.cfg.json.gz +│   ├── memories.top.cfg.json.gz +│   ├── regs.all.deparser.header_phase.cfg.json.gz +│   ├── regs.all.deparser.input_phase.cfg.json.gz +│   ├── regs.all.parse_merge.cfg.json.gz +│   ├── regs.all.parser.egress.cfg.json.gz +│   ├── regs.all.parser.ingress.cfg.json.gz +│   ├── regs.match_action_stage.00.cfg.json.gz +│   ├── regs.match_action_stage.01.cfg.json.gz +│   ├── regs.match_action_stage.02.cfg.json.gz +│   ├── regs.match_action_stage.03.cfg.json.gz +│   ├── regs.match_action_stage.04.cfg.json.gz +│   ├── regs.match_action_stage.05.cfg.json.gz +│   ├── regs.match_action_stage.06.cfg.json.gz +│   ├── regs.match_action_stage.07.cfg.json.gz +│   ├── regs.match_action_stage.08.cfg.json.gz +│   ├── regs.match_action_stage.09.cfg.json.gz +│   ├── regs.match_action_stage.0a.cfg.json.gz +│   ├── regs.match_action_stage.0b.cfg.json.gz +│   ├── regs.pipe.cfg.json.gz +│   └── regs.top.cfg.json.gz +├── context +│   ├── deparser.context.json +│   ├── mau.context.json +│   ├── parser.context.json +│   └── phv.context.json +``` +##### Assembler Output Directory +``` +├── exact_match0.out +``` +##### Assembler Json Output +``` +│   ├── memories.all.parser.egress.cfg.json.gz +│   ├── memories.all.parser.ingress.cfg.json.gz +│   ├── memories.pipe.cfg.json.gz +│   ├── memories.top.cfg.json.gz +│   ├── regs.all.deparser.header_phase.cfg.json.gz +│   ├── regs.all.deparser.input_phase.cfg.json.gz +│   ├── regs.all.parse_merge.cfg.json.gz +│   ├── regs.all.parser.egress.cfg.json.gz +│   ├── regs.all.parser.ingress.cfg.json.gz 
+│   ├── regs.match_action_stage.00.cfg.json.gz +│   ├── regs.match_action_stage.01.cfg.json.gz +│   ├── regs.match_action_stage.02.cfg.json.gz +│   ├── regs.match_action_stage.03.cfg.json.gz +│   ├── regs.match_action_stage.04.cfg.json.gz +│   ├── regs.match_action_stage.05.cfg.json.gz +│   ├── regs.match_action_stage.06.cfg.json.gz +│   ├── regs.match_action_stage.07.cfg.json.gz +│   ├── regs.match_action_stage.08.cfg.json.gz +│   ├── regs.match_action_stage.09.cfg.json.gz +│   ├── regs.match_action_stage.0a.cfg.json.gz +│   ├── regs.match_action_stage.0b.cfg.json.gz +│   ├── regs.pipe.cfg.json.gz +│   ├── regs.top.cfg.json.gz +``` +##### Context Json +``` +│   └── context.json +``` +##### Symlink to Glass Assembly File +``` +├── exact_match0.tfa -> out.tfa +``` +##### Glass Run Log +``` +├── glsc.log +``` +##### Json Diff File +``` +├── json_diff.txt +``` +##### Glass Output Logs +``` +├── logs +│   ├── asm.log +│   ├── mau.characterize.log +│   ├── mau.config.log +│   ├── mau.gateway.log +│   ├── mau.gw.log +│   ├── mau.log +│   ├── mau.power.log +│   ├── mau.resources.log +│   ├── mau.rf.log +│   ├── mau.sram.log +│   ├── mau.tcam.log +│   ├── mau.tp.log +│   ├── pa.characterize.log +│   ├── pa.liveness.log +│   ├── pa.log +│   ├── parde.calcfields.log +│   ├── parde.config.log +│   ├── parde.error.log +│   ├── parde.log +│   ├── pa.results.log +│   ├── parser.characterize.log +│   └── transform.log +├── name_lookup.c +``` +##### Glass output assembly file +``` +├── out.tfa +``` +##### Assembler Run Log +``` +├── bfas.config.log +├── bfas.log +``` +##### Test visualization htmls +``` +└── visualization + ├── deparser.html + ├── jquery.js + ├── mau.html + ├── parser.egress.html + ├── parser.ingress.html + ├── phv_allocation.html + └── table_placement.html +``` + +## Backends (Tofino/JBay) +Assembler currently supports Tofino backend but code is generic enough to be +ported to a different backend like JBay. 
Architecture specific constants must be
+parameterized and placed in the constants.h file
+
+"tofino" and "jbay" directories hold the chip schema to be used by the
+assembler. The chip schema contains register information and is a binary
+(python pickle file) generated from csv file in bfnregs repository.
+
+### Extracting information from hardware bfnregs info
+
+To the greatest extent possible, we automatically generate assembler support code
+directly from the information provided to us by the hardware team. The main 'source'
+we get from hardware are the Semafore .csr files and the associated .csv files generated
+by Semafore from the .csr files. We use walle (walle subdirectory) to read the .csv
+files and distill them into a chip.schema -- a python pickle file containing the
+datastructures defined in walle/csr.py that encapsulate the information and structure
+of all the hardware registers.
+
+We then use walle to generate C++ code embodying the register structure, defining C++
+classes containing the structure of all the registers. The template.yaml file defines
+various options for the structure of the resulting C++ code -- which registers to use
+as the 'roots' of class hierarchies, what files to write the code in, which methods to
+define in each class. Within the templates.yaml file, there's a `global:` section giving
+global options for all files, a `generate:` section listing the files to generate, and
+an `ignore:` section listing register subtrees to ignore (no code will be generated for
+them -- it's as if they don't exist).
+
+Options that can be used include:
+
+| option | description |
+| ----------- | ---------------- |
+| decl | generate just declarations (suitable for a header file) |
+| defn | generate definitions for those declarations. 
With neither `decl` nor `defn` will generate complete classes with inline methods |
+| checked\_array | Use the checked\_array class (`checked_array.h`) for arrays (default) |
+| delete\_copy | Delete copy constructors for generated classes |
+| dump\_unread | generate a `dump_unread` method which dumps all unread registers to an ostream (default False) |
+| emit\_binary | generate an `emit_binary` method that outputs binary code for the driver/model |
+| emit\_fieldname | generate `emit_fieldname` method used to print logging messages |
+| emit\_json | generate `emit_json` method to generate config json |
+| enable\_disable | generate `enable`, `disable`, and `modified` methods |
+| global | generate the specified register types once as global names rather than as nested in the containing object(s) |
+| include | generate a `#include` of the specified file |
+| name | Change the name of the top-level object |
+| namespace | Put all declarations in the specified namespace |
+| unpack\_json | generate `unpack_json` method |
+| widereg | Use `widereg` for registers wider than 64 bits |
+| write\_dma | Generate `'B'` block writes for the specified registers instead of `'R'` single register writes in `emit_binary` methods |
+
+This results in C++ code that can either generate .cfg.json files or binary files for use by
+the driver/model. When cfg.json files are produced, walle can be used to link them into a
+binary file. There are also options for generating C++ code to read .cfg.json files for
+future support of binary disassembly.
+
+### Config JSON
+The config json files (with .cfg.json extension) are generated by the
+assembler which are fed into walle to generate the binary
+(also called `tofino.bin`)
+
+The config json is nothing but json files with a map of all the registers for
+a backend. In order to limit the json file size assembler disables registers
+which are not set (with the -C or condense json flag). 
Some registers are also
+explicitly disabled or enabled based on what the driver expects to see in the
+tofino.bin. Below is the status of regs and whether they will appear in the
+config json.
+```
+---------------------------------
+Disabled - (unconditionally)
+---------------------------------
+mem_pipe.mau
+regs.input.icr.inp_cfg
+regs.input.icr.intr
+regs.header.hem.he_edf_cfg
+regs.header.him.hi_edf_cfg
+regs.glb_group
+regs.chan0_group.chnl_drop
+regs.chan0_group.chnl_metadata_fix
+regs.chan1_group.chnl_drop
+regs.chan1_group.chnl_metadata_fix
+regs.chan2_group.chnl_drop
+regs.chan2_group.chnl_metadata_fix
+regs.chan3_group.chnl_drop
+regs.chan3_group.chnl_metadata_fix
+---------------------------------
+Disabled - (if Zero)
+---------------------------------
+regs (In all regs)
+mem_top (mau)
+mem_pipe (mau/dummy_reg)
+reg_top (ethgpiobr, ethgpiotl, pipes)
+reg_pipe (mau, pmarb, deparser)
+---------------------------------
+Enabled - (always)
+---------------------------------
+regs.dp.imem.imem_subword8
+regs.dp.imem.imem_subword16
+regs.dp.imem.imem_subword32
+regs.rams.map_alu.row[row].adrmux.mapram_config[col]
+```
+Once JBay support is added for all regs, above will be different for both
+backends.
+
+Driver dictates which regs are disabled or enabled unconditionally. Other
+regs which are disabled if zero are to limit file size and driver should
+automatically fill in the zero values.
+
+#### Generating and using chip.schema
+
+chip.schema files are generated by walle from the csv files in the
+bfnregs repo. To generate a new chip.schema file, use
+
+ walle/walle.py --generate-schema ${BFNREGS_REPO}/modules/${CHIP}_regs
+
+where `${BFNREGS_REPO}` is the root of the bfnregs repo, and `${CHIP}`
+is the chip to target (`tofino`, `trestles`, or `jbay` at the moment).
+The newly created chip.schema file should then be moved into the jbay
+or tofino subdirectory where the build system expects to find it. 
+
+chip.schema is a binary (python pickle) file; you can use
+`walle.py --dump-schema` to dump it as (vaguely human readable)
+yaml. It is basically a DAG of python objects (csr.address\_map,
+csr.address\_map\_instance, and csr.reg) describing the register tree.
+The build uses walle to turn this into json files describing various
+subtrees of the dag. The `template_objects.yaml` file describes which
+subtrees to generate json files for as well as a list of subtrees to
+ignore (elide from the json files). Names in this file are the names of
+csr.address\_map objects (NOT instances), and where the generated files
+are nested, the containing json will contain a reference to the contained
+json rather than a copy of the tree. In this way, the generated json
+files as a group describe the DAG even though json can only describe
+trees, not DAGs.
+
+If, when running make, you get a KeyError from walle, that generally
+means that the template\_objects.yaml file contains a reference to
+some csr.address\_map that does not exist in the chip.schema file --
+the register tree has changed in a way that invalidates the json files
+it is trying to generate. If you have your python set up to drop into pydb
+automatically on an uncaught exception (highly recommended), at that point
+you can use `pp section` to list all the csr.address\_map objects that
+*are* in the chip.schema. Generally you'll find that it is the 'ignore'
+names that have changed, so fixing them is trivial.
+
+## Assembly Syntax
+The assembly syntax is documented in the `SYNTAX.md` file
diff --git a/backends/tofino/bf-asm/SYNTAX.yaml b/backends/tofino/bf-asm/SYNTAX.yaml
new file mode 100644
index 00000000000..e11a476f63a
--- /dev/null
+++ b/backends/tofino/bf-asm/SYNTAX.yaml
@@ -0,0 +1,1294 @@
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 + +# yaml format tags for tofino assembler +# +# symbols used below: +# ::= a string of 1 or more letters, digits, '_', '-', '.', +# '$', or '@' not starting with a digit. Cannot start or +# end with '-' or '.' or have two consecutive '-' or '.' +# ::= "ingress" | "egress" | "ghost" +# ghost is only availble in jbay mau and phv sections +# ghost only has 'ghost_md' and 'pipe_mask' sections used to +# configure 'tm_status_phv' register +# ::= that matches a predefined register name +# ::= (..) | () +# no spaces between parts of the +# ::= unsigned integer constant +# 0x/0b/0o prefix for hex/binary/octal +# ::= constant where one or more digits may be replaced +# by '*' to denote don't-care for ternary matches. +# ::= .. +# no spaces between parts of the +# ::= | | '[' | , ... ']' +# can be a single constant or range or multiple constants +# or ranges in a yaml list +# ::= | +# Must be a register name or a name defined in the phv section +# ::= +# Denotes a single bit +# ::= half | byte0 | byte1 +# denotes one of the parser match units +# ::= { , } +# ::= | | +# | hash_dist [ .. ] +# | rng [ .. ] +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +phv []: + # Defines PHV alias names for registers. is 'ingress', 'ghost' or + # 'egress' and is optional. If present, aliases are defined only for + # that thread. If not present, aliases are for all threads + : | + # Makes an alias for a register or piece of a register. 
+ # register B0 through B63 for byte registers, H0-H95 for half + # (16-bit) registers, or W0-W63 for word (32-bit) registers. + # May also use R0-R224 for all registers in W/B/H order. + # TW0-TW31, TB0-TB21, TH0-TH47, R256-R367 for tagalong + : + stage ..: | + stage : | + # makes name an alias that varies over stages. In the first form, the map + # applies just for those stages; in the second form it applies from that stage + # up until the next stage specified by another stage key. Stage numbers + # apply to iPHV (input), so uses for VLIW destinations use the mapping for the + # next stage. Parser uses stage 0 and deparser uses the end-of-pipe stage + context_json: + : + # any keys/vector/values here are converted to json and output into + # the context.json phv_allocation.records info for field + # verbatim +hdr: + # Information related to hdr_ids (header names and compressed + # header sequence and length encoding in bridge metadata) + map: + # Mapping of hdr_ids to header names + : + # constant is hdr_id, name can be used as substitution for hdr_id + # e.g. md32: 0, ethernet: 1, ipv4: 2, ... + seq: + # Compressed header sequence encoding in bridge metadata + : [ , ... ] + # constant is 8b sequence number; 255 is reserved for + # escape value to specify explicit list of all headers + # e.g. 0: ethernet, ipv4 + len: + # Compressed header length encoding in bridge metadata + : { base_len: , num_comp_bits: , scale: } + # base_len is 8b, num_comp_bits 3b, scale 2b + # e.g. ipv4: { base_len: 20, num_comp_bits: 4, scale: 2 } # (20B + N * 4B; N < 10) + # e.g. ipv6: { base_len: 40, num_comp_bits: 7, scale: 3 } # (40B + N * 8B) +parser [] [, ...]: + # Defines a parser. must be 'ingress' or 'egress' + # can have numerical values, such as range, int, vector of range and int. + # can also have symbolic values, such as "ALL", "CPU", "PCIE", "UNUSED" + # "CPU" and "PCIE" are symbolic names that the corresponding numeric values are device-specific. 
+ # "UNUSED" represents the parser program that is not configured to any physical + # parser during device initialization, but may be used by driver during runtime. + name: # parser name defined in the arch file, used by bfrt to find the parser + # config in context.json + start: + start: '[' , ... ']' + # define up to 4 distinct initial start states for the four channels + priority: | '[' , ... ']' + priority_threshold: | '[' , ... ']' + # define initial parser priority and threshold for the 4 channels + states: + # Parser states can be defined with or without the 'states' key but this + # is preferred as it avoids name collisions with other assembly + # directives, + []: + # Defines a parser state. The state 'start' is the implicit initial state + # if there is no explicit initial state defined by a separate 'start' entry. + # The state 'end' cannot exist (used for exit) + # The optional constant is the 8-bit value used to denote the state; + # overlapping state values will be flagged as an error + match: | { : } + # specifies up to 4 bytes to match against in the input buffer + # may also specify 'ctr_zero' and 'ctr_neg' to match those + # special bits, or the specific matchers 'byte0', 'byte1' or 'half' + # to match against values explicitly loaded by a 'save' in a previous + # state. May additionally specify specific matchers to use. + option: + # enable an optional feature for this state. Currently the only option + # is "ignore_max_depth" which means the state (and subsequent states) are + # ignore for calculating the max parser bytes. 
Used for min parser padding + # states + : + # actions to perform when the match matches this match constant + # this is a tcam priority match, so only the first match triggers + buf_req: + # number bytes that must be in the input buffer to not stall + counter: inc | dec | load | + # modification of the counter + src: # matcher to load counter from + max: # max value of the counter + rotate: # number of bits to circular right rotate + mask: # mask of rotated value + add: # immediate to add to masked value + checksum : + # modification of checksum unit + type: VERIFY | RESIDUAL | CLOT + mask: # vector of byte ranges of input buffer + swap: + dest: | clot # write destination + start: 0 | 1 + end: 0 | 1 + next: | + # next state -- match-constant takes don't care bits + # from current state + offset: [set] | inc + # modificate to the offset + priority: | @ [ >> ] [ & ] + # update the packet priority + load: { : , ... } | + # specifies one or more values from the input stream to be + # loaded into specific matchers or specific matchers to have + # their values preserved for use by later states + save: [ , ... ] + # specifies on more more values from the matchers to be + # saved into the scratch pad registers + shift: + # number of bytes to shift out + intr_md: + # number of bits of intrinsic metadata being shifted out + [rotate] : [offset] + [rotate] : [offset] + # write the specifed byte (or range) to named phv slot + [offset] : [rotate] + # write the specified constant to the phv location + clot : + # output a CLOT (jbay only) + start: + length: | + # expression is generally '@' [ '>>' ] [ '&' ] + # with variations (unary is highest precedence, followed by shift, + # mask(&), +/- lowest) + max_length: + stack_depth: + stack_inc: + hdr_len_inc_stop: + # stop the header length counter, and use value as final increment amount (jbay only) + disable_partial_hdr_err: + # specifies whether the partial header error is enabled or not for + # the current state. 
Specific to: + # - JBay + partial_hdr_err_proc: + # specifies whether or not the best effort extraction (a.k.a. greedy extract) + # is performed when insufficient data is available in the input buffer. + value_set : + handle: # pvs handle + field_mapping: + (range) : (range) + # actions to perform for a value set of the given and integer + default: + # actions to perform regardless of the match + # if there is no 'default' tag in a state, anything that is not + # recognized as a valid state tag is treated as part of an implicit + # default + hdr_len_adj: + # value for the hdr_len_adj register + init_zero: [ ,... ] + # list of phv slots that should be initialized to (valid) zero + meta_opt: + # value for the meta_opt register + multi_write: [ ,... ] + # list of phv slots that the parser may write multiple times + # values OR'd with previous values + parser_error: + # define a phv location to receive parser error codes + bubble: + # configure rate limit registers for pipe + inc: + dec: + max: + interval: # tofino2+ only + ghost_md: + # container(s) allocated to store ghost intrinsic metadata + # tofino2+ only (set tm_status_phv reg) + pipe_mask: + # pipe_mask to be set for ghost packets + # (tm_status_phv.pipe_mask in tofino2) + parse_depth_checks_disabled: true | false + # Parse depth checks disabled + states: + : + # specifies match values for states (64b) + # e.g. parse_ipv4: 0x********_******02 + # e.g. 
parse_tcp: 0x********_******04 + port_metadata: + # specifies port metadata for each logical port number + : + # constant is logical port number, vector is port metadata (14B) + profile : + # specifies a parser profile, constant represents the TCAM&SRAM index + match_port: + match_inband_metadata: + # if the logical port number (2'b0 ++ 6b) or inband metadata (8B) match is omitted, + # it is treated as * + initial_pktlen: + # specifies the value to adjust the length for AdjustedPacketLength (6b) + initial_seglen: + # specifies the value to adjust the length for AdjustedSegLength (6b) + initial_state: | + # specifies initial state (80b) + # if a state name is used, all-match bits are initialized to 0 + # and upper-most two bytes are set to 0 + initial_flags: + # specifies initial flags (64b) + initial_ptr: + # specifies initial pointer (8b) + initial_w0_offset: + # specifies initial W0 offset (8b) + initial_w1_offset: + # specifies initial W1 offset (8b) + initial_w2_offset: + # specifies initial W2 offset (8b) + initial_alu0_instruction: + # specifies initial instruction for ALU0 (15b) + initial_alu1_instruction: + # specifies initial instruction for ALU1 (19b) + metadata_select: '[' , ... ']' + # specifies source of each of 32B of MD32 metadata + analyzer_stage []: + # constant is stage number + # if name is present, it is looked up in state_map and all rules match the state + # only one of stage name and rule state match can be present + # e.g. stage 0 parse_ipv4 + rule : + # constant is explicit rule index; it also specifies the rule priority + # when more rules match (higher value is higher priority) + # each rule supports up to 4 instructions for modifying flags, one for up to 16 bits + # (modify_flags16), one for up to 4 bits (modify_flags4), and two for 1 bit + # (modify_flag0 and modify_flag1). All of these can be set simultaneously. 
If the + # affected flags overlap, the instructions take effect in the following order of + # precedence: + # modify_flags16, modify_flag4, modify_flag0, modify_flag1. + match_state: | + # if state match constant is ommited, it is derived from the state name at stage + # if it is missing, it is treated as * + match_w0: + match_w1: + # if w0 or w1 match constant is ommited, it is treated as * + # e.g. rule 0 w0 17 # state is matched based on `stage 0 parse_ipv4' above + # e.g. rule 0 w0 17 state 0x************02 # 17 is for TCP; 02 is for parse_ipv4 (see state_map) + next_state: | + # specifies state for the next stage (80b) + # e.g. next_state: 0x**************04 + # e.g. next_state: parse_tcp + next_skip_extractions: + # skips Wx extractions in the next stage (bool) + next_w0_offset: + # specifies W0 offset for the next stage (8b) + # e.g. next_w0_offset: 2 # TCP destination port + next_w1_offset: + # specifies W1 offset for the next stage (8b) + next_w2_offset: + # specifies W2 offset for the next stage (8b) + next_alu0_instruction: + # specifies instruction for ALU0 for the next stage (15b) + next_alu1_instruction: + # specifies instruction for ALU1 for the next stage (19b) + push_hdr_id: { hdr: , offset: } + # specifies header ID (8b) or name, and offset (8b) to be pushed to ana_hdr_ptrs + # if name is specified, it is looked up in hdr -> map + # 0xff for header ID is reserved for invalid + # offset is relative to pointer + # e.g. push_hdr_id: ipv4 0 + modify_flags16: + # specifies src (2b), imm (16b), mask (16b), and shift (6b) to set/clear + # multiple flags at once. 
+ src: + imm: + # whether to set or clear the corresponding flags, only used if src == 3 + mask: + # modify a flag if the corresponding mask bit is set + shift: + # index into flags at which to start the operation + modify_flags4: + # same as modify_flag16, but with 4b imm and mask + src: + imm: + mask: + shift: + modify_flag0: { set: | clear: }, + # set or clear the flag at the index given by the 6-bit + modify_flag1: { set: | clear: }, + # same as modify_flag0 + modify_checksum: { idx: , enabled: }, + # changes the enabled state of the checksum unit at the 1-bit index idx + phv_builder_group : + # specifies extract groups + pov_select: + # specifies which POV bytes are used to address the TCAM & SRAM + extract : + # specifies extracts + match: + # match constant is 4B + source: + # specifies a PHE source pair + initial_predication_vector: + # specifies the initial predication information + pov_select: + # specifies which POV bytes are used to address the TCAM + next_tbl_config: + # a mapping for the IPV TCAM, from match constants to next table identifiers + : + # match constant is 4B, constant is 1B + ghost_initial_predication_vector: + # specifies the initial predication information for the ghost thread + pov_select: + # specifies which POV bytes are used to address the TCAM + next_tbl_config: + # a mapping for the IPV TCAM, from match constants to next table identifiers + : + # match constant is 4B, constant is 1B + checksum_checkers: + mask : + # There can be up to 4 masks. + # Each mask is specified as an up to 224b wide constant. + # A mask specifies which bytes of the header are used for the checksum computation. + # 1 -> used, 0 -> not used + unit : + # Each unit is able to verify 1 checksum. + # The checksum is computed according to the selected 'config' + # which specifies a header ('hdr') and a mask ('mask_sel'). + # A config is selected using the 'match_pov' key. + # There are 2 units. 
+ # Both units operate independently and allow verification of overlapping bytes. + pov_select: + # Specifies which POV bytes are used to address the TCAM + config : + # There are up to 16 checksum configurations for each unit. + match_pov: + # 32b match key + mask_sel: + # Selects one of the 4 masks (2b). + # Both csum units can select the same mask. + hdr: + # Specifies which header is used for the checksum computation. + # The concrete header bytes which are used for the checksum + # are specified using a mask. + # One of the 4 masks is selected using the 'mask_sel' field. + pov_flags_pos: + # specifies start position of POV flags in bridge metadata (6b) + pov_state_pos: + # specifies start position of POV state in bridge metadata (6b) +stage : + # Defines a single stage of the MAU. The order of the tables within the + # stage is the logical table ordering, so order matters + []: + # common keys available in (almost) all tables types + row: + # one or more ram rows the table uses + # whether these are physical or logical rows depends on the table type + column: | '[' , ... ']' + # May be a single vector or a list of vectors. If a list, length + # must match the number of rows specified + # Denotes the rams used on each row. RAM type (sram, tcam or mapram) + # depends on the table type + stage : | '[' , ... ']' + - stage : + stages: | '[' , ... ']' + bus: 0 | 1 | '[' 0 | 1, ... ']' + # bus(es) to use. If a list, must match the number of rows + lhbus: 0 | 1 | + rhbus: 0 | 1 | + word: + # for wide tables, specify which word of the wide word is in each row of the table + vpns: + # vpn values to use for rams + dyanamic_config: + : + : { : } + # defines the match for one specific dconfig bit. Match may be a single + # match for the entire width or matches against specific + # named slices of those PHVs. Other bits are implicitly don't-care + input_xbar [ | ]: + # Input xbar config for this table + : | '[' , ... ']' | + { : | ..: ,... } + [] group : ... 
# tofino 1/2 + # One or more registers to be mapped into the specified ixbar group + # in order or at the locations specified. Locations are bit offsets in + # the group (even for groups that are not bit-addressable + hash [table] : + # hash table config + : + # specify one column of the table -- hash is a 64-bit constant + | : + # specify one or more columns according to expression. Phv refs + # must be in the corresponding input group of this input_xbar + + # identity copy of phv (must match width of range) + random(, ...) + # random hash of the given phv locations. We generate + # with random(3) and we do NOT call srand, so the hash + # for a given program is repeatable. + crc(, [,] , ...) + # Deprecated. + # crc hash of the given phv locations -- first arg is integer + # constant denoting polynomial (Koopman notation). Second arg + # is an initial constant prefixed to the input. + crc(, [,] '{' : , ... '}' [, '{' : '}' ]) + # crc hash of the given phv locations, at the specified offsets + # from the lsb of the crc input as a whole. + # arguments are polynomial (Koopman Notation) + # init shift register + # final xor + # total number of bits in the crc + # -- + # hash calculation could have a list of constants as inputs. + # : represents the constant value in hash calculation, + # the key encodes the offset and length of the constant + # the value encodes the value of the constant + crc_rev(...) + # bit-reversed (little endian instead of the defaul big endian) crc + xor(, '{' : , ... '}') + # XOR of a data block (message) + # + # Fields specified in the second parameter are joined into one + # big bit stream, cut into blocks of width specified in the first + # parameter and the blocks are bitwise XORed together. If the + # field list is not continuous (there are gaps in bit offsets), + # zeros are padded in. + # + # Constants are not supported by this directive - they are computed + # into the seed value by the backend. + ^ + # xor of other expressions. 
+ & + & + # mask expression, including just some bits in the result + stripe(, ...) + # stripe other expressions across the width required, + # repeating as necessary + sextend() + sign_extend() + # sign extend expression (replicating the sign bit to the needed width + : parity + # Keyword parity indicates this bit is reserved for parity + # calculation + valid : + # specifies the 16-bit valid hash for one column of the table + hash group : + # hash group config + table: + # one or more hash tables to xor together for this group + seed: + # 52-bit hash seed value + seed_parity: + # optional parameter to indicate if seed must be parity + # encoded, must be true when hash parity is enabled on the + # group. + + # a single table to use for the group + : '[]' + - + # use an xbar group configured elsewhere + random_seed: + # random seed from pragma for the table + exact unit: + output unit: + gateway: + # gateway table on this table -- see below + format: { : , : ... } + # format of data in the table, mapping names to ranges of bits. + # fields with sizes instead of explicit ranges will be laid out + # by the assembler following preceeding fields + hash_dist: { : , ... } + : # hash distribution unit to config + hash: + mask: + shift: + expand: + # hash distribution config params + output: | '[' ,... ']' + # outputs to enable for this hast_dist unit + # 'lo' | 'hi' | 'meter' | 'stats' | 'action' | 'hashmod' + instruction: (, ) + # specifies where to get the action index and pfe for the instruction + # to run in a given table + action: ( [, ]) + # Action table to use -- action is a named field from the format + # that determines which action to do. Index is optional (for + # indirect action), named field from format. If not present use + # direct action (index is match address). 
+ action_enable: + enable_action_data_enable: true | false + enable_action_instruction_enable: true | false + default_action: + default_action_handle: + # Specifies a unique integer for action handle, used to match glass + # If not present assembler generate handles + default_action_parameters: { : , ... } + # Specifies list of params and values + action_bus: { : | .. : , ... } + # immediate actions data + # meter output data + actions: + # defines actions that can be used in the table + []: + # the optional index is the index to use in the 8-entry + # instruction indirection map of the table. + [-
] # constant imem address to use for this action + [- ] # map of aliases for data operands + : [ () ] + # defines a name as an alias for (a slice of) something else + [- p4_param_order: '{' param_name : , ... '}' ] + # Param order specifying param name and width for context json (p4_parameters) + [- hit_allowed: '{' allowed: true|false, reason: '}' ] + [- default_action: '{' allowed: true|false, reason: '}' ] + # the next table to be run when the entry hits with this action, could be + # an index into the hit_next + [- next_table: | ] + # the next table to run when the entry misses with this action + [- next_table_miss: ] + [- context_json: ... ] + # any keys/vector/values here are converted to json and output + # into the context.json info for this action verbatim + - + selector: ( [ , [ , ] ] ) + # selection table to use + stats: [ () ] + # statistics table to use + meter: [ () ] + # meter table to use + stateful: [ ( [, ]) ] + stateful: [ (, counter [ hit | miss | gateway ]) ] + # stateful table to use + idletime: + # idletime table + row: + column: | '[' , ... ']' + bus: + precision: 1 | 2 | 3 | 6 + sweep_interval: + notification: enable | disable | two_way + per_flow_enable: true | false + table_counter: disable | table_miss | table_hit | gateway_miss | + gateway_hit | gateway_inhibit + # event type to count in per-table event counter + hit: | '[' , ... ']' + # next table on table hit. If a list, 'format' must contain a + # 'next' field that determines which next table to use + miss: + # next table on table miss + next: + # default (unconditional) next table. Exclusive with hit/miss + p4: # information about P4 level tables and control plane API + name: + # P4 table name + handle: + # runtime API handle for the table + size: + # table size specified in P4 -- may be smaller than the actual + # table size, as table is rounded up to fill memories + match_type: exact | ternary | lpm | ... 
+ action_profile: + how_referenced: direct | indirect + p4_param_order: + # order of match params as seen in p4 program + # PD generated has same order and needs to match context + # json output + : + # param names with their types and size info + type: + size: + ... + context_json: + # any keys/vector/values here are converted to json and + # output into the context.json info for this match param + # verbatim + context_json: + # any keys/vector/values here are converted to json and output into + # the context.json info for this table verbatim + static_entries: + # List of static entries as described in the p4 program. These are + # passed on directly to the driver through context json. The + # match_key_fields_values and action_parameters_values follow the + # same order as the p4_param_order list in the table and action + # sections. + # Match Key Fields Values based on match type: + # Exact - field_name, value + # Ternary - field_name, value, mask + # Range - field_name, range_start, range_end + # Lpm - field_name, value, prefix_length // TODO + - priority: + match_key_fields_values: + - field_name: + value: + mask: # Only for ternary match + range_start: # Only for range match + range_end: # Only for range match + prefix_length: # Only for lpm match // TODO + action_handle: + is_default_entry: + action_parameters_values: + - parameter_name: + value: + exact_match []: + # Exact match table + row: + column: | '[' , ... ']' + # physical rows and srams used by the table + stash: # Stash Allocation for exact match tables only + row: + col: + # Row and col are indexed in sync to give RAM used to determine word + # in entry + unit: + # Unit value can be (0,1) as there are 2 units per row and is indexed + # in sync with row/col values to give stash unit + input_xbar: + # specifies exact match groups, hash tables, and groups (see above) + # If there are multiple groups, the must match the total width of the + # format, which must in turn match the rows and the ways. 
+ format: { : , : ... } + # names may have `()` suffix denoting up to 5 match groups + # all match groups must contain the same keys + # some names have predefined meanings: + match: ... # exact match groups to match against + action: ... # field that selects which action to run + next: ... # next table + match: | '[' , ... ']' + # value(s) to match against the 'match' field(s) in the format + ways: + - '{' < way description '}' + # description of one way of the table + xme: + # 0 - 7 (lambs) , 8 - 15 (stms) + group: + # hash group or XME used for this way + index: | + # hash bits used to index the way rams/lambs (including subword bits) + select: [ '&' ] + # hash bits used to select enable rams/lambs in the way + rams: '[' , ... ']' + # rams or lambs in the way. Each is a vector of 1, 2, or 3 integers + '[' , ']' # tofino1/2 + - '[' ,,, '[' , ']',... ']' + # DEPRECATED description of one way of the table + # initial 3 values are hash group, 10-bit slice from group, and + # mask of upper 12 bits from the group. + match_group_map: '[' '[' ,... ']',... ']' + # map from per-word match groups to overall match groups + # one row for each word in the width of the table with up to + # 5 values for up to 5 match groups in that word. Values are + # match groups in the format + # common keys described above + action: + actions: + action_bus: + default_action: + default_action_handle: + default_action_parameters: + context_json: + gateway: + hash_dist: + hit: + idletime: + meter: + miss: + next: + p4: + selector: + stateful: + stats: + ternary_match []: + # Ternary match table + row: + column: | '[' , ... ']' + # tcam rows and columns to use + input_xbar: + # specifies ternary match groups + group : | '[' , ... ']' + # odd groups are 5 bytes wide, even groups 6 -- the extra byte + # is the byte group n/2 + # TBD -- Need a way to explicitly set byte swizzler? 
+ match: + # Input xbar group(s) to match against -- may be a vector of maps for wide + # matches using multiple groups + group: + # Match group to match against (placed on tcam bus) + byte_group: + # byte group to use for top 4 bits of tcam bus + byte_config: + # value for tcams.vh_data_xbar.tcam_vh_xbar.tcam_row_halfbyte_mux_ctl + # .tcam_row_halfbyte_mux_ctl_select + dirtcam: + # dirtcam control bits for the group; used to set + # tcams.col.tcam_mode.tcam_data_dirtcam_mode (bits 0..9) + # and tcams.col.tcam_mode.tcam_vbit_dirtcam_mode (bits 10..11) + indirect: + # ternary indirection table to use with this table + # if there's an indirection table, it should contain all the table refs + indirect: | + indirect_bus: + # which indirect bus to use for ternary tables with no indirection table + # common keys described above + action: + actions: + action_bus: + default_action: + default_action_handle: + default_action_parameters: + context_json: + gateway: + hash_dist: + hit: + idletime: + meter: + miss: + next: + p4: + selector: + stateful: + stats: + ternary_indirect : + # Ternary indirection table + row: + column: | '[' , ... ']' + # physical rows and srams to use + bus: 0 | 1 | '[' 0 | 1, ... ']' + # ternary indirection bus to use. List must match rows + format: { : , ... 
} + # fields in the ram record, sized in bits + # common keys described above + action: + actions: + action_bus: + default_action: + default_action_handle: + default_action_parameters: + context_json: + gateway: + hash_dist: + hit: + idletime: + meter: + miss: + next: + p4: + selector: + stateful: + stats: + hash_action []: + # hash-action table + row: + bus: + # specify which physical row and exact match bus to use + input_xbar: + # input xbar config (as exact match table) + # common keys described above + action: + actions: + action_bus: + default_action: + default_action_handle: + default_action_parameters: + context_json: + gateway: + hash_dist: + hit: + idletime: + meter: + miss: + next: + p4: + selector: + stateful: + stats: + phase0_match + # special phase 0 match table before stage 0 (only in stage 0 ingress) + p4: # information about P4 level tables and control plane API + width: + # other common keys are NOT available in this table type + proxy_hash []: + # Proxy hash Table + row: + column: | '[' , ... ']' + # see exact_match + input_xbar: + # see exact_match + format: { : , : ... } + # see exact_match + match: ... # exact match groups to match against + action: ... # field that selects which action to run + next: ... # next table + match: hash_group(..) | '[' hash_group(..), ... ']' + # hash groups + ways: + - '[' ,,, '[' , ']',... ']' + # see exact_match + proxy_hash_group: + # hash group of the 8 possible hash groups to use + proxy_hash_algorithm: + # for the context JSON, proxy_hash_algorithm key + # common keys described above + action: + actions: + action_bus: + default_action: + default_action_handle: + default_action_parameters: + context_json: + gateway: + hash_dist: + hit: + idletime: + meter: + miss: + next: + p4: + selector: + stateful: + stats: + action : + # Action table + logical_row: + column: | '[' , ... ']' + # srams to use -- in logical (16x6) coords, not physical (8x12) + home_row: | '[' , ... 
']' + # row(s) to use as home rows for the table + format []: { : , ... } + # fields in the ram record. Different actions may have + # different formats (and different sizes)... + action_bus: { : | .. : , ... } + # mapping from action bus bytes to values in the table. Names + # must be present in the 'format' for the table. + # Can be optional -- if not present, assembler will attempt to + # lay out fields in the action bus based on usage in actions. + actions: + # defines actions that can be used in the table + []: + # the optional index is the index to use in the 8-entry + # instruction indirection map of the table. + [-
] # constant imem address to use for this action + [- ] # map of aliases for data operands + : [ () ] + # defines a name as an alias for (a slice of) something else + [- p4_param_order: '{' param_name : , ... '}' ] + # Param order specifying param name and width for context json (p4_parameters) + [- p4_param_order: + param_name: + width: + context_json: #anything + ... ] + # Alternative syntax for specifying param order when attaching context_json + [- default_action: '{' allowed: true|false, reason: '}' ] + - + p4: # information about P4 level tables and control plane API + # same as exact_match p4 info + context_json: #anything + gateway []: + # 'bare' Gateway table -- no corresponding match table, so must + always specify next table + name: + # Only output when gateway associated with a match table i.e. not + # 'bare' + row: + # physical match row to use + bus: 0 | 1 + # match bus to use + payload_row: + payload_bus: + # row/bus to use for payload -- can only be specified on a + # standalone gateway, as an attached gateway uses the row(s) + # specified by the table it is attached to + input_xbar: + # as for exact_match, but can only specify one group + match: | '[' , ... ']' + # value(s) to match against the match constants + xor: | '[' , ... ']' + # value(s) to xor against the match value + range: 2 | 4 + # do 2 or 4 bit range matches in the upper 12 bits of the gateway + : + # match row for gateway. Value may be (for next table) + # or "run_table" or a map with some or all of these keys. + next: + # next table for this match + run_table: | + # disable the gateway (run the logical match normally) + # not applicable to bare gateways + action: + # run the specified action when the line hits + ? [ , ..., ] : + # Range match row for gateway. Each value except the last is a + # 2**n bit lookup table for a range match unit (so 4 bit values + # for range:2 and 16 bit values for range:4). 
The last value is + # the normal tcam match for the bottom 32 bits of the gateway + Same value options as normal match rows. Big-endian order + for units (last int is bottom 2 or 4 bits of upper 12 bits) + miss: + # behavior if no row matches (same options as match row above) + condition: + # condition output used for model logging + expression : + # condition string as specified in p4 + true : + # next table name when condition is true + false : + # next table name when condition is false + payload: + # payload data to use if gateway is not disabled (run_table is false) + match_address: + # gateway match address to use if the gateway is not disabled + context_json: #anything + selection []: + row: + logical_bus: + # must match the number of rows specified. Indicate the logical bus + # used for each rows. Value can be: 'A' => Action Bus, 'S' => Synth + # Bus, 'O' => Overflow Bus, 'X' => Undefined. + column: | '[' , ... ']' + # srams to use -- in logical (16x6) coords, not physical (8x12) + maprams: | '[' , ... ']' + # map rams to use + home_row: + # represent the row ultimately connected to the ALU + input_xbar: + # hash match groups on input xbar + mode: resilient | fair + non_linear: true | false + per_flow_enable: true | false + pool_sizes: + selection_hash: + hash_dist: + # see hash_action hash_dist + p4: # information about P4 level tables and control plane API + # same as exact_match p4 info + context_json: #anything + counter []: + row: + logical_bus: + # must match the number of rows specified. Indicate the logical bus + # used for each rows. Value can be: 'A' => Action Bus, 'S' => Synth + # Bus, 'O' => Overflow Bus, 'X' => Undefined. + column: | '[' , ... ']' + # srams to use -- in logical (16x6) coords, not physical (8x12) + maprams: | '[' , ... ']' + # map rams to use + vpns: + home_row: + # represent the row ultimately connected to the ALU + format: + count: bytes | packets | both | packets_and_bytes + lrt: '{' : , ... 
'}' + - '{' threshold: , interval: '}' ... + # largest recent with threshold params + global_binding: true | false + per_flow_enable: true | false + bytecount_adjust: + # add value to counted bytes + meter []: + row: + logical_bus: + # must match the number of rows specified. Indicate the logical bus + # used for each rows. Value can be: 'A' => Action Bus, 'S' => Synth + # Bus, 'O' => Overflow Bus, 'X' => Undefined. + column: | '[' , ... ']' + # srams to use -- in logical (16x6) coords, not physical (8x12) + maprams: | '[' , ... ']' + # map rams to use + vpns: + home_row: + # represent the row ultimately connected to the ALU + input_xbar: + # hash match groups on input xbar + color_aware: true | false | per_flow + color_maprams: + row: + # logical rows + column: | '[' , ... ']' + bus: + vpns: + hash_dist: + # see hash_action hash_dist + type: standard | lpf | red + count: bytes | packets + bytecount_adjust: + # add value to counted bytes + sweep_interval: + global_binding: true | false + per_flow_enable: true | false + context_json: #anything + stateful []: + row: + logical_bus: + # must match the number of rows specified. Indicate the logical bus + # used for each rows. Value can be: 'A' => Action Bus, 'S' => Synth + # Bus, 'O' => Overflow Bus, 'X' => Undefined. + column: | '[' , ... ']' + # srams to use -- in logical (16x6) coords, not physical (8x12) + maprams: | '[' , ... 
']' + # map rams to use + vpns: + home_row: + # represent the row ultimately connected to the ALU + hash_dist: + # see hash_action hash_dist + input_xbar: + # exact match group and hash to use for phv input + data_bytemask: + hash_bytemask: + # masks specifying which byte of the phv input come from data and hash + initial_value: { lo : , hi : } + # Specify initial value for register, assumed 0 otherwise + const_table: | '{' : '}' + math_table: + data: | '{' : '}' + invert: true | false + shift: + scale: + log_vpn: | + # vpns to use in stateful logging mode + pred_shift: + pred_comb_shift: + # set the salu_output_pred_shift and _comb_shift csr regs explicitly + # FIXME -- should have a better way of doing this? + actions: + : + - + # SALU instructions to run for this table + context_json: #anything + # jbay additional features: + sbus: + # jbay only -- shared bus use + learn: | '['
, ... ']' + match:
| '['
, ... ']' + combine: "and" | "or" + fifo: { push: , pop: } + stack: { push: , pop: } + bloom filter clear: + # fifo or stack or bloom filter fast clear mode (mutually exclusive) + # is hit | miss | gateway | active | control_plane + # controls when the stack/fifo is pushed or popped + watermark: push | pop + # watermark interrupts sent every pushes or pops + offset_vpn: true | false + # adjust immediate data by vpn offset to compute vpns for multistage + # fifo/stack (jbay only) + address_shift: + # shift up the incoming meter address before vpn/index/subword extract (jbay only) + stage_alu_id: + # stage + alu id to be prepended to output addresses + dependency: concurrent | action | match + # set the interstage dependency between this stage and the + # previous stage. Ignored in stage 0 + error_mode: no_config | propagate | map_to_immediate | disable + always_run_action: + # action that runs automatically in the stage independent of tables + - + # configuration setting for mpr_stage_id + mpr_stage_id: + # configuration setting for mpr_bus_dep_glob_exec + # A bit that is 0 means treat that global execute bit as pass-through (action dependent), + # because the next stage is action dependent, while a 1 means update it in the current stage. + mpr_bus_dep_glob_exec: + # configuration setting for mpr_bus_dep_long_brch + # A bit that is 0 means treat that long branch tag ID bit as pass-through (action dependent), + # because the next stage is action dependent, while a 1 means update it in the current stage. + mpr_bus_dep_long_brch: + # configuration setting for mpr_always_run + mpr_always_run: + # Note that unspecified values are assumed to be 0. + mpr_next_table_lut: + : # Resolved incoming logical ID to activation bit map + mpr_glob_exec_lut: + : # Resolved incoming global execute bit to activation bit map + mpr_long_brch_lut: + : # Resolved incoming long branch tag ID to activation bit map +deparser : + # Defines a deparser. 
must be 'ingress' or 'egress' + dictionary: + # ordered list of phv locations to write out as the output deparser + - : + # single value to write iff the referred bit is set + - full_checksum : + # checksum result to write iff the referred bit is set + - : + # constant to write iff the referred bit is set (jbay only) + - clot : + # clot to output (jbay only) + pov: + length: + # maximum length of the clot + : | checksum + # offset in clot to replace with a PHV or checksum value + pov: | '[' , ... ']' + # optional explicit use/ordering of phvs for POV. All phvs used for POV bits + # in the dictionary will be added to the end of this, if not already present + partial_checksum : + : { swap: [, pov: ] } + # checksum unit programming -- pov bits for jbay only + full_checksum : + partial_checksum | clot : { pov: , invert: } + : [ ':' + # more generally, any deparser param that comes from the phv is + # specified this way. Only jbay has pov bits here + # are as follows + select: [ ':' ] + # controls which digest group is output + shift: + : | '[' , ... ']' + # values for a single digest group; specifies the sequence of + # phv containers in the appropriate table entry. this is usually + # data that is included in the digest, but it may also contain + # control metadata; for example, when configuring mirroring on + # Tofino, the first phv container specifies the mirror session id. + context_json: # anything + # ingress or egress params: + mirror: + egress_unicast_port: [ ':' ] + # specifies the port to write to + # FIXME: should this be squashed into the port? 
+ egress_unicast_pipe: [ ':' ] + # specifies the port to write to + drop_ctl: [ ':' ] + # jbay only, ingress or egress + afc: '{' location> ':' '}' + mirr_epipe_port: '{' location> ':' '}' + mirr_c2c_ctrl: '{' location> ':' '}' + mirr_coal_smpl_len: '{' location> ':' '}' + mirr_dond_ctrl: '{' location> ':' '}' + mirr_hash: '{' location> ':' '}' + mirr_icos: '{' location> ':' '}' + mirr_io_sel: '{' location> ':' '}' + mirr_mc_ctrl: '{' location> ':' '}' + mirr_qid: '{' location> ':' '}' + mtu_trunc_err_f: '{' location> ':' '}' + mtu_trunc_len: '{' location> ':' '}' + # ingress only deparser params: + learning: [ ':' ] + resubmit: [ ':' ] + copy_to_cpu: [ ':' ] + egress_multicast_group_: [ ':' ] + hash_lag_ecmp_mcast_: [ ':' ] + copy_to_cpu_cos: [ ':' ] + ingress_port_source: [ ':' ] + deflect_on_drop: [ ':' ] + meter_color: [ ':' ] + icos: [ ':' ] + qid: [ ':' ] + xid: [ ':' ] + yid: [ ':' ] + rid: [ ':' ] + warp: [ ':' ] + ct_disable: [ ':' ] + ct_mcast: [ ':' ] + # jbay ingress only + bypass_egr: '{' location> ':' '}' + # egress only deparser params: [ ':' ] + force_tx_err: [ ':' ] + tx_pkt_has_offsets: [ ':' ] + capture_tx_ts: [ ':' ] + coal: [ ':' ] + ecos: [ ':' ] + copy_to_cpu_cos: [ ':' ] # or c2c_cos + copy_to_cpu_qid: [ ':' ] # or c2c_qid + mirr_bitmap: [ ':' ] + valid_vec: [ ':' ] + # Ingress pipe -> TM fields + # - tableid (1b) -- ??? + # - mcid1 - Multicast Group ID 1 + # - mcid2 - Multicast Group ID 2 + # - hash1 - Hash for L1 (is this the same as hash_lag_ecmp_mcast_?) + # - hash2 - Hash for L2 (is this the same as hash_lag_ecmp_mcast_?) + packet_body_offset: + # TODO: Needed? Maybe just use a fixed header type for PBO? + # Packet body offset + # Payload body offset is: + # base_offset (unsigned) + const_offset (signed) + var_offset (unsigned). 
+ hdr: | + # Header name or ID to use for base offset location + offset: + # Constant (signed) offset to add to the base offset + # Default: 0 + var_off_pos: + # Variable offset: start bit position in POV + # Default: 0 + var_off_len: + # Variable offset: length in POV + # Default: 0 + zero: , + # list of phv slots that should be initialized to (valid) zero + remaining_bridge_metadata: + # packing of remaining bridge metadata + pov_select: + # POV bytes used to address the TCAM & SRAM + config : + # TCAM & SRAM configuration + match: + # POV match (4B) + start: + # start position of the remaining bridge metadata in bridge metadata (6b) + # register: rem_brm_ext_ram.rem_brm_ext[*].rem_brm_start + bytes: '[' | , ... ']' | '{' ':' | , ... '}' + # source PHEs of the remaining bridge metadata + # if a list is used, the items are implicitly addressed (from 0 up) bytes of remaining bridge metadata + # if a map is used, the items are explicitly addressed bytes of remaining bridge metadata + # up to 62 items depending on the remaining bridge metadata start position and the number of POV bytes (8B flags + 8B state) + # if an 8b constant is used as the PHE byte source, its value is directly written to the configuration registers + # if a PHE (slice) name is used as the PHE byte source, it is first mapped to the PHE byte number + # registers: rem_brm_ext_ram.rem_brm_ext[*].b*_phv_sel +flexible_headers: + # Lists the headers that were re-packed by the compiler because + # they were marked flexible. See context.json schema 'flexible_headers' node + # for more information. This section is optional. It exists only if there are + # flexible headers defined in the program (e.g., bridged metadata) + # It consists of the json snippet that is part of context.json verbatim. +primitives: + # Defines the name of the json file that has information on primitives used + # within table actions. These are placed in the respective actions as + # primitives node. 
This node is mainly used by model for logging + # instructions as specified in original p4 program +dynhash: + # Defines the name of the json file that has the dynamic hash calculation + # node. This node is directly merged into the context json at the top level +# version 1.0.0 +version: + # semantic versioning number +# version 1.0.1 +version: + version: + # semantic versioning number + run_id: + # defines an id that ties together all the files produced by the compiler + # part of the Version section + target: + # specify the target architecture diff --git a/backends/tofino/bf-asm/action_bus.cpp b/backends/tofino/bf-asm/action_bus.cpp new file mode 100644 index 00000000000..6019286c81a --- /dev/null +++ b/backends/tofino/bf-asm/action_bus.cpp @@ -0,0 +1,1204 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "action_bus.h" + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "lib/hex.h" +#include "misc.h" + +static MeterBus_t MeterBus; + +std::ostream &operator<<(std::ostream &out, const ActionBusSource &src) { + const char *sep = ""; + switch (src.type) { + case ActionBusSource::None: + out << "None"; + break; + case ActionBusSource::Field: + out << "Field("; + for (auto &range : src.field->bits) { + out << sep << range.lo << ".." 
<< range.hi; + sep = ", "; + } + out << ")"; + if (src.field->fmt && src.field->fmt->tbl) + out << " " << src.field->fmt->tbl->find_field(src.field); + break; + case ActionBusSource::HashDist: + out << "HashDist(" << src.hd->hash_group << ", " << src.hd->id << ")"; + break; + case ActionBusSource::HashDistPair: + out << "HashDistPair([" << src.hd_tuple.hd1->hash_group << ", " << src.hd_tuple.hd1->id + << "]," << "[" << src.hd_tuple.hd2->hash_group << ", " << src.hd_tuple.hd2->id + << "])"; + break; + case ActionBusSource::RandomGen: + out << "rng " << src.rng.unit; + break; + case ActionBusSource::TableOutput: + out << "TableOutput(" << (src.table ? src.table->name() : "0") << ")"; + break; + case ActionBusSource::TableColor: + out << "TableColor(" << (src.table ? src.table->name() : "0") << ")"; + break; + case ActionBusSource::TableAddress: + out << "TableAddress(" << (src.table ? src.table->name() : "0") << ")"; + break; + case ActionBusSource::Ealu: + out << "EALU"; + break; + case ActionBusSource::XcmpData: + out << "XCMP(" << src.xcmp_data.xcmp_group << ":" << src.xcmp_data.xcmp_byte << ")"; + break; + case ActionBusSource::NameRef: + out << "NameRef(" << (src.name_ref ? src.name_ref->name : "0") << ")"; + break; + case ActionBusSource::ColorRef: + out << "ColorRef(" << (src.name_ref ? src.name_ref->name : "0") << ")"; + break; + case ActionBusSource::AddressRef: + out << "AddressRef(" << (src.name_ref ? src.name_ref->name : "0") << ")"; + break; + default: + out << ""; + break; + } + return out; +} + +/* identifes which bytes on the action bus are tied together in the hv_xbar input, + * so must be routed together. The second table here is basically just bitcount of + * masks in the first table. 
*/ +static std::array, ACTION_HV_XBAR_SLICES> action_hv_slice_byte_groups = {{ + {0x3, 0x3, 0xc, 0xc, 0xf0, 0xf0, 0xf0, 0xf0, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xf, 0xf, 0xf, 0xf, 0xf0, 0xf0, 0xf0, 0xf0, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xf, 0xf, 0xf, 0xf, 0xf0, 0xf0, 0xf0, 0xf0, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xf, 0xf, 0xf, 0xf, 0xf0, 0xf0, 0xf0, 0xf0, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, 0xff00, + 0xff00, 0xff00}, +}}; + +static std::array, ACTION_HV_XBAR_SLICES> action_hv_slice_group_align = { + {{2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8}, + {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8}, + {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8}, + {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8}, + {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}}}; + +ActionBus::ActionBus(Table *tbl, VECTOR(pair_t) & data) { + lineno = data.size ? data[0].key.lineno : -1; + for (auto &kv : data) { + if (!CHECKTYPE2(kv.key, tINT, tRANGE)) continue; + unsigned idx = kv.key.type == tRANGE ? 
kv.key.range.lo : kv.key.i; + if (!CHECKTYPE2M(kv.value, tSTR, tCMD, "field name or slice")) continue; + const char *name = kv.value.s; + value_t *name_ref = &kv.value; + unsigned off = 0, sz = 0; + if (kv.value.type == tCMD) { + BUG_CHECK(kv.value.vec.size > 0 && kv.value[0].type == tSTR); + if (kv.value == "hash_dist" || kv.value == "rng") { + if (!PCHECKTYPE(kv.value.vec.size > 1, kv.value[1], tINT)) continue; + name = kv.value[0].s; + name_ref = nullptr; + } else { + if (!PCHECKTYPE2M(kv.value.vec.size == 2, kv.value[1], tRANGE, tSTR, + "field name or slice")) + continue; + // if ((kv.value[1].range.lo & 7) != 0 || (kv.value[1].range.hi & 7) != 7) { + // error(kv.value.lineno, "Slice must be byte slice"); + // continue; } + name = kv.value[0].s; + name_ref = &kv.value[0]; + if (kv.value[1].type == tRANGE) { + off = kv.value[1].range.lo; + sz = kv.value[1].range.hi - kv.value[1].range.lo + 1; + } else if (kv.value[1] != "color") { + error(kv.value[1].lineno, "unexpected %s", kv.value[1].s); + } + } + } + Table::Format::Field *f = tbl->lookup_field(name, "*"); + ActionBusSource src; + const char *p = name - 1; + while (!f && (p = strchr(p + 1, '.'))) + f = tbl->lookup_field(p + 1, std::string(name, p - name)); + if (!f) { + if (tbl->table_type() == Table::ACTION) { + error(kv.value.lineno, "No field %s in format", name); + continue; + } else if (kv.value == "meter") { + src = ActionBusSource(MeterBus); + if (kv.value.type == tCMD) { + if (kv.value[1] == "color") { + src.type = ActionBusSource::ColorRef; + if (!sz) off = 24, sz = 8; + } else if (kv.value[1] == "address") { + src.type = ActionBusSource::AddressRef; + } + } + } else if (kv.value.type == tCMD && kv.value == "hash_dist") { + if (auto hd = tbl->find_hash_dist(kv.value[1].i)) { + src = ActionBusSource(hd); + } else { + error(kv.value.lineno, "No hash_dist %" PRId64 " in table %s", kv.value[1].i, + tbl->name()); + continue; + } + sz = 16; + for (int i = 2; i < kv.value.vec.size; ++i) { + if (kv.value[i] 
== "lo" || kv.value[i] == "low") { + src.hd->xbar_use |= HashDistribution::IMMEDIATE_LOW; + } else if (kv.value[i] == "hi" || kv.value[i] == "high") { + src.hd->xbar_use |= HashDistribution::IMMEDIATE_HIGH; + off += 16; + } else if (kv.value[i].type == tINT) { + if (auto hd_hi = tbl->find_hash_dist(kv.value[i].i)) { + src.hd->xbar_use |= HashDistribution::IMMEDIATE_LOW; + hd_hi->xbar_use |= HashDistribution::IMMEDIATE_HIGH; + setup_slot(kv.value.lineno, tbl, name, idx + 2, ActionBusSource(hd_hi), + 16, 16); + setup_slot(kv.value.lineno, tbl, name, idx, + ActionBusSource(src.hd, hd_hi), 32, 0); + } + } else if (kv.value[i].type == tRANGE) { + if ((kv.value[i].range.lo & 7) != 0 || (kv.value[i].range.hi & 7) != 7) + error(kv.value.lineno, "Slice must be byte slice"); + off += kv.value[i].range.lo; + sz = kv.value[i].range.hi - kv.value[i].range.lo + 1; + } else { + error(kv.value[i].lineno, "Unexpected hash_dist %s", + value_desc(kv.value[i])); + break; + } + } + } else if (kv.value.type == tCMD && kv.value == "rng") { + src = ActionBusSource(RandomNumberGen(kv.value[1].i)); + if (kv.value.vec.size > 2 && CHECKTYPE(kv.value[2], tRANGE)) { + off = kv.value[2].range.lo; + sz = kv.value[2].range.hi + 1 - off; + } + } else if (name_ref) { + src = ActionBusSource(new Table::Ref(*name_ref)); + if (kv.value.type == tCMD) { + if (kv.value[1] == "color") { + src.type = ActionBusSource::ColorRef; + if (!sz) off = 24, sz = 8; + } else if (kv.value[1] == "address") { + src.type = ActionBusSource::AddressRef; + } + } + } else if (tbl->format) { + error(kv.value.lineno, "No field %s in format", name); + continue; + } + } else { + src = ActionBusSource(f); + if (!sz) sz = f->size; + if (off + sz > f->size) + error(kv.value.lineno, "Invalid slice of %d bit field %s", f->size, name); + } + if (kv.key.type == tRANGE) { + unsigned size = (kv.key.range.hi - idx + 1) * 8; + // Make slot size (sz) same as no. of bytes allocated on action bus. 
// --- Tail of a definition that begins before this chunk (immediate-data slot setup from
// --- the assembler syntax); reproduced as-is.  When no explicit size is given, the slot
// --- width defaults by action-bus region: bytes 0..31 are 8b, then 16b, then 32b slots.
        if (size > sz) sz = size;
    } else if (!sz) {
        sz = idx < ACTION_DATA_8B_SLOTS                               ? 8
             : idx < ACTION_DATA_8B_SLOTS + 2 * ACTION_DATA_16B_SLOTS ? 16
                                                                      : 32;
    }
    setup_slot(kv.key.lineno, tbl, name, idx, src, sz, off);
    // Mark the named format field as consumed as immediate action data.
    tbl->apply_to_field(
        name, [](Table::Format::Field *f) { f->flags |= Table::Format::Field::USED_IMMED; });
    if (f) {
        // Record the (source, offset) pair in the slot for every action's copy of the field.
        auto &slot = by_byte.at(idx);
        tbl->apply_to_field(name, [&slot, tbl, off](Table::Format::Field *f) {
            ActionBusSource src(f);
            if (slot.data.emplace(src, off).second) {
                LOG4(" data += " << src.toString(tbl) << " off=" << off);
            }
        });
    }
    }
}

// Factory: default-constructed (empty) ActionBus.
// NOTE(review): the return type's template argument (presumably <ActionBus>) was lost in
// extraction of this patch -- confirm against the upstream file.
std::unique_ptr ActionBus::create() {
    return std::unique_ptr(new ActionBus());
}

// Factory: ActionBus parsed from an assembler key/value list for table 'tbl'.
std::unique_ptr ActionBus::create(Table *tbl, VECTOR(pair_t) & data) {
    return std::unique_ptr(new ActionBus(tbl, data));
}

// Record that action-bus byte 'idx' carries 'sz' bits of 'src' (at bit offset 'off' within
// the source), creating the Slot if the byte is not yet used, or widening/merging into the
// existing Slot if it is.  Out-of-range indexes are reported against 'lineno'.
void ActionBus::setup_slot(int lineno, Table *tbl, const char *name, unsigned idx,
                           ActionBusSource src, unsigned sz, unsigned off) {
    if (idx >= ACTION_DATA_BUS_BYTES) {
        error(lineno, "Action bus index out of range");
        return;
    }
    if (by_byte.count(idx)) {
        // Byte already in use: keep the widest name/size and add this source to the set.
        auto &slot = by_byte.at(idx);
        if (sz > slot.size) {
            slot.name = name;
            slot.size = sz;
        }
        slot.data.emplace(src, off);
        LOG4("ActionBus::ActionBus: " << idx << ": " << name << " sz=" << sz
                                      << " data += " << src.toString(tbl) << " off=" << off);
    } else {
        by_byte.emplace(idx, Slot(name, idx, sz, src, off));
        LOG4("ActionBus::ActionBus: " << idx << ": " << name << " sz=" << sz
                                      << " data = " << src.toString(tbl) << " off=" << off);
    }
}

// Low bit of this slot on the action-data (input) bus.  All sources recorded in the slot
// must agree on that bit (BUG_CHECKed); Field sources are translated via immed_bit().
unsigned ActionBus::Slot::lo(Table *tbl) const {
    int rv = -1;
    for (auto &src : data) {
        int off = src.second;
        if (src.first.type == ActionBusSource::Field) off += src.first.field->immed_bit(0);
        BUG_CHECK(rv < 0 || rv == off);
        rv = off;
    }
    BUG_CHECK(rv >= 0);
    return rv;
}

// Can sources 'a' (at bit a_off) and 'b' (at bit b_off) share the same action-bus location?
// A single HashDist is compatible with either half of a HashDistPair (the pair's second
// unit sits 16 bits up, hence the +16 adjustments).
bool ActionBus::compatible(const ActionBusSource &a, unsigned a_off, const ActionBusSource &b,
                           unsigned b_off) {
    if ((a.type == ActionBusSource::HashDist) && (b.type == ActionBusSource::HashDistPair)) {
        return ((compatible(a, a_off, ActionBusSource(b.hd_tuple.hd1), b_off)) ||
                (compatible(a, a_off, ActionBusSource(b.hd_tuple.hd2), b_off + 16)));
    } else if ((a.type == ActionBusSource::HashDistPair) && (b.type == ActionBusSource::HashDist)) {
        return ((compatible(ActionBusSource(a.hd_tuple.hd1), a_off, b, b_off)) ||
                (compatible(ActionBusSource(a.hd_tuple.hd2), a_off + 16, b, b_off)));
    }
    if (a.type != b.type) return false;
    switch (a.type) {
        case ActionBusSource::Field:
            // corresponding fields in different groups are compatible even though they
            // are at different locations. Table::Format::pass1 checks that
            if (a.field->by_group == b.field->by_group) return true;
            return a.field->bit(a_off) == b.field->bit(b_off);
        case ActionBusSource::HashDist:
            return a.hd->hash_group == b.hd->hash_group && a.hd->id == b.hd->id && a_off == b_off;
        case ActionBusSource::HashDistPair:
            return ((a.hd_tuple.hd1->hash_group == b.hd_tuple.hd1->hash_group &&
                     a.hd_tuple.hd1->id == b.hd_tuple.hd1->id) &&
                    (a_off == b_off) &&
                    (a.hd_tuple.hd2->hash_group == b.hd_tuple.hd2->hash_group &&
                     a.hd_tuple.hd2->id == b.hd_tuple.hd2->id));
        case ActionBusSource::TableOutput:
            return a.table == b.table;
        default:
            return false;
    }
}

// Pass 1: resolve all by-name references (NameRef/ColorRef/AddressRef) into concrete
// sources (TableOutput/TableColor/TableAddress or format Fields), check that every slot's
// sources are mutually compatible, and claim the stage-level action bus slots, diagnosing
// conflicts with other tables.
void ActionBus::pass1(Table *tbl) {
    // NOTE(review): the dynamic_cast target type was stripped in extraction (likely a
    // cast distinguishing immediate-data tables from action-data tables) -- confirm upstream.
    bool is_immed_data = dynamic_cast(tbl) != nullptr;
    LOG1("ActionBus::pass1(" << tbl->name() << ")" << (is_immed_data ? " [immed]" : ""));
    if (lineno < 0)
        lineno = tbl->format && tbl->format->lineno >= 0 ? tbl->format->lineno : tbl->lineno;
    Slot *use[ACTION_DATA_BUS_SLOTS] = {0};
    for (auto &slot : Values(by_byte)) {
        for (auto it = slot.data.begin(); it != slot.data.end();) {
            if (it->first.type >= ActionBusSource::NameRef &&
                it->first.type <= ActionBusSource::AddressRef) {
                // Remove all NameRef and replace with TableOutputs or Fields
                // ColorRef turns into TableColor, AddressRef into TableAddress
                if (it->first.name_ref) {
                    bool ok = false;
                    if (*it->first.name_ref) {
                        // Name resolves to a table -- convert to the proper output source.
                        ActionBusSource src(*it->first.name_ref);
                        switch (it->first.type) {
                            case ActionBusSource::NameRef:
                                src.table->set_output_used();
                                break;
                            case ActionBusSource::ColorRef:
                                src.type = ActionBusSource::TableColor;
                                src.table->set_color_used();
                                break;
                            case ActionBusSource::AddressRef:
                                src.type = ActionBusSource::TableAddress;
                                src.table->set_address_used();
                                break;
                            default:
                                BUG();
                        }
                        slot.data[src] = it->second;
                        ok = true;
                    } else if (tbl->actions) {
                        // Not a table name -- try to resolve it as an action-format field
                        // (possibly via per-action aliases); all actions must agree.
                        Table::Format::Field *found_field = nullptr;
                        Table::Actions::Action *found_act = nullptr;
                        for (auto &act : *tbl->actions) {
                            int lo = -1, hi = -1;
                            auto name = act.alias_lookup(it->first.name_ref->lineno,
                                                         it->first.name_ref->name, lo, hi);
                            if (auto *field = tbl->lookup_field(name, act.name)) {
                                if (found_field) {
                                    if (field != found_field ||
                                        slot.data.at(ActionBusSource(field)) != it->second + lo)
                                        error(it->first.name_ref->lineno,
                                              "%s has incompatible "
                                              "aliases in actions %s and %s",
                                              it->first.name_ref->name.c_str(),
                                              found_act->name.c_str(), act.name.c_str());
                                } else {
                                    found_act = &act;
                                    found_field = field;
                                    slot.data[ActionBusSource(field)] = it->second + lo;
                                    ok = true;
                                }
                            }
                        }
                    }
                    if (!ok)
                        error(it->first.name_ref->lineno, "No format field or table named %s",
                              it->first.name_ref->name.c_str());
                } else {
                    // Null name_ref means "the meter bus" -- require exactly one attached meter.
                    auto att = tbl->get_attached();
                    if (!att || att->meters.empty()) {
                        error(lineno, "No meter table attached to %s", tbl->name());
                    } else if (att->meters.size() > 1) {
                        error(lineno, "Multiple meter tables attached to %s", tbl->name());
                    } else {
                        ActionBusSource src(att->meters.at(0));
                        switch (it->first.type) {
                            case ActionBusSource::NameRef:
                                src.table->set_output_used();
                                break;
                            case ActionBusSource::ColorRef:
                                src.type = ActionBusSource::TableColor;
                                src.table->set_color_used();
                                break;
                            case ActionBusSource::AddressRef:
                                src.type = ActionBusSource::TableAddress;
                                src.table->set_address_used();
                                break;
                            default:
                                BUG();
                        }
                        slot.data[src] = it->second;
                    }
                }
                it = slot.data.erase(it);
            } else {
                // Already-concrete source: just record usage on the referenced table.
                if (it->first.type == ActionBusSource::TableColor)
                    it->first.table->set_color_used();
                if (it->first.type == ActionBusSource::TableOutput)
                    it->first.table->set_output_used();
                ++it;
            }
        }
        if (error_count > 0) continue;
        // All sources sharing a slot must be pairwise compatible with the first.
        auto first = slot.data.begin();
        if (first != slot.data.end()) {
            for (auto it = next(first); it != slot.data.end(); ++it) {
                if (!compatible(first->first, first->second, it->first, it->second))
                    error(lineno, "Incompatible action bus entries at offset %d", slot.byte);
            }
        }
        // Claim the stage's hardware slots this (possibly multi-byte) slot covers.
        int slotno = Stage::action_bus_slot_map[slot.byte];
        for (unsigned byte = slot.byte; byte < slot.byte + slot.size / 8U;
             byte += Stage::action_bus_slot_size[slotno++] / 8U) {
            if (slotno >= ACTION_DATA_BUS_SLOTS) {
                error(lineno, "%s extends past the end of the actions bus", slot.name.c_str());
                break;
            }
            if (auto tbl_in_slot = tbl->stage->action_bus_use[slotno]) {
                if (tbl_in_slot != tbl) {
                    // Sharing is allowed for mutually-exclusive atcam tables or when the
                    // occupied bits do not overlap; otherwise warn.
                    if (!(check_atcam_sharing(tbl, tbl_in_slot) ||
                          check_slot_sharing(slot, tbl->stage->action_bus_use_bit_mask)))
                        warning(lineno, "Action bus byte %d set in table %s and table %s", byte,
                                tbl->name(), tbl->stage->action_bus_use[slotno]->name());
                }
            } else {
                tbl->stage->action_bus_use[slotno] = tbl;
                // Set a per-byte mask on the action bus bytes to indicate which
                // bits in bytes are being used. A slot can be shared among
                // tables which dont overlap any bits. The code assumes the
                // action bus allocation is byte aligned (and sets the mask to
                // 0xF), while this could ideally be not the case. In that
                // event, the mask must be set accordingly. This will require
                // additional logic to determine which bits in the byte are used
                // or additional syntax in the action bus assembly output.
                tbl->stage->action_bus_use_bit_mask.setrange(slot.byte * 8U, slot.size);
            }
            if (use[slotno]) {
                // Two of our own slots land in the same hardware slot -- their source bit
                // positions must line up exactly.
                BUG_CHECK(!slot.data.empty() && !use[slotno]->data.empty());
                auto nsrc = slot.data.begin()->first;
                unsigned noff = slot.data.begin()->second;
                unsigned nstart = 8 * (byte - slot.byte) + noff;
                if (nsrc.type == ActionBusSource::Field) nstart = nsrc.field->immed_bit(nstart);
                auto osrc = use[slotno]->data.begin()->first;
                unsigned ooff = use[slotno]->data.begin()->second;
                unsigned ostart = 8 * (byte - use[slotno]->byte) + ooff;
                if (osrc.type == ActionBusSource::Field) {
                    if (ostart < osrc.field->size)
                        ostart = osrc.field->immed_bit(ostart);
                    else
                        ostart += osrc.field->immed_bit(0);
                }
                if (ostart != nstart)
                    error(lineno,
                          "Action bus byte %d used inconsistently for fields %s and "
                          "%s in table %s",
                          byte, use[slotno]->name.c_str(), slot.name.c_str(), tbl->name());
            } else {
                use[slotno] = &slot;
            }
            // Track which action-HV-xbar byte groups feed this 128-bit action bus slice.
            unsigned hi = slot.lo(tbl) + slot.size - 1;
            if (action_hv_slice_use.size() <= hi / 128U) action_hv_slice_use.resize(hi / 128U + 1);
            auto &hv_groups = action_hv_slice_byte_groups.at(slot.byte / 16);
            for (unsigned byte = slot.lo(tbl) / 8U; byte <= hi / 8U; ++byte) {
                byte_use[byte] = 1;
                action_hv_slice_use.at(byte / 16).at(slot.byte / 16) |= hv_groups.at(byte % 16);
            }
        }
    }
}

// True when no bit of 'slot' overlaps a bit already claimed in the stage-wide usage mask,
// i.e. the hardware slot can be shared without interference.
bool ActionBus::check_slot_sharing(Slot &slot, bitvec &action_bus) {
    return (action_bus.getrange(slot.byte * 8U, slot.size) == 0);
}

// True when tbl1 and tbl2 are parts of the same P4 atcam (match or action halves), which
// are mutually exclusive and so may share action bus bytes.
bool ActionBus::check_atcam_sharing(Table *tbl1, Table *tbl2) {
    bool atcam_share_bytes = false;
    bool atcam_action_share_bytes = false;
    // Check tables are not same atcam's sharing
    // bytes on action bus
    // NOTE(review): the ->to() calls below lost their template arguments (the target table
    // subclasses, e.g. the atcam match/action table types) in extraction -- confirm upstream.
    if (tbl1->to() && tbl2->to() &&
        tbl1->p4_table->p4_name() == tbl2->p4_table->p4_name())
        atcam_share_bytes = true;
    // Check tables are not same atcam action tables sharing bytes on action bus
    if (auto tbl1_at = tbl1->to()) {
        if (auto tbl2_at = tbl2->to()) {
            auto tbl1_mt = tbl1_at->get_match_table();
            auto tbl2_mt = tbl2_at->get_match_table();
            if (tbl1_mt->p4_table->p4_name() == tbl2_mt->p4_table->p4_name())
                atcam_action_share_bytes = true;
        }
    }
    return (atcam_share_bytes || atcam_action_share_bytes);
}

// Note that bits lo..hi of 'src' must be made available on the action output bus, with the
// needed access widths encoded in 'size' (bitmask, bit i == access of 2^i bytes).  Actual
// placement happens later in pass3/alloc_field; this just accumulates requirements and
// marks the referenced table outputs as used.
void ActionBus::need_alloc(Table *tbl, const ActionBusSource &src, unsigned lo, unsigned hi,
                           unsigned size) {
    LOG3("need_alloc(" << tbl->name() << ") " << src << " lo=" << lo << " hi=" << hi << " size=0x"
                       << hex(size));
    need_place[src][lo] |= size;
    switch (src.type) {
        case ActionBusSource::Field:
            lo += src.field->immed_bit(0);  // translate field offset to immediate-bus bit
            break;
        case ActionBusSource::TableOutput:
            src.table->set_output_used();
            break;
        case ActionBusSource::TableColor:
            src.table->set_color_used();
            break;
        case ActionBusSource::TableAddress:
            src.table->set_address_used();
            break;
        case ActionBusSource::XcmpData:
            break;
        default:
            break;
    }
    byte_use.setrange(lo / 8U, size);
}

/**
 * find_free -- find a free slot on the action output bus for some data. Looks through bytes
 * in the range min..max for a free space where we can put 'bytes' bytes from an action
 * input bus starting at 'lobyte'. 'step' is an optimization to only check every step bytes
 * as we know alignment restrictions mean those are the only possible aligned spots
 */
int ActionBus::find_free(Table *tbl, unsigned min, unsigned max, unsigned step, unsigned lobyte,
                         unsigned bytes) {
    unsigned avail;
    LOG4("find_free(" << min << ", " << max << ", " << step << ", " << lobyte << ", " << bytes
                      << ")");
    for (unsigned i = min; i + bytes - 1 <= max; i += step) {
        unsigned hv_slice = i / ACTION_HV_XBAR_SLICE_SIZE;
        auto &hv_groups = action_hv_slice_byte_groups.at(hv_slice);
        // Alignment masks for the first and last input byte within their 16-byte groups.
        int mask1 = action_hv_slice_group_align.at(hv_slice).at(lobyte % 16U) - 1;
        int mask2 = action_hv_slice_group_align.at(hv_slice).at((lobyte + bytes - 1) % 16U) - 1;
        if ((i ^ lobyte) & mask1) continue;  // misaligned
        bool inuse = false;
        // Check the input-bus side: is any byte of the aligned group already routed to
        // this hv slice via a conflicting group?
        for (unsigned byte = lobyte & ~mask1; byte <= ((lobyte + bytes - 1) | mask2); ++byte) {
            if (!byte_use[byte]) continue;
            if (action_hv_slice_use.size() <= byte / 16U)
                action_hv_slice_use.resize(byte / 16U + 1);
            if (action_hv_slice_use.at(byte / 16U).at(hv_slice) & hv_groups.at(byte % 16U)) {
                LOG5(" input byte " << byte << " in use for hv_slice " << hv_slice);
                inuse = true;
                break;
            }
        }
        if (inuse) {
            // skip up to next hv_slice
            while ((i + step) / ACTION_HV_XBAR_SLICE_SIZE == hv_slice) i += step;
            continue;
        }
        // Check the output-bus side: are the target hardware slots free?
        for (unsigned byte = i & ~mask1; byte <= ((i + bytes - 1) | mask2); ++byte)
            if (tbl->stage->action_bus_use[Stage::action_bus_slot_map[byte]]) {
                LOG5(" output byte "
                     << byte << " in use by "
                     << tbl->stage->action_bus_use[Stage::action_bus_slot_map[byte]]->name());
                inuse = true;
                break;
            }
        if (inuse) continue;
        for (avail = 1; avail < bytes; avail++)
            if (tbl->stage->action_bus_use[Stage::action_bus_slot_map[i + avail]]) break;
        if (avail >= bytes) return i;
    }
    return -1;  // no free aligned spot found
}

/**
 * find_merge -- find any adjacent/overlapping data on the action input bus that means the
 * data at 'offset' actually already on the action output bus
 *   offset  offset (in bits) on the action input bus of the data we're interested in
 *   bytes   how many bytes of data on the action input bus
 *   use     bitmask of the sizes of phv that need to access this on the action output bus
 */
int ActionBus::find_merge(Table *tbl, int offset, int bytes, int use) {
    LOG4("find_merge(" << offset << ", " << bytes << ", " << use << ")");
    // NOTE(review): dynamic_cast target type stripped in extraction -- confirm upstream.
    bool is_action_data = dynamic_cast(tbl) != nullptr;
    for (auto &alloc : by_byte) {
        // Restrict to the action-bus region matching the requested access width:
        // bytes 0..31 for 8-bit accesses, 32..95 for 16-bit.
        if (use & 1) {
            if (alloc.first >= 32) break;
        } else if (use & 2) {
            if (alloc.first < 32) continue;
            if (alloc.first >= 96) break;
        }
        if (alloc.second.is_table_output()) continue;  // can't merge table output with immediate
        int inbyte = alloc.second.lo(tbl) / 8U;
        int align = 4;
        if (is_action_data)
            align = action_hv_slice_group_align.at(alloc.first / 16U).at(inbyte % 16U);
        int outbyte = alloc.first & ~(align - 1);
        inbyte &= ~(align - 1);
        // If the requested data lies within the aligned window already routed, reuse it.
        if (offset >= inbyte * 8 && offset + bytes * 8 <= (inbyte + align) * 8)
            return outbyte + offset / 8 - inbyte;
    }
    return -1;
}

// Commit an allocation: route 'bytes' bytes of 'src' (input-bus bytes starting at 'lobyte',
// source bit offset 'offset') to output-bus byte 'use', claiming stage slots and recording
// the Slot(s) in by_byte.  Multi-slot allocations advance through successive hardware slots.
void ActionBus::do_alloc(Table *tbl, ActionBusSource src, unsigned use, int lobyte, int bytes,
                         unsigned offset) {
    LOG2("putting " << src << '(' << offset << ".." << (offset + bytes * 8 - 1) << ")["
                    << (lobyte * 8) << ".." << ((lobyte + bytes) * 8 - 1) << "] at action_bus "
                    << use);
    unsigned hv_slice = use / ACTION_HV_XBAR_SLICE_SIZE;
    auto &hv_groups = action_hv_slice_byte_groups.at(hv_slice);
    for (unsigned byte = lobyte; byte < unsigned(lobyte + bytes); ++byte) {
        if (action_hv_slice_use.size() <= byte / 16) action_hv_slice_use.resize(byte / 16 + 1);
        action_hv_slice_use.at(byte / 16).at(hv_slice) |= hv_groups.at(byte % 16);
    }
    while (bytes > 0) {
        int slot = Stage::action_bus_slot_map[use];
        int slotsize = Stage::action_bus_slot_size[slot];
        auto slot_tbl = tbl->stage->action_bus_use[slot];
        // Atcam tables are mutually exclusive and should be allowed to share
        // bytes on action bus
        if (slot_tbl && !Table::allow_bus_sharing(tbl, slot_tbl))
            BUG_CHECK(slot_tbl == tbl || slot_tbl->action_bus->by_byte.at(use).data.count(src));
        tbl->stage->action_bus_use[slot] = tbl;
        Slot &sl = by_byte.emplace(use, Slot(src.name(tbl), use, bytes * 8U)).first->second;
        if (sl.size < bytes * 8U) sl.size = bytes * 8U;
        sl.data.emplace(src, offset);
        LOG4(" slot " << sl.byte << "(" << sl.name << ") data += " << src.toString(tbl)
                      << " off=" << offset);
        offset += slotsize;
        bytes -= slotsize / 8U;
        use += slotsize / 8U;
    }
}

// For a size-needed bitmask (index), the span (in bits, minus 1) that must be reserved.
const unsigned ActionBus::size_masks[8] = {7, 7, 15, 15, 31, 31, 31, 31};

// Place one requested (source, offset) on the action output bus, trying in order: an
// existing placement elsewhere in the stage (find), a merge with adjacent data
// (find_merge), then a fresh free slot (find_free) in each region the access sizes demand.
void ActionBus::alloc_field(Table *tbl, ActionBusSource src, unsigned offset,
                            unsigned sizes_needed) {
    LOG4("alloc_field(" << src << ", " << offset << ", " << sizes_needed << ")");
    int lineno = this->lineno;
    // NOTE(review): dynamic_cast target type stripped in extraction -- confirm upstream.
    bool is_action_data = dynamic_cast(tbl) != nullptr;
    int lo, hi, use;
    bool can_merge = true;
    if (src.type == ActionBusSource::Field) {
        lo = src.field->immed_bit(offset);
        hi = src.field->immed_bit(src.field->size) - 1;
        lineno = tbl->find_field_lineno(src.field);
    } else {
        lo = offset;
        // Table outputs/colors/addresses and RNG data cannot be merged with other data.
        if (src.type == ActionBusSource::TableOutput || src.type == ActionBusSource::TableColor ||
            src.type == ActionBusSource::TableAddress || src.type ==
ActionBusSource::RandomGen)
            can_merge = false;
        if (src.type == ActionBusSource::HashDist &&
            !(src.hd->xbar_use & HashDistribution::IMMEDIATE_LOW))
            lo += 16;  // hash_dist in the upper half of the 32-bit immediate path
        hi = lo | size_masks[sizes_needed];
    }
    if (lo / 32U != hi / 32U) {
        /* Can't go across 32-bit boundary so chop it down as needed */
        hi = lo | 31U;
    }
    int bytes = hi / 8U - lo / 8U + 1;
    // Alignment step within a 128-bit action slice depends on where the data starts.
    int step = 4;
    if (is_action_data) step = (lo % 128U) < 32 ? 2 : (lo % 128U) < 64 ? 4 : 8;
    if (sizes_needed & 1) {
        /* need 8-bit */
        if ((lo % 8U) && (lo / 8U != hi / 8U)) {
            error(lineno,
                  "%s not correctly aligned for 8-bit use on "
                  "action bus",
                  src.toString(tbl).c_str());
            return;
        }
        unsigned start = (lo / 8U) % step;
        int bytes_needed = (sizes_needed & 4) ? bytes : 1;
        if ((use = find(tbl->stage, src, lo, hi, 1)) >= 0 ||
            (can_merge && (use = find_merge(tbl, lo, bytes_needed, 1)) >= 0) ||
            (use = find_free(tbl, start, 31, step, lo / 8U, bytes_needed)) >= 0)
            do_alloc(tbl, src, use, lo / 8U, bytes_needed, offset);
        else
            error(lineno, "Can't allocate space on 8-bit part of action bus for %s",
                  src.toString(tbl).c_str());
    }
    step = (lo % 128U) < 64 ? 4 : 8;
    if (sizes_needed & 2) {
        /* need 16-bit */
        if (lo % 16U) {
            if (lo / 16U != hi / 16U) {
                error(lineno,
                      "%s not correctly aligned for 16-bit use "
                      "on action bus",
                      src.toString(tbl).c_str());
                return;
            }
            // Misaligned within a halfword: only a merge with existing data can work.
            if (can_merge && (use = find_merge(tbl, lo, bytes, 2)) >= 0) {
                do_alloc(tbl, src, use, lo / 8U, bytes, offset);
                return;
            }
        }
        if (!(sizes_needed & 4) && bytes > 2) bytes = 2;
        unsigned start = 32 + (lo / 8U) % step;
        if ((use = find(tbl->stage, src, lo, hi, 2)) >= 0 ||
            (can_merge && (use = find_merge(tbl, lo, bytes, 2)) >= 0) ||
            (use = find_free(tbl, start, 63, step, lo / 8U, bytes)) >= 0 ||
            (use = find_free(tbl, start + 32, 95, 8, lo / 8U, bytes)) >= 0)
            do_alloc(tbl, src, use, lo / 8U, bytes, offset);
        else
            error(lineno, "Can't allocate space on 16-bit part of action bus for %s",
                  src.toString(tbl).c_str());
    }
    if (sizes_needed == 4) {
        /* need only 32-bit */
        unsigned odd = (lo / 8U) & (4 & step);
        unsigned start = (lo / 8U) % step;
        if (lo % 32U) {
            if (can_merge && (use = find_merge(tbl, lo, bytes, 4)) >= 0) {
                do_alloc(tbl, src, use, lo / 8U, bytes, offset);
                return;
            }
        }
        // Prefer the 32-bit-only region (96..127), then progressively lower regions.
        if ((use = find(tbl->stage, src, lo, hi, 4)) >= 0 ||
            (can_merge && (use = find_merge(tbl, lo, bytes, 4)) >= 0) ||
            (use = find_free(tbl, 96 + start + odd, 127, 8, lo / 8U, bytes)) >= 0 ||
            (use = find_free(tbl, 64 + start + odd, 95, 8, lo / 8U, bytes)) >= 0 ||
            (use = find_free(tbl, 32 + start, 63, step, lo / 8U, bytes)) >= 0 ||
            (use = find_free(tbl, 0 + start, 31, step, lo / 8U, bytes)) >= 0)
            do_alloc(tbl, src, use, lo / 8U, bytes, offset);
        else
            error(lineno, "Can't allocate space on action bus for %s", src.toString(tbl).c_str());
    }
}

// Pass 3: place everything accumulated in need_place onto the bus, then verify that at
// most one random-number-generator unit is used by this table.
void ActionBus::pass3(Table *tbl) {
    // NOTE(review): dynamic_cast target type stripped in extraction -- confirm upstream.
    bool is_action_data = dynamic_cast(tbl) != nullptr;
    LOG1("ActionBus::pass3(" << tbl->name() << ") " << (is_action_data ? "[action]" : "[immed]"));
    for (auto &d : need_place)
        for (auto &bits : d.second) alloc_field(tbl, d.first, bits.first, bits.second);
    int rnguse = -1;
    for (auto &slot : by_byte) {
        for (auto &d : slot.second.data) {
            if (d.first.type == ActionBusSource::RandomGen) {
                if (rnguse >= 0 && rnguse != d.first.rng.unit)
                    error(lineno, "Can't use both rng units in a single table");
                rnguse = d.first.rng.unit;
            }
        }
    }
}

// Access-size capability bitmask for each 32-byte region of the action bus.
static int slot_sizes[] = {
    5, /* 8-bit or 32-bit */
    6, /* 16-bit or 32-bit */
    6, /* 16-bit or 32-bit */
    4  /* 32-bit only */
};

/**
 * ActionBus::find
 * @brief find an action bus slot that contains the requested thing.
 *
 * Overloads allow looking for different kinds of things -- a Format::Field,
 * a HashDistribution, a RandomNumberGen, or something by name (generally a table output).
 * @param f a Format::Field to look for
 * @param name named slot to look for -- generally a table output, but may be a field
 * @param hd a HashDistribution to look for
 * @param rng a RandomNumberGen to look for
 * @param lo, hi range of bits in the thing specified by the first arg
 * @param size bitmask of needed size classes -- 3 bits that denote need for a 8/16/32 bit
 * actionbus slot. Generally will only have 1 bit set, but might be 0.
 */
int ActionBus::find(const char *name, TableOutputModifier mod, int lo, int hi, int size, int *len) {
    // NOTE(review): the ::get and static_cast calls below lost their template arguments in
    // extraction -- confirm upstream.
    if (auto *tbl = ::get(Table::all, name))
        return find(ActionBusSource(tbl, mod), lo, hi, size, -1, len);
    if (mod != TableOutputModifier::NONE) return -1;
    for (auto &slot : by_byte) {
        int offset = lo;
        if (slot.second.name != name) continue;
        if (size && !(size & static_cast(slot_sizes[slot.first / 32U]))) continue;
        if (offset >= static_cast(slot.second.size)) continue;
        if (len) *len = slot.second.size;
        return slot.first + offset / 8;
    }
    return -1;
}

// Find a slot carrying 'src' covering bits lo..hi, optionally constrained to bus position
// 'pos' and to access-size classes 'size'; returns the bus byte or -1.
int ActionBus::find(const ActionBusSource &src, int lo, int hi, int size, int pos, int *len) {
    bool hd1Found = true;
    int hd1Pos = -1;
    for (auto &slot : by_byte) {
        if (!slot.second.data.count(src)) continue;
        int offset = slot.second.data[src];
        // FIXME -- HashDist is 16 bits in either half of the 32-bit immediate path; we call
        // the high half (16..31), but we address it directly (as if it was 16 bits) for
        // non-32 bit accesses. So we ignore the top bit of the offset bit index when
        // accessing it for 8- or 16- bit slots.
        // There should be a better way of doing this.
        if ((src.type == ActionBusSource::HashDist || src.type == ActionBusSource::HashDistPair) &&
            size < 4)
            offset &= 15;
        // Table Color is 8 bits which is ORed into the top of the immediate; The offset is
        // thus >= 24, but we want to ignore that here and just use the offset within the byte
        if (src.type == ActionBusSource::TableColor) offset &= 7;
        if (offset > lo) continue;
        if (offset + static_cast(slot.second.size) <= hi) continue;
        if (size && !(size & slot_sizes[slot.first / 32U])) continue;
        if (len) *len = slot.second.size;
        auto bus_pos = slot.first + (lo - offset) / 8;
        if (pos >= 0 && bus_pos != pos) continue;
        return bus_pos;
    }
    return -1;
}

// Stage-wide search: look for 'src' on the action bus of every table in the stage.
int ActionBus::find(Stage *stage, ActionBusSource src, int lo, int hi, int size, int *len) {
    int rv = -1;
    for (auto tbl : stage->tables)
        if (tbl->action_bus && (rv = tbl->action_bus->find(src, lo, hi, size, -1, len)) >= 0)
            return rv;
    return rv;
}

// Program the action_hv_xbar registers on 'home_row' for this table's action data (or, for
// a meter/stateful's home row, just its table-output bytes), covering 128-bit slice
// 'action_slice' of the wide action data.
// NOTE(review): the template parameter list (<class REGS>) was stripped in extraction.
template
void ActionBus::write_action_regs(REGS &regs, Table *tbl, int home_row, unsigned action_slice) {
    LOG2("--- ActionBus write_action_regs(" << tbl->name() << ", " << home_row << ", "
                                            << action_slice << ")");
    // NOTE(review): dynamic_cast target type stripped in extraction -- confirm upstream.
    bool is_action_data = dynamic_cast(tbl) != nullptr;
    auto &action_hv_xbar = regs.rams.array.row[home_row / 2].action_hv_xbar;
    unsigned side = home_row % 2; /* 0 == left, 1 == right */
    for (auto &el : by_byte) {
        if (!is_action_data && !el.second.is_table_output()) {
            // Nasty hack -- meter/stateful output uses the action bus on the meter row,
            // so we need this routine to set it up, but we only want to do it for the
            // meter bus output; the rest of this ActionBus is for immediate data (set
            // up by write_immed_regs below)
            continue;
        }
        LOG5(" " << el.first << ": " << el.second);
        unsigned byte = el.first;
        BUG_CHECK(byte == el.second.byte);
        unsigned slot = Stage::action_bus_slot_map[byte];
        unsigned bit = 0, size = 0;
        std::string srcname;
        for (auto &data : el.second.data) {
            // FIXME -- this loop feels like a hack -- the size SHOULD already be set in
            // el.second.size (the max of the sizes of everything in the data we're looping
            // over), so should not need recomputing. We do need to figure out the source
            // bit location, and ignore things in other wide words, but that should be stored
            // in the Slot object? What about wired-ors, writing two inputs to the same
            // slot -- it is possible but is it useful?
            unsigned data_bit = 0, data_size = 0;
            if (data.first.type == ActionBusSource::Field) {
                auto f = data.first.field;
                if ((f->bit(data.second) >> 7) != action_slice) continue;  // other 128b slice
                data_bit = f->bit(data.second) & 0x7f;
                data_size = std::min(el.second.size, f->size - data.second);
                srcname = "field " + tbl->find_field(f);
            } else if (data.first.type == ActionBusSource::TableOutput) {
                if (data.first.table->home_row() != home_row) {
                    // skip tables not on this home row
                    continue;
                }
                data_bit = data.second;
                data_size = el.second.size;
                srcname = "table " + data.first.table->name_;
            } else {
                // HashDist and RandomGen only work in write_immed_regs
                BUG();
            }
            LOG3(" byte " << byte << " (slot " << slot << "): " << srcname << " (" << data.second
                          << ".." << (data.second + data_size - 1) << ")" << " [" << data_bit
                          << ".." << (data_bit + data_size - 1) << "]");
            if (size) {
                BUG_CHECK(bit == data_bit);  // checked in pass1; maintained by pass3
                size = std::max(size, data_size);
            } else {
                bit = data_bit;
                size = data_size;
            }
        }
        if (size == 0) continue;
        if (bit + size > 128) {
            error(lineno,
                  "Action bus setup can't deal with field %s split across "
                  "SRAM rows",
                  el.second.name.c_str());
            continue;
        }
        unsigned bytemask = (1U << ((size + 7) / 8U)) - 1;
        // Program the mux control for each hardware slot covered, per slot width.
        switch (Stage::action_bus_slot_size[slot]) {
            case 8:
                for (unsigned sbyte = bit / 8; sbyte <= (bit + size - 1) / 8;
                     sbyte++, byte++, slot++) {
                    unsigned code = 0, mask = 0;
                    // 'code' selects which byte-mux group; 'mask' its alignment constraint.
                    switch (sbyte >> 2) {
                        case 0:
                            code = sbyte >> 1;
                            mask = 1;
                            break;
                        case 1:
                            code = 2;
                            mask = 3;
                            break;
                        case 2:
                        case 3:
                            code = 3;
                            mask = 7;
                            break;
                        default:
                            BUG();
                    }
                    if ((sbyte ^ byte) & mask) {
                        error(lineno, "Can't put field %s into byte %d on action xbar",
                              el.second.name.c_str(), byte);
                        break;
                    }
                    auto &ctl = action_hv_xbar.action_hv_ixbar_ctl_byte[side];
                    switch (code) {
                        case 0:
                            ctl.action_hv_ixbar_ctl_byte_1to0_ctl = slot / 2;
                            ctl.action_hv_ixbar_ctl_byte_1to0_enable = 1;
                            break;
                        case 1:
                            ctl.action_hv_ixbar_ctl_byte_3to2_ctl = slot / 2;
                            ctl.action_hv_ixbar_ctl_byte_3to2_enable = 1;
                            break;
                        case 2:
                            ctl.action_hv_ixbar_ctl_byte_7to4_ctl = slot / 4;
                            ctl.action_hv_ixbar_ctl_byte_7to4_enable = 1;
                            break;
                        case 3:
                            ctl.action_hv_ixbar_ctl_byte_15to8_ctl = slot / 8;
                            ctl.action_hv_ixbar_ctl_byte_15to8_enable = 1;
                            break;
                    }
                    if (!(bytemask & 1))
                        LOG1("WARNING: " << SrcInfo(lineno) << ": putting " << el.second.name
                                         << " on action bus byte " << byte
                                         << " even though bit in bytemask is "
                                            "not set");
                    action_hv_xbar.action_hv_ixbar_input_bytemask[side] |= 1 << sbyte;
                    bytemask >>= 1;
                }
                break;
            case 16:
                byte &= ~1;
                slot -= ACTION_DATA_8B_SLOTS;  // renumber into the halfword slot space
                bytemask <<= ((bit / 8) & 1);
                for (unsigned word = bit / 16; word <= (bit + size - 1) / 16;
                     word++, byte += 2, slot++) {
                    unsigned code = 0, mask = 0;
                    switch (word >> 1) {
                        case 0:
                            code = 1;
                            mask = 3;
                            break;
                        case 1:
                            code = 2;
                            mask = 3;
                            break;
                        case 2:
                        case 3:
                            code = 3;
                            mask = 7;
                            break;
                        default:
                            BUG();
                    }
                    if (((word << 1) ^ byte) & mask) {
                        error(lineno, "Can't put field %s into byte %d on action xbar",
                              el.second.name.c_str(), byte);
                        break;
                    }
                    auto &ctl = action_hv_xbar.action_hv_ixbar_ctl_halfword[slot / 8][side];
                    unsigned subslot = slot % 8U;
                    switch (code) {
                        case 1:
                            ctl.action_hv_ixbar_ctl_halfword_3to0_ctl = subslot / 2;
                            ctl.action_hv_ixbar_ctl_halfword_3to0_enable = 1;
                            break;
                        case 2:
                            ctl.action_hv_ixbar_ctl_halfword_7to4_ctl = subslot / 2;
                            ctl.action_hv_ixbar_ctl_halfword_7to4_enable = 1;
                            break;
                        case 3:
                            ctl.action_hv_ixbar_ctl_halfword_15to8_ctl = subslot / 4;
                            ctl.action_hv_ixbar_ctl_halfword_15to8_enable = 1;
                            break;
                    }
                    action_hv_xbar.action_hv_ixbar_input_bytemask[side] |= (bytemask & 3)
                                                                           << (word * 2);
                    bytemask >>= 2;
                }
                break;
            case 32: {
                byte &= ~3;
                slot -= ACTION_DATA_8B_SLOTS + ACTION_DATA_16B_SLOTS;  // word slot space
                unsigned word = bit / 32;
                unsigned code = 1 + word / 2;
                bit %= 32;
                bytemask <<= bit / 8;
                if (((word << 2) ^ byte) & 7) {
                    error(lineno, "Can't put field %s into byte %d on action xbar",
                          el.second.name.c_str(), byte);
                    break;
                }
                auto &ctl = action_hv_xbar.action_hv_ixbar_ctl_word[slot / 4][side];
                slot %= 4U;
                switch (code) {
                    case 1:
                        ctl.action_hv_ixbar_ctl_word_7to0_ctl = slot / 2;
                        ctl.action_hv_ixbar_ctl_word_7to0_enable = 1;
                        break;
                    case 2:
                        ctl.action_hv_ixbar_ctl_word_15to8_ctl = slot / 2;
                        ctl.action_hv_ixbar_ctl_word_15to8_enable = 1;
                        break;
                }
                action_hv_xbar.action_hv_ixbar_input_bytemask[side] |= (bytemask & 15)
                                                                       << (word * 4);
                bytemask >>= 4;
                break;
            }
            default:
                BUG();
        }
        if (bytemask)
            LOG1("WARNING: " << SrcInfo(lineno) << ": excess bits " << hex(bytemask)
                             << " set in bytemask for " << el.second.name);
    }
}
FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void ActionBus::write_action_regs, mau_regs &,
                      Table *, int, unsigned)

// Program the address-distribution registers that route this table's immediate data (and
// random-number-generator data) onto the action bus.
// NOTE(review): the template parameter list (<class REGS>) was stripped in extraction.
template
void ActionBus::write_immed_regs(REGS &regs, Table *tbl) {
    LOG2("--- ActionBus write_immed_regs(" << tbl->name() << ")");
    auto &adrdist = regs.rams.match.adrdist;
    int tid = tbl->logical_id;
    unsigned rngmask = 0;
    for (auto &f : by_byte) {
        if (f.second.is_table_output()) continue;  // handled by write_action_regs
        LOG5(" " << f.first << ": " << f.second);
        int slot = Stage::action_bus_slot_map[f.first];
        unsigned off = 0;
        unsigned size = f.second.size;
        if (!f.second.data.empty()) {
            off = f.second.data.begin()->second;
            if (f.second.data.begin()->first.type == ActionBusSource::Field)
                off -= f.second.data.begin()->first.field->immed_bit(0);
            // Accumulate the RNG unit and byte mask for any random-gen sources.
            for (auto &d : f.second.data) {
                if (d.first.type == ActionBusSource::RandomGen) {
                    rngmask |= d.first.rng.unit << 4;
                    rngmask |= ((1 << (size / 8)) - 1) << d.second / 8;
                }
            }
        }
        switch (Stage::action_bus_slot_size[slot]) {
            case 8:
                for (unsigned b = off / 8; b <= (off + size - 1) / 8; b++) {
                    BUG_CHECK((b & 3) == (slot & 3));
                    adrdist.immediate_data_8b_enable[tid / 8] |= 1U << ((tid & 7) * 4 + b);
                    // we write these ctl regs twice if we use both bytes in a pair. That will
                    // cause a WARNING in the log file if both uses are the same -- it should be
                    // impossible to get an ERROR for conflicting uses, as that should have caused
                    // an error in pass1 above, and never made it to this point.
                    setup_muxctl(adrdist.immediate_data_8b_ixbar_ctl[tid * 2 + b / 2], slot++ / 4);
                }
                break;
            case 16:
                slot -= ACTION_DATA_8B_SLOTS;
                for (unsigned w = off / 16; w <= (off + size - 1) / 16; w++) {
                    BUG_CHECK((w & 1) == (slot & 1));
                    setup_muxctl(adrdist.immediate_data_16b_ixbar_ctl[tid * 2 + w], slot++ / 2);
                }
                break;
            case 32:
                slot -= ACTION_DATA_8B_SLOTS + ACTION_DATA_16B_SLOTS;
                setup_muxctl(adrdist.immediate_data_32b_ixbar_ctl[tid], slot);
                break;
            default:
                BUG();
        }
    }
    if (rngmask) {
        regs.rams.match.adrdist.immediate_data_rng_enable = 1;
        regs.rams.match.adrdist.immediate_data_rng_logical_map_ctl[tbl->logical_id / 4]
            .set_subfield(rngmask, 5 * (tbl->logical_id % 4U), 5);
    }
}
FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void ActionBus::write_immed_regs, mau_regs &,
                      Table *)

// Short name of this source for slot naming: the field name, the owning table's name, or
// the referenced name; empty for other source kinds.
std::string ActionBusSource::name(Table *tbl) const {
    switch (type) {
        case Field:
            return tbl->find_field(field);
        case TableOutput:
        case TableColor:
        case TableAddress:
            return table->name();
        case NameRef:
        case ColorRef:
        case AddressRef:
            return name_ref->name;
        default:
            return "";
    }
}

// Human-readable description of this source for logging/diagnostics.
std::string ActionBusSource::toString(Table *tbl) const {
    std::stringstream tmp;
    switch (type) {
        case None:
            // NOTE(review): this literal (and the default case below) may have lost
            // angle-bracketed text (e.g. "<none>") in extraction -- confirm upstream.
            return "";
        case Field:
            return tbl->find_field(field);
        case HashDist:
            tmp << "hash_dist " << hd->id;
            return tmp.str();
        case RandomGen:
            tmp << "rng " << rng.unit;
            return tmp.str();
        case TableOutput:
            return table->name();
        case TableColor:
            return table->name_ + " color";
        case TableAddress:
            return table->name_ + " address";
        case Ealu:
            return "ealu";
        case XcmpData:
            tmp << "xcmp(" << xcmp_data.xcmp_group << ":" << xcmp_data.xcmp_byte << ")";
            return tmp.str();
        case NameRef:
        case ColorRef:
        case AddressRef:
            tmp << "name ";
            if (name_ref)
                tmp << name_ref->name;
            else
                tmp << "(meter)";  // null name_ref denotes the attached meter bus
            if (type == ColorRef) tmp << " color";
            if (type == AddressRef) tmp << " address";
            return tmp.str();
        default:
            tmp << "";
            return tmp.str();
    }
}

// Print the " color"/" address" suffix corresponding to a table-output modifier.
std::ostream &operator<<(std::ostream &out, TableOutputModifier mod) {
    switch (mod) {
        case TableOutputModifier::Color:
            out << " color";
            break;
        case TableOutputModifier::Address:
            out << " address";
            break;
        default:
            break;
    }
    return out;
}

// Debug print of one slot: name/byte/size plus every (source, offset) it carries.
std::ostream &operator<<(std::ostream &out, const ActionBus::Slot &sl) {
    out << sl.name << " byte=" << sl.byte << " size=" << sl.size;
    for (auto &d : sl.data) out << "\n\t" << d.first << ": " << d.second;
    return out;
}

// Debug print of the whole ActionBus: slots, pending placements, and usage bitmaps.
std::ostream &operator<<(std::ostream &out, const ActionBus &a) {
    for (auto &slot : a.by_byte) out << slot.first << ": " << slot.second << std::endl;
    for (auto &np : a.need_place) {
        out << np.first << " {";
        const char *sep = " ";
        for (auto &el : np.second) {
            out << sep << el.first << ":" << el.second;
            sep = ", ";
        }
        out << (sep + 1) << "}" << std::endl;  // sep+1 skips the leading space
    }
    out << "byte_use: " << a.byte_use << std::endl;
    for (auto &hvslice : a.action_hv_slice_use) {
        for (auto v : hvslice) out << " " << hex(v, 4, '0');
        out << std::endl;
    }
    return out;
}

// Debugger hook: dump an ActionBus to stdout (see the .gdbinit 'd' command).
void dump(const ActionBus *a) { std::cout << *a; }
diff --git a/backends/tofino/bf-asm/action_bus.h b/backends/tofino/bf-asm/action_bus.h
new file mode 100644
index 00000000000..16b86e43fe6
--- /dev/null
+++ b/backends/tofino/bf-asm/action_bus.h
/**
 * Copyright (C) 2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 *
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#ifndef BACKENDS_TOFINO_BF_ASM_ACTION_BUS_H_
#define BACKENDS_TOFINO_BF_ASM_ACTION_BUS_H_

// NOTE(review): the header name of the include below (an angle-bracketed system header)
// was lost in extraction -- confirm against the upstream file.
#include

#include "backends/tofino/bf-asm/tables.h"

// static struct MeterBus_t {} MeterBus;
// Tag type used to construct an ActionBusSource referring to the (sole) attached meter bus.
struct MeterBus_t {};

// A tagged-union descriptor of where a piece of action-bus data comes from: an action
// format field, hash distribution unit(s), a random number generator, another table's
// output/color/address, exact-match compare data, or an as-yet-unresolved name reference
// (resolved to a concrete source in ActionBus::pass1).
struct ActionBusSource {
    enum {
        None,
        Field,
        HashDist,
        HashDistPair,
        RandomGen,
        TableOutput,
        TableColor,
        TableAddress,
        Ealu,
        XcmpData,
        NameRef,
        ColorRef,
        AddressRef
    } type;
    union {
        Table::Format::Field *field;
        HashDistribution *hd;
        struct {
            HashDistribution *hd1, *hd2;
        } hd_tuple;
        Table *table;
        Table::Ref *name_ref;
        RandomNumberGen rng;
        struct {
            short xcmp_group, xcmp_byte;
        } xcmp_data;
    };
    ActionBusSource() : type(None) { field = nullptr; }
    ActionBusSource(Table::Format::Field *f) : type(Field) {  // NOLINT(runtime/explicit)
        field = f;
    }
    ActionBusSource(HashDistribution *h) : type(HashDist) { hd = h; }  // NOLINT(runtime/explicit)
    ActionBusSource(HashDistribution *h1, HashDistribution *h2) : type(HashDistPair) {
        hd_tuple.hd1 = h1;
        hd_tuple.hd2 = h2;
    }
    // Table output source; the modifier selects output vs color vs address.
    ActionBusSource(Table *t,
                    TableOutputModifier m = TableOutputModifier::NONE)  // NOLINT(runtime/explicit)
        : type(TableOutput) {
        switch (m) {
            case TableOutputModifier::Color:
                type = TableColor;
                break;
            case TableOutputModifier::Address:
                type = TableAddress;
                break;
            default:
                break;
        }
        table = t;
    }
    // Unresolved by-name reference (resolved in ActionBus::pass1).
    ActionBusSource(Table::Ref *t,
                    TableOutputModifier m = TableOutputModifier::NONE)  // NOLINT(runtime/explicit)
        : type(NameRef) {
        switch (m) {
            case TableOutputModifier::Color:
                type = ColorRef;
                break;
            case TableOutputModifier::Address:
                type = AddressRef;
                break;
            default:
                break;
        }
        name_ref = t;
    }
    // Meter-bus reference: a null name_ref denotes "the single attached meter table".
    ActionBusSource(MeterBus_t,
                    TableOutputModifier m = TableOutputModifier::NONE)  // NOLINT(runtime/explicit)
        : type(NameRef) {
        switch (m) {
            case TableOutputModifier::Color:
                type = ColorRef;
                break;
            case TableOutputModifier::Address:
                type = AddressRef;
                break;
            default:
                break;
        }
        name_ref = nullptr;
    }
    ActionBusSource(RandomNumberGen r) : type(RandomGen) {  // NOLINT(runtime/explicit)
        field = nullptr;
        rng = r;
    }
    ActionBusSource(InputXbar::Group grp, int byte) : type(XcmpData) {
        BUG_CHECK(grp.type == InputXbar::Group::XCMP, "Not xcmp ixbar");
        field = nullptr;
        xcmp_data.xcmp_group = grp.index;
        xcmp_data.xcmp_byte = byte;
    }
    // Equality compares the union via the pointer member (all pointer alternatives alias),
    // with dedicated handling for the multi-member XcmpData/HashDistPair alternatives.
    bool operator==(const ActionBusSource &a) const {
        if (type == XcmpData)
            return a.type == XcmpData && xcmp_data.xcmp_group == a.xcmp_data.xcmp_group &&
                   xcmp_data.xcmp_byte == a.xcmp_data.xcmp_byte;
        if (type == HashDistPair && hd_tuple.hd2 != a.hd_tuple.hd2) return false;
        return type == a.type && field == a.field;
    }
    // Strict weak ordering for use as an ordered_map key: by type, then by union contents.
    bool operator<(const ActionBusSource &a) const {
        if (type != a.type) return type < a.type;
        switch (type) {
            case HashDistPair:
                return hd_tuple.hd1 == a.hd_tuple.hd1 ? hd_tuple.hd2 < a.hd_tuple.hd2
                                                      : hd_tuple.hd1 < a.hd_tuple.hd1;
            case XcmpData:
                return xcmp_data.xcmp_group == a.xcmp_data.xcmp_group
                           ? xcmp_data.xcmp_byte < a.xcmp_data.xcmp_byte
                           : xcmp_data.xcmp_group < a.xcmp_data.xcmp_group;
            default:
                return field < a.field;
        }
    }
    std::string name(Table *tbl) const;
    std::string toString(Table *tbl) const;
    friend std::ostream &operator<<(std::ostream &, const ActionBusSource &);
};

// Per-table model of the action data (output) bus: which bus bytes carry which sources,
// plus the bookkeeping needed to allocate new data onto the bus.
// NOTE(review): this class definition continues beyond the end of this chunk; several
// ordered_map/std::vector member declarations below lost their template arguments in
// extraction -- confirm against the upstream file.
class ActionBus {
 protected:
    // Check two ActionBusSource refs to ensure that they are compatible (can be at the same
    // location on the aciton bus -- basically the same data)
    static bool compatible(const ActionBusSource &a, unsigned a_off, const ActionBusSource &b,
                           unsigned b_off);
    struct Slot {
        std::string name;
        unsigned byte, size;  // size in bits
        ordered_map data;
        // offset in the specified source is in this slot -- corresponding bytes for different
        // action data formats will go into the same slot.
        Slot(std::string n, unsigned b, unsigned s) : name(n), byte(b), size(s) {}
        Slot(std::string n, unsigned b, unsigned s, ActionBusSource src, unsigned off)
            : name(n), byte(b), size(s) {
            data.emplace(src, off);
        }
        unsigned lo(Table *tbl) const;  // low bit on the action data bus
        bool is_table_output() const {
            for (auto &d : data) {
                BUG_CHECK(d.first.type != ActionBusSource::NameRef);
                if (d.first.type == ActionBusSource::TableOutput) return true;
            }
            return false;
        }
    };
    friend std::ostream &operator<<(std::ostream &, const Slot &);
    friend std::ostream &operator<<(std::ostream &, const ActionBus &);
    ordered_map by_byte;
    ordered_map> need_place;
    // bytes from the given sources are needed on the action bus -- the pairs in the map
    // are (offset,use) where offset is offset in bits, and use is a bitset of the needed
    // uses (bit index == log2 of the access size in bytes)

    std::vector> action_hv_slice_use;
    // which bytes of input to the ixbar are used in each action_hv_xbar slice, for each
    // 128-bit slice of the action bus.
    bitvec byte_use;  // bytes on the action data (input) bus or immediate bus in use
                      // for wide action tables, this may be >16 bytes...
+ + void setup_slot(int lineno, Table *tbl, const char *name, unsigned idx, ActionBusSource src, + unsigned sz, unsigned off); + + int find_free(Table *tbl, unsigned min, unsigned max, unsigned step, unsigned lobyte, + unsigned bytes); + int find_merge(Table *tbl, int offset, int bytes, int use); + bool check_atcam_sharing(Table *tbl1, Table *tbl2); + bool check_slot_sharing(ActionBus::Slot &slot, bitvec &action_bus); + + ActionBus() : lineno(-1) {} + ActionBus(Table *, VECTOR(pair_t) &); + + public: + int lineno; + static std::unique_ptr create(); + static std::unique_ptr create(Table *, VECTOR(pair_t) &); + + void pass1(Table *tbl); + void pass2(Table *tbl) {} + void pass3(Table *tbl); + template + void write_immed_regs(REGS ®s, Table *tbl); + template + void write_action_regs(REGS ®s, Table *tbl, int homerow, unsigned action_slice); + + void do_alloc(Table *tbl, ActionBusSource src, unsigned use, int lobyte, int bytes, + unsigned offset); + static const unsigned size_masks[8]; + virtual void alloc_field(Table *, ActionBusSource src, unsigned offset, unsigned sizes_needed); + void need_alloc(Table *tbl, const ActionBusSource &src, unsigned lo, unsigned hi, + unsigned size); + void need_alloc(Table *tbl, Table *attached, TableOutputModifier mod, unsigned lo, unsigned hi, + unsigned size) { + need_alloc(tbl, ActionBusSource(attached, mod), lo, hi, size); + } + + int find(const char *name, TableOutputModifier mod, int lo, int hi, int size, int *len = 0); + int find(const char *name, int lo, int hi, int size, int *len = 0) { + return find(name, TableOutputModifier::NONE, lo, hi, size, len); + } + int find(const std::string &name, TableOutputModifier mod, int lo, int hi, int size, + int *len = 0) { + return find(name.c_str(), mod, lo, hi, size, len); + } + int find(const std::string &name, int lo, int hi, int size, int *len = 0) { + return find(name.c_str(), lo, hi, size, len); + } + int find(const ActionBusSource &src, int lo, int hi, int size, int pos = -1, int *len 
= 0); + int find(Table *attached, TableOutputModifier mod, int lo, int hi, int size, int *len = 0) { + return find(ActionBusSource(attached, mod), lo, hi, size, -1, len); + } + static int find(Stage *stage, ActionBusSource src, int lo, int hi, int size, int *len = 0); + unsigned size() { + unsigned rv = 0; + for (auto &slot : by_byte) rv += slot.second.size; + return rv; + } + auto slots() const { return Values(by_byte); } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_ACTION_BUS_H_ */ diff --git a/backends/tofino/bf-asm/action_table.cpp b/backends/tofino/bf-asm/action_table.cpp new file mode 100644 index 00000000000..7e24acee618 --- /dev/null +++ b/backends/tofino/bf-asm/action_table.cpp @@ -0,0 +1,794 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "action_bus.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" + +// template specialization declarations +#include "tofino/action_table.h" + +/// See 6.2.8.4.3 of the MAU Micro-Architecture document. 
+const unsigned MAX_AD_SHIFT = 5U; + +std::string ActionTable::find_field(Table::Format::Field *field) { + for (auto &af : action_formats) { + auto name = af.second->find_field(field); + if (!name.empty() && name[0] != '<') return af.first + ":" + name; + } + return Table::find_field(field); +} + +int ActionTable::find_field_lineno(Table::Format::Field *field) { + int rv = -1; + for (auto &af : action_formats) + if ((rv = af.second->find_field_lineno(field)) >= 0) return rv; + return Table::find_field_lineno(field); +} + +Table::Format::Field *ActionTable::lookup_field(const std::string &name, + const std::string &action) const { + if (action == "*" || action == "") { + if (auto *rv = format ? format->field(name) : 0) return rv; + if (action == "*") + for (auto &fmt : action_formats) + if (auto *rv = fmt.second->field(name)) return rv; + } else { + if (action_formats.count(action)) { + if (auto *rv = action_formats.at(action)->field(name)) return rv; + } else if (auto *rv = format ? format->field(name) : 0) { + return rv; + } + } + for (auto *match_table : match_tables) { + BUG_CHECK((Table *)match_table != (Table *)this); + if (auto *rv = match_table->lookup_field(name)) return rv; + } + return 0; +} +void ActionTable::pad_format_fields() { + format->size = get_size(); + format->log2size = get_log2size(); + for (auto &fmt : action_formats) { + if (fmt.second->size < format->size) { + fmt.second->size = format->size; + fmt.second->log2size = format->log2size; + } + } +} + +void ActionTable::apply_to_field(const std::string &n, std::function fn) { + for (auto &fmt : action_formats) fmt.second->apply_to_field(n, fn); + if (format) format->apply_to_field(n, fn); +} +int ActionTable::find_on_actionbus(const ActionBusSource &src, int lo, int hi, int size, int pos) { + int rv; + if (action_bus && (rv = action_bus->find(src, lo, hi, size, pos)) >= 0) return rv; + for (auto *match_table : match_tables) { + BUG_CHECK((Table *)match_table != (Table *)this); + if ((rv = 
match_table->find_on_actionbus(src, lo, hi, size, pos)) >= 0) return rv; + } + return -1; +} + +int ActionTable::find_on_actionbus(const char *name, TableOutputModifier mod, int lo, int hi, + int size, int *len) { + int rv; + if (action_bus && (rv = action_bus->find(name, mod, lo, hi, size, len)) >= 0) return rv; + for (auto *match_table : match_tables) { + BUG_CHECK((Table *)match_table != (Table *)this); + if ((rv = match_table->find_on_actionbus(name, mod, lo, hi, size, len)) >= 0) return rv; + } + return -1; +} + +void ActionTable::need_on_actionbus(const ActionBusSource &src, int lo, int hi, int size) { + if (src.type == ActionBusSource::Field) { + auto f = src.field; + if (f->fmt == format.get()) { + Table::need_on_actionbus(src, lo, hi, size); + return; + } + for (auto &af : Values(action_formats)) { + if (f->fmt == af.get()) { + Table::need_on_actionbus(f, lo, hi, size); + return; + } + } + for (auto *match_table : match_tables) { + BUG_CHECK((Table *)match_table != (Table *)this); + if (f->fmt == match_table->get_format()) { + match_table->need_on_actionbus(f, lo, hi, size); + return; + } + } + BUG_CHECK(!"Can't find table associated with field"); + // TBD - Add allocation for ActionBusSource::HashDistPair. Compiler does + // action bus allocation so this path is never used. 
+ } else if (src.type == ActionBusSource::HashDist) { + auto hd = src.hd; + for (auto &hash_dist : this->hash_dist) { + if (&hash_dist == hd) { + Table::need_on_actionbus(hd, lo, hi, size); + return; + } + } + for (auto *match_table : match_tables) { + if (match_table->find_hash_dist(hd->id) == hd) { + match_table->need_on_actionbus(hd, lo, hi, size); + return; + } + } + BUG_CHECK(!"Can't find table associated with hash_dist"); + } else if (src.type == ActionBusSource::RandomGen) { + auto rng = src.rng; + int attached_count = 0; + for (auto *match_table : match_tables) { + match_table->need_on_actionbus(rng, lo, hi, size); + ++attached_count; + } + if (attached_count > 1) { + error(-1, + "Assembler cannot allocate action bus space for rng %d as it " + "used by mulitple tables", + rng.unit); + } + } else { + error(-1, "Assembler cannot allocate action bus space for %s", src.toString(this).c_str()); + } +} + +void ActionTable::need_on_actionbus(Table *att, TableOutputModifier mod, int lo, int hi, int size) { + int attached_count = 0; + for (auto *match_table : match_tables) { + if (match_table->is_attached(att)) { + match_table->need_on_actionbus(att, mod, lo, hi, size); + ++attached_count; + } + } + if (attached_count > 1) { + error(att->lineno, + "Assembler cannot allocate action bus space for table %s as it " + "used by mulitple tables", + att->name()); + } +} + +/** + * Necessary for determining the actiondata_adr_exact/tcam_shiftcount register value. 
+ */ +unsigned ActionTable::determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) const { + int lo_huffman_bits = + std::min(get_log2size() - 2, static_cast(ACTION_ADDRESS_ZERO_PAD)); + int extra_shift = ACTION_ADDRESS_ZERO_PAD - lo_huffman_bits; + if (call.args[0] == "$DIRECT") { + return 64 + extra_shift + tcam_shift; + } else if (call.args[0].field()) { + BUG_CHECK(call.args[0].field()->by_group[group]->bit(0) / 128U == word); + return call.args[0].field()->by_group[group]->bit(0) % 128U + extra_shift; + } else if (call.args[1].field()) { + return call.args[1].field()->bit(0) + ACTION_ADDRESS_ZERO_PAD; + } + return 0; +} + +/** + * Calculates the actiondata_adr_default value. Will default in the required huffman bits + * described in section 6.2.8.4.3 Action RAM Addressing of the uArch, as well as the + * per flow enable bit if indicated + */ +unsigned ActionTable::determine_default(Table::Call &call) const { + int huffman_ones = std::max(static_cast(get_log2size()) - 3, 0); + BUG_CHECK(huffman_ones <= ACTION_DATA_HUFFMAN_BITS); + unsigned huffman_mask = (1 << huffman_ones) - 1; + // lower_huffman_mask == 0x1f, upper_huffman_mask = 0x60 + unsigned lower_huffman_mask = (1U << ACTION_DATA_LOWER_HUFFMAN_BITS) - 1; + unsigned upper_huffman_mask = ((1U << ACTION_DATA_HUFFMAN_BITS) - 1) & ~lower_huffman_mask; + unsigned rv = (huffman_mask & upper_huffman_mask) << ACTION_DATA_HUFFMAN_DIFFERENCE; + rv |= huffman_mask & lower_huffman_mask; + if (call.args[1].name() && call.args[1] == "$DEFAULT") { + rv |= 1 << ACTION_DATA_PER_FLOW_ENABLE_START_BIT; + } + return rv; +} + +/** + * Calculates the actiondata_adr_mask value for a given table. 
+ */ +unsigned ActionTable::determine_mask(Table::Call &call) const { + int lo_huffman_bits = + std::min(get_log2size() - 2, static_cast(ACTION_DATA_LOWER_HUFFMAN_BITS)); + unsigned rv = 0; + if (call.args[0] == "$DIRECT") { + rv |= ((1U << ACTION_ADDRESS_BITS) - 1) & (~0U << lo_huffman_bits); + } else if (call.args[0].field()) { + rv = ((1U << call.args[0].size()) - 1) << lo_huffman_bits; + } + return rv; +} + +/** + * Calculates the actiondata_adr_vpn_shiftcount register. As described in section 6.2.8.4.3 + * for action data tables sized at 256, 512 and 1024, the Huffman bits for these addresses are + * no longer at the bottom of the address, but rather near the top. For direct action data + * addresses, a hole in the address needs to be created. + */ +unsigned ActionTable::determine_vpn_shiftcount(Table::Call &call) const { + if (call.args[0].name() && call.args[0] == "$DIRECT") { + return std::max(0, static_cast(get_log2size()) - 2 - ACTION_DATA_LOWER_HUFFMAN_BITS); + } + return 0; +} + +int ActionTable::get_start_vpn() { + // Based on the format width, the starting vpn is determined as follows (See + // Section 6.2.8.4.3 in MAU MicroArchitecture Doc) + // WIDTH LOG2SIZE START_VPN + // <= 128 bits - 7 - 0 + // = 256 bits - 8 - 0 + // = 512 bits - 9 - 1 + // = 1024 bits - 10 - 3 + int size = get_log2size(); + if (size <= 8) return 0; + if (size == 9) return 1; + if (size == 10) return 3; + return 0; +} + +void ActionTable::vpn_params(int &width, int &depth, int &period, const char *&period_name) const { + width = 1; + depth = layout_size(); + period = format ? 1 << std::max(static_cast(format->log2size) - 7, 0) : 0; + // Based on the format width, the vpn are numbered as follows (See Section + // 6.2.8.4.3 in MAU MicroArchitecture Doc) + // WIDTH PERIOD VPN'S + // <= 128 bits - +1 - 0, 1, 2, 3, ... + // = 256 bits - +2 - 2, 4, 6, 8, ... + // = 512 bits - +4 - 1, 5, 9, 13, ... + // = 1024 bits - +8 - 3, 11, 19, 27, ... 
+ for (auto &fmt : Values(action_formats)) + period = std::max(period, 1 << std::max(static_cast(fmt->log2size) - 7, 0)); + period_name = "action data width"; +} + +void ActionTable::setup(VECTOR(pair_t) & data) { + action_id = -1; + setup_layout(layout, data); + for (auto &kv : MapIterChecked(data, true)) { + if (kv.key == "format") { + const char *action = nullptr; + if (kv.key.type == tCMD) { + if (!PCHECKTYPE(kv.key.vec.size > 1, kv.key[1], tSTR)) continue; + if (action_formats.count((action = kv.key[1].s))) { + error(kv.key.lineno, "Multiple formats for action %s", kv.key[1].s); + continue; + } + } + if (CHECKTYPEPM(kv.value, tMAP, kv.value.map.size > 0, "non-empty map")) { + auto *fmt = new Format(this, kv.value.map, true); + if (fmt->size < 8) { // pad out to minimum size + fmt->size = 8; + fmt->log2size = 3; + } + if (action) + action_formats[action].reset(fmt); + else + format.reset(fmt); + } + } + } + if (!format && action_formats.empty()) error(lineno, "No format in action table %s", name()); + for (auto &kv : MapIterChecked(data, true)) { + if (kv.key == "format") { + /* done above to be done before action_bus and vpns */ + } else if (kv.key.type == tCMD && kv.key[0] == "format") { + /* done above to be done before action_bus */ + } else if (kv.key == "actions") { + if (CHECKTYPE(kv.value, tMAP)) actions.reset(new Actions(this, kv.value.map)); + } else if (kv.key == "action_bus") { + if (CHECKTYPE(kv.value, tMAP)) action_bus = ActionBus::create(this, kv.value.map); + } else if (kv.key == "action_id") { + if (CHECKTYPE(kv.value, tINT)) action_id = kv.value.i; + } else if (kv.key == "vpns") { + if (kv.value == "null") + no_vpns = true; + else if (CHECKTYPE(kv.value, tVEC)) + setup_vpns(layout, &kv.value.vec); + } else if (kv.key == "home_row") { + home_lineno = kv.value.lineno; + // Builds the map of home rows possible per word, as different words of the + // action row is on different home rows + if (CHECKTYPE2(kv.value, tINT, tVEC)) { + int word = 0; + 
if (kv.value.type == tINT) { + if (kv.value.i >= 0 || kv.value.i < LOGICAL_SRAM_ROWS) + home_rows_per_word[word].setbit(kv.value.i); + else + error(kv.value.lineno, "Invalid home row %" PRId64 "", kv.value.i); + } else { + for (auto &v : kv.value.vec) { + if (CHECKTYPE2(v, tINT, tVEC)) { + if (v.type == tINT) { + if (v.i >= 0 || v.i < LOGICAL_SRAM_ROWS) + home_rows_per_word[word].setbit(v.i); + else + error(v.lineno, "Invalid home row %" PRId64 "", v.i); + } else if (v.type == tVEC) { + for (auto &v2 : v.vec) { + if (CHECKTYPE(v2, tINT)) { + if (v2.i >= 0 || v2.i < LOGICAL_SRAM_ROWS) + home_rows_per_word[word].setbit(v2.i); + else + error(v.lineno, "Invalid home row %" PRId64 "", v2.i); + } + } + } + } + word++; + } + } + } + } else if (kv.key == "p4") { + if (CHECKTYPE(kv.value, tMAP)) + p4_table = P4Table::get(P4Table::ActionData, kv.value.map); + } else if (kv.key == "context_json") { + setup_context_json(kv.value); + } else if (kv.key == "row" || kv.key == "logical_row" || kv.key == "column" || + kv.key == "word") { + /* already done in setup_layout */ + } else if (kv.key == "logical_bus") { + if (CHECKTYPE2(kv.value, tSTR, tVEC)) { + if (kv.value.type == tSTR) { + if (*kv.value.s != 'A' && *kv.value.s != 'O' && *kv.value.s != 'S') + error(kv.value.lineno, "Invalid logical bus %s", kv.value.s); + } else { + for (auto &v : kv.value.vec) { + if (CHECKTYPE(v, tSTR)) { + if (*v.s != 'A' && *v.s != 'O' && *v.s != 'S') + error(v.lineno, "Invalid logical bus %s", v.s); + } + } + } + } + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_srams(); + else + alloc_rams(true, stage->sram_use, 0); + if (!action_bus) action_bus = ActionBus::create(); +} + +void ActionTable::pass1() { + LOG1("### Action table " << name() << " pass1 " << loc()); + if (default_action.empty()) default_action = get_default_action(); + if (!p4_table) + p4_table = 
P4Table::alloc(P4Table::ActionData, this); + else + p4_table->check(this); + alloc_vpns(); + std::sort(layout.begin(), layout.end(), [](const Layout &a, const Layout &b) -> bool { + if (a.word != b.word) return a.word < b.word; + return a.row > b.row; + }); + int width = format ? (format->size - 1) / 128 + 1 : 1; + for (auto &fmt : action_formats) { +#if 0 + for (auto &fld : *fmt.second) { + if (auto *f = format ? format->field(fld.first) : 0) { + if (fld.second.bits != f->bits || fld.second.size != f->size) { + error(fmt.second->lineno, "Action %s format for field %s incompatible " + "with default format", fmt.first.c_str(), fld.first.c_str()); + continue; } } + for (auto &fmt2 : action_formats) { + if (fmt.second == fmt2.second) break; + if (auto *f = fmt2.second->field(fld.first)) { + if (fld.second.bits != f->bits || fld.second.size != f->size) { + error(fmt.second->lineno, "Action %s format for field %s incompatible " + "with action %s format", fmt.first.c_str(), fld.first.c_str(), + fmt2.first.c_str()); + break; } } } } +#endif + width = std::max(width, int((fmt.second->size - 1) / 128U + 1)); + } + unsigned depth = layout_size() / width; + std::vector slice_size(width, 0); + unsigned idx = 0; // ram index within depth + int word = 0; // word within wide table; + int home_row = -1; + std::map final_home_rows; + Layout *prev = nullptr; + for (auto row = layout.begin(); row != layout.end(); ++row) { + if (row->word > 0) word = row->word; + if (!prev || prev->word != word || home_rows_per_word[word].getbit(row->row) || + home_row / 2 - row->row / 2 > 5 /* can't go over 5 physical rows for timing */ + || (!Target::SUPPORT_OVERFLOW_BUS() && home_row >= 8 && row->row < 8) + /* can't flow between logical row 7 and 8 in JBay*/ + ) { + if (prev && prev->row == row->row) prev->home_row = false; + home_row = row->row; + row->home_row = true; + final_home_rows[word].setbit(row->row); + need_bus(row->lineno, stage->action_data_use, row->row, "action data"); + } + if 
(row->word >= 0) { + if (row->word > width) { + error(row->lineno, "Invalid word %u for row %d", row->word, row->row); + continue; + } + slice_size[row->word] += row->memunits.size(); + } else { + if (slice_size[word] + row->memunits.size() > depth) { + int split = depth - slice_size[word]; + row = layout.insert(row, Layout(*row)); + row->memunits.erase(row->memunits.begin() + split, row->memunits.end()); + row->vpns.erase(row->vpns.begin() + split, row->vpns.end()); + auto next = row + 1; + next->memunits.erase(next->memunits.begin(), next->memunits.begin() + split); + next->vpns.erase(next->vpns.begin(), next->vpns.begin() + split); + } + row->word = word; + if ((slice_size[word] += row->memunits.size()) == int(depth)) ++word; + } + prev = &*row; + } + if (!home_rows_per_word.empty()) { + for (word = 0; word < width; ++word) { + for (unsigned row : home_rows_per_word[word] - final_home_rows[word]) { + error(home_lineno, "home row %u not present in table %s", row, name()); + break; + } + } + } + home_rows_per_word = final_home_rows; + for (word = 0; word < width; ++word) + if (slice_size[word] != int(depth)) { + error(layout.front().lineno, "Incorrect size for word %u in layout of table %s", word, + name()); + break; + } + for (auto &r : layout) LOG4(" " << r); + action_bus->pass1(this); + if (actions) actions->pass1(this); + AttachedTable::pass1(); + SelectionTable *selector = nullptr; + for (auto mtab : match_tables) { + auto *s = mtab->get_selector(); + if (s && selector && s != selector) + error(lineno, "Inconsistent selectors %s and %s for table %s", s->name(), + selector->name(), name()); + if (s) selector = s; + } +} + +void ActionTable::pass2() { + LOG1("### Action table " << name() << " pass2 " << loc()); + if (match_tables.empty()) error(lineno, "No match table for action table %s", name()); + if (!format) format.reset(new Format(this)); + /* Driver does not support formats with different widths. 
Need all formats + * to be the same size, so pad them out */ + pad_format_fields(); + if (actions) actions->pass2(this); + if (action_bus) action_bus->pass2(this); +} + +/** + * FIXME: Due to get_match_tables function not being a const function (which itself should be + * a separate PR), in order to get all potentialy pack formats from all of the actions in all + * associated match tables, an initial pass is required to perform this lookup. + * + * Thus a map is saved in this pass containing a copy of an action, with a listing of all of + * the possible aliases. This will only currently work if the aliases are identical across + * actions, which at the moment, they are. We will need to change this functionality when + * actions could potentially be different across action profiles, either by gathering a union + * of the aliases across actions with the same action handle, or perhaps de-alias the pack + * formats before context JSON generation + */ +void ActionTable::pass3() { + LOG1("### Action table " << name() << " pass3 " << loc()); + action_bus->pass3(this); + + if (!actions) { + Actions *tbl_actions = nullptr; + for (auto mt : get_match_tables()) { + if (mt->actions) { + tbl_actions = mt->actions.get(); + } else if (auto tern = mt->to()) { + if (tern->indirect && tern->indirect->actions) { + tbl_actions = tern->indirect->actions.get(); + } + } + BUG_CHECK(tbl_actions); + for (auto &act : *tbl_actions) { + if (pack_actions.count(act.name) == 0) pack_actions[act.name] = &act; + } + } + } else { + for (auto &act : *actions) { + if (pack_actions.count(act.name) == 0) pack_actions[act.name] = &act; + } + } + + for (auto &fmt : action_formats) { + if (pack_actions.count(fmt.first) == 0) { + error(fmt.second->lineno, "Format for non-existant action %s", fmt.first.c_str()); + continue; + } + } +} + +template +static void flow_selector_addr(REGS ®s, int from, int to) { + BUG_CHECK(from > to); + BUG_CHECK((from & 3) == 3); + if (from / 2 == to / 2) { + /* R to L */ + 
regs.rams.map_alu.selector_adr_switchbox.row[from / 4] + .ctl.l_oflo_adr_o_mux_select.l_oflo_adr_o_sel_selector_adr_r_i = 1; + return; + } + if (from & 1) /* R down */ + regs.rams.map_alu.selector_adr_switchbox.row[from / 4] + .ctl.b_oflo_adr_o_mux_select.b_oflo_adr_o_sel_selector_adr_r_i = 1; + // else + // /* L down */ + // regs.rams.map_alu.selector_adr_switchbox.row[from/4].ctl + // .b_oflo_adr_o_mux_select.b_oflo_adr_o_sel_selector_adr_l_i = 1; + + /* Include all selection address switchboxes needed when the action RAMs + * reside on overflow rows */ + for (int row = from / 4 - 1; row >= to / 4; row--) + if (row != to / 4 || (to % 4) < 2) /* top to bottom */ + regs.rams.map_alu.selector_adr_switchbox.row[row] + .ctl.b_oflo_adr_o_mux_select.b_oflo_adr_o_sel_oflo_adr_t_i = 1; + + switch (to & 3) { + case 3: + /* flow down to R */ + regs.rams.map_alu.selector_adr_switchbox.row[to / 4].ctl.r_oflo_adr_o_mux_select = 1; + break; + case 2: + /* flow down to L */ + regs.rams.map_alu.selector_adr_switchbox.row[to / 4] + .ctl.l_oflo_adr_o_mux_select.l_oflo_adr_o_sel_oflo_adr_t_i = 1; + break; + default: + /* even physical rows are hardwired to flow down to both L and R */ + break; + } +} + +template +void ActionTable::write_regs_vt(REGS ®s) { + LOG1("### Action table " << name() << " write_regs " << loc()); + unsigned fmt_log2size = format ? format->log2size : 0; + unsigned width = format ? 
(format->size - 1) / 128 + 1 : 1; + for (auto &fmt : Values(action_formats)) { + fmt_log2size = std::max(fmt_log2size, fmt->log2size); + width = std::max(width, (fmt->size - 1) / 128U + 1); + } + unsigned depth = layout_size() / width; + bool push_on_overflow = false; // true if we overflow from bottom to top + unsigned idx = 0; + int word = 0; + Layout *home = nullptr; + int prev_logical_row = -1; + decltype(regs.rams.array.switchbox.row[0].ctl) *home_switch_ctl = 0, *prev_switch_ctl = 0; + auto &adrdist = regs.rams.match.adrdist; + auto &icxbar = adrdist.adr_dist_action_data_adr_icxbar_ctl; + for (Layout &logical_row : layout) { + unsigned row = logical_row.row / 2; + unsigned side = logical_row.row & 1; /* 0 == left 1 == right */ + unsigned top = logical_row.row >= 8; /* 0 == bottom 1 == top */ + auto vpn = logical_row.vpns.begin(); + auto &switch_ctl = regs.rams.array.switchbox.row[row].ctl; + auto &map_alu_row = regs.rams.map_alu.row[row]; + if (logical_row.home_row) { + home = &logical_row; + home_switch_ctl = &switch_ctl; + action_bus->write_action_regs(regs, this, logical_row.row, word); + if (side) + switch_ctl.r_action_o_mux_select.r_action_o_sel_action_rd_r_i = 1; + else + switch_ctl.r_l_action_o_mux_select.r_l_action_o_sel_action_rd_l_i = 1; + for (auto mtab : match_tables) + icxbar[mtab->logical_id].address_distr_to_logical_rows |= 1U << logical_row.row; + } else { + BUG_CHECK(home); + // FIXME use DataSwitchboxSetup for this somehow? 
+ if (&switch_ctl == home_switch_ctl) { + /* overflow from L to R action */ + switch_ctl.r_action_o_mux_select.r_action_o_sel_oflo_rd_l_i = 1; + } else { + if (side) { + /* overflow R up */ + switch_ctl.t_oflo_rd_o_mux_select.t_oflo_rd_o_sel_oflo_rd_r_i = 1; + } else { + /* overflow L up */ + switch_ctl.t_oflo_rd_o_mux_select.t_oflo_rd_o_sel_oflo_rd_l_i = 1; + } + if (prev_switch_ctl != &switch_ctl) { + if (prev_switch_ctl != home_switch_ctl) + prev_switch_ctl->t_oflo_rd_o_mux_select.t_oflo_rd_o_sel_oflo_rd_b_i = 1; + else if (home->row & 1) + home_switch_ctl->r_action_o_mux_select.r_action_o_sel_oflo_rd_b_i = 1; + else + home_switch_ctl->r_l_action_o_mux_select.r_l_action_o_sel_oflo_rd_b_i = 1; + } + } + /* if we're skipping over full rows and overflowing over those rows, need to + * propagate overflow from bottom to top. This effectively uses only the + * odd (right side) overflow busses. L ovfl can still go to R action */ + for (int r = prev_logical_row / 2 - 1; r > static_cast(row); r--) { + prev_switch_ctl = ®s.rams.array.switchbox.row[r].ctl; + prev_switch_ctl->t_oflo_rd_o_mux_select.t_oflo_rd_o_sel_oflo_rd_b_i = 1; + } + + auto &oflo_adr_xbar = map_alu_row.vh_xbars.adr_dist_oflo_adr_xbar_ctl[side]; + if ((home->row >= 8) == top) { + oflo_adr_xbar.adr_dist_oflo_adr_xbar_source_index = home->row % 8; + oflo_adr_xbar.adr_dist_oflo_adr_xbar_source_sel = 0; + } else { + BUG_CHECK(home->row >= 8); + BUG_CHECK(options.target == TOFINO); + oflo_adr_xbar.adr_dist_oflo_adr_xbar_source_index = 0; + oflo_adr_xbar.adr_dist_oflo_adr_xbar_source_sel = 3; + push_on_overflow = true; + for (auto mtab : match_tables) + if (!icxbar[mtab->logical_id].address_distr_to_overflow) + icxbar[mtab->logical_id].address_distr_to_overflow = 1; + } + oflo_adr_xbar.adr_dist_oflo_adr_xbar_enable = 1; + } + SelectionTable *selector = get_selector(); + if (selector) { + if (logical_row.row != selector->home_row()) { + if (logical_row.row > selector->home_row()) + error(lineno, "Selector data 
from %s on row %d cannot flow up to %s on row %d", + selector->name(), selector->home_row(), name(), logical_row.row); + else + flow_selector_addr(regs, selector->home_row(), logical_row.row); + } + } + for (auto &memunit : logical_row.memunits) { + int logical_col = memunit.col; + unsigned col = logical_col + 6 * side; + auto &ram = regs.rams.array.row[row].ram[col]; + auto &unitram_config = map_alu_row.adrmux.unitram_config[side][logical_col]; + if (logical_row.home_row) unitram_config.unitram_action_subword_out_en = 1; + ram.unit_ram_ctl.match_ram_write_data_mux_select = UnitRam::DataMux::NONE; + ram.unit_ram_ctl.match_ram_read_data_mux_select = + home == &logical_row ? UnitRam::DataMux::ACTION : UnitRam::DataMux::OVERFLOW; + unitram_config.unitram_type = UnitRam::ACTION; + if (!no_vpns) unitram_config.unitram_vpn = *vpn++; + unitram_config.unitram_logical_table = action_id >= 0 ? action_id : logical_id; + if (gress == INGRESS || gress == GHOST) + unitram_config.unitram_ingress = 1; + else + unitram_config.unitram_egress = 1; + unitram_config.unitram_enable = 1; + auto &ram_mux = map_alu_row.adrmux.ram_address_mux_ctl[side][logical_col]; + auto &adr_mux_sel = ram_mux.ram_unitram_adr_mux_select; + if (selector) { + int shift = std::min(fmt_log2size - 2, MAX_AD_SHIFT); + auto &shift_ctl = regs.rams.map_alu.mau_selector_action_adr_shift[row]; + if (logical_row.row == selector->layout[0].row) { + /* we're on the home row of the selector, so use it directly */ + if (home == &logical_row) + adr_mux_sel = UnitRam::AdrMux::SELECTOR_ALU; + else + adr_mux_sel = UnitRam::AdrMux::SELECTOR_ACTION_OVERFLOW; + if (side) + shift_ctl.mau_selector_action_adr_shift_right = shift; + else + shift_ctl.mau_selector_action_adr_shift_left = shift; + } else { + /* not on the home row -- use overflows */ + if (home == &logical_row) + adr_mux_sel = UnitRam::AdrMux::SELECTOR_OVERFLOW; + else + adr_mux_sel = UnitRam::AdrMux::SELECTOR_ACTION_OVERFLOW; + if (side) + 
shift_ctl.mau_selector_action_adr_shift_right_oflo = shift; + else + shift_ctl.mau_selector_action_adr_shift_left_oflo = shift; + } + } else { + if (home == &logical_row) { + adr_mux_sel = UnitRam::AdrMux::ACTION; + } else { + adr_mux_sel = UnitRam::AdrMux::OVERFLOW; + ram_mux.ram_oflo_adr_mux_select_oflo = 1; + } + } + if (gress == EGRESS) + regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row); + regs.rams.array.row[row].actiondata_error_uram_ctl[timing_thread(gress)] |= + 1 << (col - 2); + if (++idx == depth) { + idx = 0; + home = nullptr; + ++word; + } + } + prev_switch_ctl = &switch_ctl; + prev_logical_row = logical_row.row; + } + if (push_on_overflow) adrdist.oflo_adr_user[0] = adrdist.oflo_adr_user[1] = AdrDist::ACTION; + if (actions) actions->write_regs(regs, this); +} + +// Action data address huffman encoding +// { 0, {"xxx", "xxxxx"} }, +// { 8, {"xxx", "xxxx0"} }, +// { 16, {"xxx", "xxx01"} }, +// { 32, {"xxx", "xx011"} }, +// { 64, {"xxx", "x0111"} }, +// { 128, {"xxx", "01111"} }, +// { 256, {"xx0", "11111"} }, +// { 512, {"x01", "11111"} }, +// { 1024, {"011", "11111"} }; + +// Track the actions added to json per action table. gen_tbl_cfg can be called +// multiple times for the same action for each stage table in case of an action +// table split across multiple stages, but must be added to json only once. 
+// Tracks, per P4 table name, which action names have already been emitted to
+// context.json.  gen_tbl_cfg can be called once per stage table when an action
+// table is split across stages, but each action must be added to json only once.
+// NOTE(review): template arguments were lost in extraction (reads "std::map>");
+// presumably std::map<std::string, std::set<std::string>> -- confirm upstream.
+static std::map> actions_in_json;
+// Emit the context.json configuration for this action data table into `out`.
+void ActionTable::gen_tbl_cfg(json::vector &out) const {
+    // FIXME -- this is wrong if actions have different format sizes
+    unsigned number_entries = (layout_size() * 128 * 1024) / (1 << format->log2size);
+    json::map &tbl = *base_tbl_cfg(out, "action_data", number_entries);
+    json::map &stage_tbl = *add_stage_tbl_cfg(tbl, "action_data", number_entries);
+    for (auto &act : pack_actions) {
+        // Use the per-action format when one exists, else the table-wide format.
+        auto *fmt = format.get();
+        if (action_formats.count(act.first)) fmt = action_formats.at(act.first).get();
+        add_pack_format(stage_tbl, fmt, true, true, act.second);
+        auto p4Name = p4_name();
+        if (!p4Name) {
+            error(lineno, "No p4 table name found for table : %s", name());
+            continue;
+        }
+        std::string tbl_name = p4Name;
+        std::string act_name = act.second->name;
+        // Only emit each action once per table (see actions_in_json above).
+        // NOTE(review): `acts_added` below takes a copy of the tracked set;
+        // a reference would avoid the copy -- left as-is.
+        if (actions_in_json.count(tbl_name) == 0) {
+            actions_in_json[tbl_name].insert(act_name);
+            act.second->gen_simple_tbl_cfg(tbl["actions"]);
+        } else {
+            auto acts_added = actions_in_json[tbl_name];
+            if (acts_added.count(act_name) == 0) {
+                actions_in_json[tbl_name].emplace(act_name);
+                act.second->gen_simple_tbl_cfg(tbl["actions"]);
+            }
+        }
+    }
+    stage_tbl["memory_resource_allocation"] =
+        gen_memory_resource_allocation_tbl_cfg("sram", layout);
+    // FIXME: what is the check for static entries?
+    tbl["static_entries"] = json::vector();
+    std::string hr = how_referenced();
+    if (hr.empty()) hr = indirect ? "indirect" : "direct";
+    tbl["how_referenced"] = hr;
+    merge_context_json(tbl, stage_tbl);
+}
+
+DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(ActionTable, TARGET_CLASS)  // NOLINT(readability/fn_size)
diff --git a/backends/tofino/bf-asm/alias_array.h b/backends/tofino/bf-asm/alias_array.h
new file mode 100644
index 00000000000..0c4fb161e8e
--- /dev/null
+++ b/backends/tofino/bf-asm/alias_array.h
@@ -0,0 +1,142 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_ALIAS_ARRAY_H_
+#define BACKENDS_TOFINO_BF_ASM_ALIAS_ARRAY_H_
+
+#include
+
+#include "bfas.h"  // for BUG_CHECK
+
+// NOTE(review): template parameter lists in this header were stripped during
+// extraction (bare "template" below) -- presumably <class T> for
+// alias_array_base and <class T, size_t S> for alias_array; confirm upstream.
+template
+class alias_array;
+
+// Abstract interface over a fixed-size collection of elements that live
+// elsewhere and are referenced by pointer ("aliased"), with bulk
+// modified/disable/enable operations forwarded to each element.
+template
+class alias_array_base {
+ protected:
+    // Iterator over the pointer array; double-dereferences so users see T&.
+    class iterator {
+        T **ptr;
+
+     public:
+        explicit iterator(T **p) : ptr(p) {}
+        iterator &operator++() {
+            ++ptr;
+            return *this;
+        }
+        iterator &operator--() {
+            --ptr;
+            return *this;
+        }
+        // NOTE(review): postfix ++/-- return a reference to the local `copy`,
+        // which dangles after return; they should return by value.  Flagged
+        // only -- not fixed here.
+        iterator &operator++(int) {
+            auto copy = *this;
+            ++ptr;
+            return copy;
+        }
+        iterator &operator--(int) {
+            auto copy = *this;
+            --ptr;
+            return copy;
+        }
+        bool operator==(const iterator &i) const { return ptr == i.ptr; }
+        bool operator!=(const iterator &i) const { return ptr != i.ptr; }
+        T &operator*() const { return **ptr; }
+        T *operator->() const { return *ptr; }
+    };
+
+ public:
+    virtual T &operator[](size_t) = 0;
+    virtual const T &operator[](size_t) const = 0;
+    virtual size_t size() const = 0;
+    virtual iterator begin() = 0;
+    virtual iterator end() = 0;
+    virtual bool modified() const = 0;
+    virtual void set_modified(bool v = true) = 0;
+    virtual bool disabled() const = 0;
+    virtual bool disable() = 0;
+    virtual bool disable_if_zero() = 0;
+    virtual void enable() = 0;
+};
+
+// Concrete fixed-size (S) alias array; the constructor takes exactly S
+// element pointers and BUG_CHECKs the initializer count.
+template
+class alias_array : public alias_array_base {
+    T *data[S];
+    using typename alias_array_base::iterator;
+
+ public:
+    alias_array(const std::initializer_list &v) {
+        auto it = v.begin();
+        for (auto &e : data) {
+            BUG_CHECK(it != v.end(), "Not enough initializers for alias array");
+            e = *it++;
+        }
+        BUG_CHECK(it == v.end(), "Too many initializers for alias array");
+    }
+    T &operator[](size_t idx) {
+        BUG_CHECK(idx < S, "alias array index %zd out of bounds %zd", idx, S);
+        return *data[idx];
+    }
+    const T &operator[](size_t idx) const {
+        BUG_CHECK(idx < S, "alias array index %zd out of bounds %zd", idx, S);
+        return *data[idx];
+    }
+    size_t size() const { return S; }
+    iterator begin() { return iterator(data); }
+    iterator end() { return iterator(data + S); }
+    // True if any aliased element has been modified.
+    bool modified() const {
+        for (size_t i = 0; i < S; i++)
+            if (data[i]->modified()) return true;
+        return false;
+    }
+    void set_modified(bool v = true) {
+        for (size_t i = 0; i < S; i++) data[i]->set_modified(v);
+    }
+    // The disable*/disabled queries return true only if the operation/state
+    // holds for ALL elements (they still visit every element regardless).
+    bool disabled() const {
+        bool rv = true;
+        for (size_t i = 0; i < S; i++)
+            if (!data[i]->disabled()) rv = false;
+        return rv;
+    }
+    bool disable() {
+        bool rv = true;
+        for (size_t i = 0; i < S; i++)
+            if (!data[i]->disable()) rv = false;
+        return rv;
+    }
+    void enable() {
+        for (size_t i = 0; i < S; i++) data[i]->enable();
+    }
+    bool disable_if_unmodified() {
+        bool rv = true;
+        for (size_t i = 0; i < S; i++)
+            if (!data[i]->disable_if_unmodified()) rv = false;
+        return rv;
+    }
+    bool disable_if_zero() {
+        bool rv = true;
+        for (size_t i = 0; i < S; i++)
+            if (!data[i]->disable_if_zero()) rv = false;
+        return rv;
+    }
+    bool disable_if_reset_value() {
+        bool rv = true;
+        for (size_t i = 0; i < S; i++)
+            if (!data[i]->disable_if_reset_value()) rv = false;
+        return rv;
+    }
+};
+
+#endif /* BACKENDS_TOFINO_BF_ASM_ALIAS_ARRAY_H_ */
diff --git a/backends/tofino/bf-asm/alloc.h b/backends/tofino/bf-asm/alloc.h
new file mode 100644
index 00000000000..e3aac68e5e1
--- /dev/null
+++ b/backends/tofino/bf-asm/alloc.h
@@ -0,0 +1,230 @@
+#ifndef BACKENDS_TOFINO_BF_ASM_ALLOC_H_
+#define BACKENDS_TOFINO_BF_ASM_ALLOC_H_
+
+#include
+
+#include
+#include
+#include
+
+namespace BFN {
+
+// Heap-allocated, bounds-checked 1-D array of T, value-initialized; movable
+// but not copyable.  Throws std::out_of_range on bad indices.
+// NOTE(review): "template" lines below lost their parameter lists in
+// extraction (presumably <class T> / <class T, int S> etc.); confirm upstream.
+template
+class Alloc1Dbase {
+    int size_;
+    T *data;
+    Alloc1Dbase() = delete;
+    Alloc1Dbase(const Alloc1Dbase &) = delete;
+    Alloc1Dbase &operator=(const Alloc1Dbase &) = delete;
+    Alloc1Dbase &operator=(Alloc1Dbase &&) = delete;
+
+ public:
+    explicit Alloc1Dbase(int sz) : size_(sz) { data = sz ?
new T[sz]{} : nullptr; }
+    Alloc1Dbase(Alloc1Dbase &&a) noexcept : size_(a.size_), data(a.data) { a.data = 0; }
+    virtual ~Alloc1Dbase() { delete[] data; }
+
+    typedef T *iterator;
+    typedef T *const_iterator;
+    T &operator[](int i) {
+        if (i < 0 || i >= size_) throw std::out_of_range("Alloc1D");
+        return data[i];
+    }
+    const T &operator[](int i) const {
+        if (i < 0 || i >= size_) throw std::out_of_range("Alloc1D");
+        return data[i];
+    }
+    bool operator==(const Alloc1Dbase &t) const {
+        return std::equal(data, data + size_, t.data, t.data + t.size_);
+    }
+    bool operator!=(const Alloc1Dbase &t) const { return !(*this == t); }
+
+    int size() const { return size_; }
+    // Reset every element to a value-initialized T; capacity unchanged.
+    void clear() { std::fill(data, data + size_, T()); }
+    T *begin() { return data; }
+    T *end() { return data + size_; }
+};
+
+// Fixed-size (S) convenience wrapper over Alloc1Dbase.
+template
+class Alloc1D : public Alloc1Dbase {
+ public:
+    Alloc1D() : Alloc1Dbase(S) {}
+    Alloc1Dbase &base() { return *this; }
+    bool operator!=(const Alloc1D &t) const { return Alloc1Dbase::operator!=(t); }
+};
+
+template
+class Alloc3Dbase;
+
+// Heap-allocated, bounds-checked 2-D array stored row-major in one
+// contiguous block; rows are exposed through the lightweight rowref proxy.
+template
+class Alloc2Dbase {
+    int nrows, ncols;
+    T *data;
+    template
+    class rowref {
+        U *row;
+        int ncols;
+        friend class Alloc2Dbase;
+        friend class Alloc3Dbase;
+        rowref(U *r, int c) : row(r), ncols(c) {}
+
+     public:
+        typedef U *iterator;
+        typedef const U *const_iterator;
+        U &operator[](int i) const {
+            if (i < 0 || i >= ncols) throw std::out_of_range("Alloc2D");
+            return row[i];
+        }
+        U *begin() const { return row; }
+        U *end() const { return row + ncols; }
+    };
+    Alloc2Dbase() = delete;
+    Alloc2Dbase(const Alloc2Dbase &) = delete;
+    Alloc2Dbase &operator=(const Alloc2Dbase &) = delete;
+    Alloc2Dbase &operator=(Alloc2Dbase &&) = delete;
+    friend class Alloc3Dbase;
+
+ public:
+    Alloc2Dbase(int r, int c) : nrows(r), ncols(c) {
+        size_t sz = r * c;
+        data = sz ? new T[sz]{} : nullptr;
+    }
+    Alloc2Dbase(Alloc2Dbase &&a) noexcept : nrows(a.nrows), ncols(a.ncols), data(a.data) {
+        a.data = 0;
+    }
+    virtual ~Alloc2Dbase() { delete[] data; }
+
+    rowref operator[](int i) {
+        if (i < 0 || i >= nrows) throw std::out_of_range("Alloc2D");
+        return {data + i * ncols, ncols};
+    }
+    rowref operator[](int i) const {
+        if (i < 0 || i >= nrows) throw std::out_of_range("Alloc2D");
+        return {data + i * ncols, ncols};
+    }
+    T &at(int i, int j) {
+        if (i < 0 || i >= nrows || j < 0 || j >= ncols) throw std::out_of_range("Alloc2D");
+        return data[i * ncols + j];
+    }
+    const T &at(int i, int j) const {
+        if (i < 0 || i >= nrows || j < 0 || j >= ncols) throw std::out_of_range("Alloc2D");
+        return data[i * ncols + j];
+    }
+    T &operator[](std::pair i) {
+        if (i.first < 0 || i.first >= nrows || i.second < 0 || i.second >= ncols)
+            throw std::out_of_range("Alloc2D");
+        return data[i.first * ncols + i.second];
+    }
+    const T &operator[](std::pair i) const {
+        if (i.first < 0 || i.first >= nrows || i.second < 0 || i.second >= ncols)
+            throw std::out_of_range("Alloc2D");
+        return data[i.first * ncols + i.second];
+    }
+    bool operator==(const Alloc2Dbase &t) const {
+        int sz = nrows * ncols;
+        if (nrows != t.nrows || ncols != t.ncols) return false;
+        return std::equal(data, data + sz, t.data);
+    }
+    bool operator!=(const Alloc2Dbase &t) const { return !(*this == t); }
+
+    int rows() const { return nrows; }
+    int cols() const { return ncols; }
+    void clear() { std::fill(data, data + nrows * ncols, T()); }
+};
+
+// Fixed-size (R x C) convenience wrapper over Alloc2Dbase.
+template
+class Alloc2D : public Alloc2Dbase {
+ public:
+    Alloc2D() : Alloc2Dbase(R, C) {}
+    Alloc2Dbase &base() { return *this; }
+};
+
+// Heap-allocated, bounds-checked 3-D array stored contiguously; matrices are
+// exposed through the matref proxy, which reuses Alloc2Dbase's rowref.
+template
+class Alloc3Dbase {
+    int nmats, nrows, ncols;
+    T *data;
+    template
+    class matref {
+        U *matrix;
+        int nrows, ncols;
+        friend class Alloc3Dbase;
+
+     public:
+        typename Alloc2Dbase::template rowref operator[](int i) const {
+            if (i < 0 || i >= nrows) throw std::out_of_range("Alloc3D");
+            return {matrix + i * ncols, ncols};
+        }
+        U &operator[](std::pair i) const {
+            if (i.first < 0 || i.first >= nrows || i.second < 0 || i.second >= ncols)
+                throw std::out_of_range("Alloc3D");
+            return matrix[i.first * ncols + i.second];
+        }
+    };
+    Alloc3Dbase() = delete;
+    Alloc3Dbase(const Alloc3Dbase &) = delete;
+    Alloc3Dbase &operator=(const Alloc3Dbase &) = delete;
+    Alloc3Dbase &operator=(Alloc3Dbase &&) = delete;
+
+ public:
+    Alloc3Dbase(int m, int r, int c) : nmats(m), nrows(r), ncols(c) {
+        size_t sz = m * r * c;
+        data = sz ? new T[sz]{} : nullptr;
+    }
+    Alloc3Dbase(Alloc3Dbase &&a) noexcept
+        : nmats(a.nmats), nrows(a.nrows), ncols(a.ncols), data(a.data) {
+        a.data = 0;
+    }
+    virtual ~Alloc3Dbase() { delete[] data; }
+
+    matref operator[](int i) {
+        if (i < 0 || i >= nmats) throw std::out_of_range("Alloc3D");
+        return {data + i * nrows * ncols, nrows, ncols};
+    }
+    matref operator[](int i) const {
+        if (i < 0 || i >= nmats) throw std::out_of_range("Alloc3D");
+        return {data + i * nrows * ncols, nrows, ncols};
+    }
+    T &at(int i, int j, int k) {
+        if (i < 0 || i >= nmats || j < 0 || j >= nrows || k < 0 || k >= ncols)
+            throw std::out_of_range("Alloc3D");
+        return data[i * nrows * ncols + j * ncols + k];
+    }
+    const T &at(int i, int j, int k) const {
+        if (i < 0 || i >= nmats || j < 0 || j >= nrows || k < 0 || k >= ncols)
+            throw std::out_of_range("Alloc3D");
+        return data[i * nrows * ncols + j * ncols + k];
+    }
+    T &operator[](std::tuple i) {
+        if (std::get<0>(i) < 0 || std::get<0>(i) >= nmats || std::get<1>(i) < 0 ||
+            std::get<1>(i) >= nrows || std::get<2>(i) < 0 || std::get<2>(i) >= ncols)
+            throw std::out_of_range("Alloc3D");
+        return data[std::get<0>(i) * nrows * ncols + std::get<1>(i) * ncols + std::get<2>(i)];
+    }
+    const T &operator[](std::tuple i) const {
+        if (std::get<0>(i) < 0 || std::get<0>(i) >= nmats || std::get<1>(i) < 0 ||
+            std::get<1>(i) >= nrows || std::get<2>(i) < 0 || std::get<2>(i) >= ncols)
+            throw std::out_of_range("Alloc3D");
+        return data[std::get<0>(i) * nrows * ncols + std::get<1>(i) * ncols + std::get<2>(i)];
+    }
+    bool operator==(const Alloc3Dbase &t) const {
+        int sz = nmats * nrows * ncols;
+        if (nmats != t.nmats || nrows != t.nrows || ncols != t.ncols) return false;
+        return std::equal(data, data + sz, t.data);
+    }
+    bool operator!=(const Alloc3Dbase &t) const { return !(*this == t); }
+
+    int matrixes() const { return nmats; }
+    int rows() const { return nrows; }
+    int cols() const { return ncols; }
+    void clear() { std::fill(data, data + nmats * nrows * ncols, T()); }
+};
+
+// Fixed-size (B x R x C) convenience wrapper over Alloc3Dbase.
+template
+class Alloc3D : public Alloc3Dbase {
+ public:
+    Alloc3D() : Alloc3Dbase(B, R, C) {}
+    Alloc3Dbase &base() { return *this; }
+};
+
+} // namespace BFN
+
+#endif /* BACKENDS_TOFINO_BF_ASM_ALLOC_H_ */
diff --git a/backends/tofino/bf-asm/asm-parse.ypp b/backends/tofino/bf-asm/asm-parse.ypp
new file mode 100644
index 00000000000..8c9c5e1a463
--- /dev/null
+++ b/backends/tofino/bf-asm/asm-parse.ypp
@@ -0,0 +1,446 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+%{
+#define YYDEBUG 1
+#include "backends/tofino/bf-asm/asm-types.h"
+#include
+#include
+#include "backends/tofino/bf-asm/sections.h"
+#include
+#include
+static int yylex();
+static void yyerror(const char *, ...);
+static int lineno;
+/* Maps a global lineno to (filename, local line offset) for diagnostics.
+ * NOTE(review): template args stripped in extraction -- presumably
+ * std::map<int, std::pair<std::string, int>>; confirm upstream. */
+static std::map> line_file_map;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+/* DANGER -- The value/command functions take non-const references to
+ * value_t and MOVE them, so the source should not be used or cleaned
+ * up afterwards. This matches up with how bison actions work -- in
+ * the normal case it does NOT try to destroy stuff on the value stack,
+ * but rather just pops it and lets it go. Do not try to use them
+ * outside of bison action code */
+/* Constructors for each value_t flavor; lineno_adj compensates for the
+ * lookahead token already having consumed a newline. */
+static value_t value(int64_t v, int lineno_adj) {
+    value_t rv = {tINT, lineno - lineno_adj};
+    rv.i = v;
+    return rv; }
+static value_t value(VECTOR(uintptr_t) &v, int lineno_adj) {
+    value_t rv{tBIGINT, lineno - lineno_adj};
+    rv.bigi = v;
+    return rv; }
+static value_t value(int lo, int hi, int lineno_adj) {
+    value_t rv{tRANGE, lineno - lineno_adj};
+    rv.range.lo = lo;
+    rv.range.hi = hi;
+    return rv; }
+static value_t value(char *v, int lineno_adj) {
+    value_t rv{tSTR, lineno - lineno_adj};
+    rv.s = v;
+    return rv; }
+static value_t value(match_t v, int lineno_adj) {
+    value_t rv{tMATCH, lineno - lineno_adj};
+    rv.m = v;
+    return rv; }
+static value_t value(VECTOR(match_t) v, int lineno_adj) {
+    value_t rv{tBIGMATCH, lineno - lineno_adj};
+    rv.bigm = v;
+    return rv; }
+static value_t value(VECTOR(value_t) &v, int lineno_adj) {
+    value_t rv{tVEC, lineno - lineno_adj};
+    if (v.size > 0) rv.lineno = v.data[0].lineno;
+    rv.vec = v;
+    return rv; }
+static value_t value(VECTOR(pair_t) &v, int lineno_adj) {
+    value_t rv{tMAP, lineno - lineno_adj};
+    if (v.size > 0) rv.lineno = v.data[0].key.lineno;
+    rv.map = v;
+    return rv; }
+static value_t empty_vector(int lineno_adj) {
+    value_t rv{tVEC, lineno - lineno_adj};
+    memset(&rv.vec, 0, sizeof(rv.vec));
+    return rv; }
+static value_t empty_map(int lineno_adj) {
+    value_t rv{tMAP, lineno - lineno_adj};
+    memset(&rv.vec, 0, sizeof(rv.vec));
+    return rv; }
+static value_t singleton_map(const value_t &k, const value_t &v) {
+    value_t rv{tMAP, k.lineno};
+    VECTOR_init1(rv.map, pair_t(k, v));
+    return rv; }
+/* Build a tCMD value: element 0 is the command name, the rest are args;
+ * lineno is pulled back to the earliest argument's line. */
+static value_t command(char *cmd, const VECTOR(value_t) &args, int lineno_adj) {
+    value_t rv{tCMD, lineno - lineno_adj};
+    if (args.size && args.data[0].lineno < rv.lineno)
+        rv.lineno = args.data[0].lineno;
+    rv.vec = args;
+    VECTOR_insert(rv.vec, 0, 1);
+    rv[0] = value(cmd, 0);
+    rv[0].lineno = rv.lineno;
+    return rv; }
+static value_t command(char *cmd, value_t &arg, int lineno_adj) {
+    value_t rv{tCMD, lineno - lineno_adj};
+    if (arg.lineno < rv.lineno)
+        rv.lineno = arg.lineno;
+    VECTOR_init2(rv.vec, value(cmd, 0), arg);
+    rv[0].lineno = rv.lineno;
+    return rv; }
+static value_t command(char *cmd, value_t &&arg, int lineno_adj) {
+    return command(cmd, arg, lineno_adj); }
+/* Binary-operator form; when `merge` is set and one operand is already the
+ * same command, the new operand is folded into it (flattens assoc. chains). */
+static value_t command(char *cmd, bool merge, value_t &a1, value_t &a2, int lineno_adj) {
+    if (merge && a1.type == tCMD && a1 == cmd && a1.vec.size > 2) {
+        free(cmd);
+        VECTOR_add(a1.vec, a2);
+        return a1; }
+    if (merge && a2.type == tCMD && a2 == cmd && a2.vec.size > 2) {
+        free(cmd);
+        VECTOR_insert(a2.vec, 1);
+        a2.vec[1] = a1;
+        return a2; }
+    value_t rv{tCMD, lineno - lineno_adj};
+    if (a1.lineno < rv.lineno)
+        rv.lineno = a1.lineno;
+    VECTOR_init3(rv.vec, value(cmd, 0), a1, a2);
+    rv[0].lineno = rv.lineno;
+    return rv; }
+
+#define VAL(...) value(__VA_ARGS__, yychar == '\n' ? 1 : 0)
+#define CMD(...) command(__VA_ARGS__, yychar == '\n' ? 1 : 0)
+
+#pragma GCC diagnostic pop
+%}
+
+%define parse.error verbose
+%define lr.default-reduction accepting
+
+%nonassoc LOW_PREC
+%left '|' '^'
+%left '&'
+%left '<' '>'
+%nonassoc UNARY
+
+%union {
+    int64_t i;
+    VECTOR(uintptr_t) bigi;
+    char *str;
+    match_t match;
+    VECTOR(match_t) bigm;
+    value_t value;
+    VECTOR(value_t) vec;
+    pair_t pair;
+    VECTOR(pair_t) map;
+}
+
+/* NOTE(review): the <type> tags of %token/%type/%destructor/%printer below
+ * were stripped in extraction; confirm against the upstream grammar. */
+%token INDENT UNINDENT DOTDOT
+%token INT
+%token BIGINT
+%token ID
+%token STR
+%token MATCH
+%token BIGMATCH
+
+%type param param_expr list_element key value elements opt_indent_elements
+    indent_elements flow_value
+%type opt_params params comma_params linewrapped_value_list list_elements value_list dotvals
+%type map_element pair
+%type map_elements pair_list
+
+%destructor { free($$); }
+%destructor { VECTOR_fini($$); }
+%destructor { free_value(&$$); }
+%destructor { VECTOR_foreach($$, free_value); VECTOR_fini($$); }
+%destructor { free_pair(&$$); }
+%destructor { VECTOR_foreach($$, free_pair); VECTOR_fini($$); }
+
+%printer { fprintf(yyoutput, "%" PRId64, $$); }
+%printer { fprintf(yyoutput, "0x%" PRIuPTR, $$.data[$$.size-1]);
+    for (int i = $$.size-2; i >= 0; i--)
+        fprintf(yyoutput, "%016" PRIuPTR, $$.data[i]); }
+%printer { if ($$) fprintf(yyoutput, "'%s'", $$); else fprintf(yyoutput, "null"); }
+%printer { print_match(yyoutput, $$); }
+%printer { fprintf(yyoutput, "%s", value_desc(&$$)); }
+%printer { fprintf(yyoutput, "vec of size %d", $$.size); }
+%printer { fprintf(yyoutput, "map of size %d", $$.size); }
+
+%%
+
+start: INDENT sections UNINDENT | sections | /* epsilon */;
+
+sections: sections section | section ;
+
+/* A top-level section: "name [params]:" followed by an indented body or an
+ * inline value; start_section's result gates whether the body is processed. */
+section : ID opt_params ':'
+        { $$ = Section::start_section(lineno, $1, $2); }
+    '\n' opt_indent_elements
+        { if (!$4) Section::asm_section($1, $2, $6);
+          VECTOR_foreach($2, free_value);
+          VECTOR_fini($2);
+          free_value(&$6);
+          free($1); }
+    | ID opt_params ':'
+        { $$ = Section::start_section(lineno, $1, $2); }
+    value '\n'
+        { if (!$4) Section::asm_section($1, $2,
$5);
+          VECTOR_foreach($2, free_value);
+          VECTOR_fini($2);
+          free_value(&$5);
+          free($1); }
+;
+
+opt_params: /* empty */ { memset(&$$, 0, sizeof($$)); }
+    | params
+    ;
+params : param %prec LOW_PREC { VECTOR_init1($$, $1); }
+    | params param { $$ = $1; VECTOR_add($$, $2); }
+    ;
+comma_params
+    : param ',' value { VECTOR_init2($$, $1, $3); }
+    | comma_params ',' value { $$ = $1; VECTOR_add($$, $3); }
+    | param_expr ',' value { VECTOR_init2($$, $1, $3); }
+    | '(' value ')' ',' value { VECTOR_init2($$, $2, $5); }
+    ;
+param : INT { $$ = VAL($1); }
+    | ID { $$ = VAL($1); }
+    | '-' INT { $$ = VAL(-$2); }
+    | '!' ID { $$ = CMD(strdup("!"), VAL($2)); }
+    | INT DOTDOT INT { $$ = VAL($1, $3); }
+    | ID '(' value ')' { $$ = CMD($1, $3); }
+    | ID '(' value_list ')' { $$ = CMD($1, $3); }
+    | flow_value { $$ = $1; }
+    ;
+param_expr
+    : param '^' value { $$ = CMD(strdup("^"), true, $1, $3); }
+    | param '|' value { $$ = CMD(strdup("|"), true, $1, $3); }
+    | param '&' value { $$ = CMD(strdup("&"), true, $1, $3); }
+    /* rule duplication to get precedence correct */
+    | param_expr '^' value { $$ = CMD(strdup("^"), true, $1, $3); }
+    | param_expr '|' value { $$ = CMD(strdup("|"), true, $1, $3); }
+    | param_expr '&' value { $$ = CMD(strdup("&"), true, $1, $3); }
+    ;
+
+opt_indent_elements: { $$ = empty_map(1); }
+    | indent_elements
+    ;
+
+/* An INDENT...UNINDENT body; on a parse error inside, the error rule records
+ * the line and yields an empty map so the parse can continue. */
+indent_elements
+    : INDENT elements UNINDENT { $$ = $2; }
+    | INDENT error { $$ = lineno; } error_resync UNINDENT { $$ = empty_map(lineno-$3); }
+    ;
+elements: list_elements { $$ = VAL($1); }
+    | list_elements error error_resync { $$ = VAL($1); }
+    | map_elements { $$ = VAL($1); }
+    | map_elements error error_resync { $$ = VAL($1); }
+    ;
+map_elements: map_elements map_element { $$ = $1; VECTOR_add($$, $2); }
+    | map_element { VECTOR_init1($$, $1); }
+    ;
+list_elements: list_elements list_element { $$ = $1; VECTOR_add($$, $2); }
+    | list_element { VECTOR_init1($$, $1); }
+    ;
+
+/* YAML-like "key: value" entries; the '?' forms allow an arbitrary value as
+ * the key (explicit-key syntax). */
+map_element
+    : key ':' value '\n' { $$ = pair_t($1, $3); }
+    | key ':' '\n' indent_elements { $$ = pair_t($1, $4); }
+    | key ':' '\n' list_elements { $$ = pair_t($1, VAL($4)); }
+    | key ':' '\n' { $$ = pair_t($1, empty_map(1)); }
+    | '?' value ':' value '\n' { $$ = pair_t($2, $4); }
+    | '?' value ':' '\n' indent_elements { $$ = pair_t($2, $5); }
+    | '?' value ':' '\n' list_elements { $$ = pair_t($2, VAL($5)); }
+    | '?' value '\n' ':' value '\n' { $$ = pair_t($2, $5); }
+    ;
+
+/* "- ..." sequence entries; a leading "key: value" line may be followed by
+ * further map entries, which fold into one map per list element. */
+list_element
+    : '-' key ':' value '\n' { $$ = singleton_map($2, $4); }
+    | '-' key ':' value '\n' INDENT map_elements UNINDENT {
+        VECTOR_insert($7, 0);
+        $7.data[0] = pair_t($2, $4);
+        $$ = VAL($7); }
+    | '-' '?' value ':' value '\n' { $$ = singleton_map($3, $5); }
+    | '-' '?' value ':' value '\n' INDENT map_elements UNINDENT {
+        VECTOR_insert($8, 0);
+        $8.data[0] = pair_t($3, $5);
+        $$ = VAL($8); }
+    | '-' value '\n' { $$ = $2; }
+    | '-' ID comma_params '\n' { $$ = command($2, $3, yychar == '\n' ? 2 : 1); }
+    | '-' ID comma_params ',' '\n' linewrapped_value_list
+        { VECTOR_addcopy($3, $6.data, $6.size);
+          $$ = command($2, $3, yychar == '\n' ? 2 : 1);
+          VECTOR_fini($6); }
+    | '-' ID param ',' '\n' linewrapped_value_list
+        { VECTOR_insert($6, 0); $6.data[0] = $3;
+          $$ = command($2, $6, yychar == '\n' ? 2 : 1); }
+    | '-' key ':' '\n' indent_elements { $$ = singleton_map($2, $5); }
+    | '-' '?' value ':' '\n' indent_elements { $$ = singleton_map($3, $6); }
+    | '-' '\n' { $$ = value(strdup(""), yychar == '\n' ? 2 : 1); }
+    ;
+
+key : ID { $$ = VAL($1); }
+    | ID params { $$ = CMD($1, $2); }
+    | INT { $$ = VAL($1); }
+    | BIGINT { $$ = VAL($1); }
+    | MATCH { $$ = VAL($1); }
+    | BIGMATCH { $$ = VAL($1); }
+    | INT DOTDOT INT { $$ = VAL($1, $3); }
+    | ID '(' value_list ')' { $$ = CMD($1, $3); }
+    | ID '(' value ')' { $$ = CMD($1, $3); }
+    | ID '(' ')' { $$ = VAL($1); }
+    | flow_value
+    ;
+
+/* Expressions over keys; unary minus folds into tINT constants directly. */
+value: key
+    | '-' value %prec UNARY { if (($$=$2).type == tINT) $$.i = -$$.i; else $$ = CMD(strdup("-"), $2); }
+    | '!' value %prec UNARY { $$ = CMD(strdup("!"), $2); }
+    | dotvals INT { VECTOR_add($1, VAL($2)); $$ = VAL($1); }
+    | value '^' value { $$ = CMD(strdup("^"), true, $1, $3); }
+    | value '|' value { $$ = CMD(strdup("|"), true, $1, $3); }
+    | value '&' value { $$ = CMD(strdup("&"), true, $1, $3); }
+    | value '<' '<' value { $$ = CMD(strdup("<<"), false, $1, $4); }
+    | value '>' '>' value { $$ = CMD(strdup(">>"), false, $1, $4); }
+    | '(' value ')' { $$ = $2; }
+    | STR { $$ = VAL($1); }
+    ;
+
+/* YAML flow-style collections: [a, b, ...] and {k: v, ...}, with error
+ * recovery that still produces a (possibly partial/empty) collection. */
+flow_value
+    : '[' value_list ']' { $$ = VAL($2); }
+    | '[' value ']' { VECTOR(value_t) tmp; VECTOR_init1(tmp, $2); $$ = VAL(tmp); }
+    | '[' value_list error error_resync ']' { $$ = VAL($2); }
+    | '[' value error error_resync ']' {
+        VECTOR(value_t) tmp; VECTOR_init1(tmp, $2); $$ = VAL(tmp); }
+    | '{' pair_list '}' { $$ = VAL($2); }
+    | '{' pair_list error error_resync '}' { $$ = VAL($2); }
+    | '[' ']' { $$ = empty_vector(yychar == '\n' ? 1 : 0); }
+    | '[' error error_resync ']' { $$ = empty_vector(yychar == '\n' ? 1 : 0); }
+    | '{' '}' { $$ = empty_map(yychar == '\n' ? 1 : 0); }
+    | '{' error error_resync '}' { $$ = empty_map(yychar == '\n' ? 1 : 0); }
+    ;
+
+value_list
+    : value_list ',' value { $$ = $1; VECTOR_add($$, $3); }
+    | value ',' value { VECTOR_init2($$, $1, $3); }
+    ;
+/* A comma-separated list that may continue across newlines/indentation. */
+linewrapped_value_list
+    : value_list '\n' { $$ = $1; }
+    | value '\n' { VECTOR_init1($$, $1); }
+    | value_list ',' '\n' linewrapped_value_list
+        { $$ = $1; VECTOR_addcopy($$, $4.data, $4.size); VECTOR_fini($4); }
+    | value ',' '\n' linewrapped_value_list
+        { VECTOR_init1($$, $1); VECTOR_addcopy($$, $4.data, $4.size); VECTOR_fini($4); }
+    | INDENT value_list '\n' UNINDENT { $$ = $2; }
+    | INDENT value '\n' UNINDENT { VECTOR_init1($$, $2); }
+    | INDENT value_list ',' '\n' linewrapped_value_list UNINDENT
+        { $$ = $2; VECTOR_addcopy($$, $5.data, $5.size); VECTOR_fini($5); }
+    | INDENT value ',' '\n' linewrapped_value_list UNINDENT
+        { VECTOR_init1($$, $2); VECTOR_addcopy($$, $5.data, $5.size); VECTOR_fini($5); }
+    ;
+
+pair_list
+    : pair_list ',' pair { $$ = $1; VECTOR_add($$, $3); }
+    | pair { VECTOR_init1($$, $1); }
+    ;
+pair: value ':' value { $$ = pair_t($1, $3); }
+    ;
+
+/* Dotted numeric values like "1.2.3" collected as a vector of ints. */
+dotvals : dotvals INT '.' { $$ = $1; VECTOR_add($$, VAL($2)); }
+    | INT '.'
{ VECTOR_init1($$, VAL($1)); }
+
+/* Swallow tokens after a syntax error, freeing any owned semantic values. */
+error_resync: /* epsilon */ | error_resync indent_elements { free_value(&$2); }
+    | error_resync INT | error_resync ID { free($2); } | error_resync MATCH
+    | error_resync BIGMATCH { VECTOR_fini($2); }
+    | error_resync BIGINT { VECTOR_fini($2); } | error_resync ':' | error_resync '-'
+    | error_resync ',' | error_resync '(' | error_resync ')' | error_resync DOTDOT
+    | error_resync '\n' | error_resync flow_value { free_value(&$2); }
+    ;
+
+%%
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic ignored "-Wdeprecated-register"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#include "backends/tofino/bf-asm/gen/lex-yaml.c"
+#pragma GCC diagnostic pop
+
+int error_count = 0;
+int warn_count = 0;
+
+// Translate a global line number back to "file:line" using line_file_map.
+std::ostream &operator<<(std::ostream &out, const SrcInfo &s) {
+    auto it = line_file_map.upper_bound(s.lineno);
+    it--;
+    out << it->second.first << ':' << (s.lineno - it->first + it->second.second);
+    return out;
+}
+
+// Print a warning with file:line prefix (when known) and bump warn_count.
+void warning(int lineno, const char *fmt, va_list args) {
+    auto it = line_file_map.upper_bound(lineno);
+    if (it == line_file_map.begin()) {
+        fprintf(stderr, ": warning: ");
+    } else {
+        --it;
+        fprintf(stderr, "%s:%d: warning: ", it->second.first.c_str(),
+                lineno - it->first + it->second.second); }
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    fflush(stderr);
+    warn_count++; }
+
+// Print an error with file:line prefix (when known) and bump error_count.
+void error(int lineno, const char *fmt, va_list args) {
+    auto it = line_file_map.upper_bound(lineno);
+    if (it == line_file_map.begin()) {
+        fprintf(stderr, ": error: ");
+    } else {
+        --it;
+        fprintf(stderr, "%s:%d: error: ", it->second.first.c_str(),
+                lineno - it->first + it->second.second); }
+    vfprintf(stderr, fmt, args);
+    fprintf(stderr, "\n");
+    fflush(stderr);
+    error_count++; }
+
+// Bison error hook: forwards to error() at the current parser line.
+static void yyerror(const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    error(lineno, fmt, args);
+    va_end(args);
+}
+
+// Parse an already-open assembler file; returns the cumulative error count.
+int asm_parse_file(const char *name, FILE *in) {
+#ifdef YYDEBUG
+    if (const char *p = getenv("YYDEBUG"))
+        yydebug = atoi(p);
+#endif /* YYDEBUG */
+    yyrestart(in);
+    line_file_map[lineno++] = std::make_pair(name, 0);
+    if (yyparse())
+        error_count++;
+    return error_count;
+}
+
+// Parse assembler source from an in-memory string; returns the error count.
+int asm_parse_string(const char* in) {
+    YY_BUFFER_STATE buf;
+#ifdef YYDEBUG
+    if (const char *p = getenv("YYDEBUG"))
+        yydebug = atoi(p);
+#endif /* YYDEBUG */
+    // Reset state in case func is called multiple times
+    BEGIN(INITIAL);
+    buf = yy_scan_string(in);
+    if (yyparse())
+        error_count++;
+    yy_delete_buffer(buf);
+    return error_count;
+}
+
+std::map *Section::sections = 0;
diff --git a/backends/tofino/bf-asm/asm-types.cpp b/backends/tofino/bf-asm/asm-types.cpp
new file mode 100644
index 00000000000..bfcc5e524d8
--- /dev/null
+++ b/backends/tofino/bf-asm/asm-types.cpp
@@ -0,0 +1,320 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "asm-types.h" + +#include +#include + +#include "misc.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +void VECTOR(pair_t)::push_back(const char *s, value_t &&v) { // NOLINT(whitespace/operators) + pair_t entry{{tSTR, v.lineno}, v}; + entry.key.s = strdup(s); + VECTOR_push(*this, entry); + memset(&v, 0, sizeof(v)); +} + +void push_back(VECTOR(pair_t) & m, const char *s, value_t &&v) { // NOLINT(whitespace/operators) + m.push_back(s, std::move(v)); +} + +VECTOR(value_t) & VECTOR(value_t)::add(value_t &&v) { + VECTOR_add(*this, std::move(v)); + return *this; +} +VECTOR(value_t) & VECTOR(value_t)::add(int v) { + value_t tmp{tINT, v}; + VECTOR_add(*this, tmp); + return *this; +} +VECTOR(value_t) & VECTOR(value_t)::add(const char *v) { + value_t tmp{tSTR, -1}; + tmp.s = const_cast(v); + VECTOR_add(*this, tmp); + return *this; +} + +/** check a value and see if it is a list of maps -- if so, concatenate the + * maps into a single map and replace the list with that */ +void collapse_list_of_maps(value_t &v, bool singleton_only) { + if (v.type != tVEC || v.vec.size == 0) return; + for (int i = 0; i < v.vec.size; i++) { + if (v[i].type != tMAP) return; + if (singleton_only && v[i].map.size != 1) return; + } + VECTOR(pair_t) map = v[0].map; + for (int i = 1; i < v.vec.size; i++) { + VECTOR_addcopy(map, v[i].map.data, v[i].map.size); + VECTOR_fini(v[i].map); + } + VECTOR_fini(v.vec); + v.type = tMAP; + v.map = map; +} + +std::unique_ptr toJson(value_t &v) { + switch (v.type) { + case tINT: + return json::mkuniq(v.i); + case tBIGINT: + if (v.bigi.size == 1 && v.bigi.data[0] < INT64_MAX) + return json::mkuniq(v.bigi.data[0]); + // fall through + case tRANGE: + case tMATCH: + return json::mkuniq(value_desc(v)); + case tSTR: + if (v == "true") return json::mkuniq(); + if (v == "false") return json::mkuniq(); + if (v == "null") return std::unique_ptr(); + return 
json::mkuniq(v.s); + case tVEC: + return toJson(v.vec); + case tMAP: + return toJson(v.map); + case tCMD: + return toJson(v.vec); + default: + assert(0); + } + return std::unique_ptr(); +} + +std::unique_ptr toJson(VECTOR(value_t) & v) { + auto rv = json::mkuniq(); + auto &vec = *rv; + for (auto &el : v) vec.push_back(toJson(el)); + return rv; +} + +std::unique_ptr toJson(pair_t &kv) { + auto rv = json::mkuniq(); + auto &map = *rv; + map[toJson(kv.key)] = toJson(kv.value); + return rv; +} + +std::unique_ptr toJson(VECTOR(pair_t) & m) { + auto rv = json::mkuniq(); + auto &map = *rv; + for (auto &kv : m) map[toJson(kv.key)] = toJson(kv.value); + return rv; +} + +bool get_bool(const value_t &v) { + if (v == "true") + return true; + else if (v == "false") + return false; + else if (CHECKTYPE(v, tINT)) + return v.i != 0; + return false; +} + +bitvec get_bitvec(const value_t &v, unsigned max_bits, const char *error_message) { + bitvec bv; + if (CHECKTYPE2(v, tINT, tBIGINT)) { + if (v.type == tINT) { + bv.setraw(v.i); + } else { + if (!v.bigi.size) return bv; + bv.setraw(v.bigi.data, v.bigi.size); + } + } + if (!max_bits) return bv; + int bits = bv.max().index() + 1; + if (error_message && bits > max_bits) error(v.lineno, "%s", error_message); + bv.clrrange(max_bits, bits); + return bv; +} + +uint64_t get_int64(const value_t &v, unsigned max_bits, const char *error_message) { + BUG_CHECK(max_bits <= 64); + bool too_large = false; + uint64_t value = 0; + if (CHECKTYPE2(v, tINT, tBIGINT)) { + if (v.type == tINT) { + value = (uint64_t)v.i; + } else { + if (!v.bigi.size) return 0; + if (sizeof(uintptr_t) == sizeof(uint32_t)) { + value = ((uint64_t)v.bigi.data[1] << 32) + v.bigi.data[0]; + too_large = v.bigi.size > 2; + } else { + BUG_CHECK(sizeof(uintptr_t) == sizeof(uint64_t)); + value = v.bigi.data[0]; + too_large = v.bigi.size > 1; + } + } + } + if (!max_bits) return value; + uint64_t masked = value; + if (max_bits < 64) masked &= (1ULL << max_bits) - 1; + if 
(error_message && (too_large || masked != value)) error(v.lineno, "%s", error_message); + return masked; +} + +static int chkmask(const match_t &m, int maskbits) { + uint64_t mask = bitMask(maskbits); + int shift = 0; + while (mask && ((m.word0 | m.word1) >> shift)) { + if ((mask & m.word0 & m.word1) && (mask & m.word0 & m.word1) != mask) return -1; + mask <<= maskbits; + shift += maskbits; + } + return shift - maskbits; +} + +std::ostream &operator<<(std::ostream &out, match_t m) { + int shift, bits; + if ((shift = chkmask(m, (bits = 4))) >= 0) + out << "0x"; + else if ((shift = chkmask(m, (bits = 3))) >= 0) + out << "0o"; + else if ((shift = chkmask(m, (bits = 1))) >= 0) + out << "0b"; + else if ((shift = chkmask(m, (bits = 0))) == 0) + out << "0b*"; + else + assert(0); + uint64_t mask = bitMask(bits) << shift; + for (; mask; shift -= bits, mask >>= bits) + if (mask & m.word0 & m.word1) + out << '*'; + else + out << "0123456789abcdef"[(m.word1 & mask) >> shift]; + return out; +} + +void print_match(FILE *fp, match_t m) { + std::stringstream tmp; + tmp << m; + fputs(tmp.str().c_str(), fp); +} + +const char *value_type_desc[] = {"integer", "bigint", "range", + "identifier", "match pattern", "big match", + "list", "key: value pairs", "operation"}; + +const char *value_desc(const value_t *p) { + static char buffer[32]; + switch (p->type) { + case tINT: + snprintf(buffer, sizeof(buffer), "%" PRId64 "", p->i); + return buffer; + case tBIGINT: + return ""; + case tRANGE: + snprintf(buffer, sizeof(buffer), "%d..%d", p->range.lo, p->range.hi); + return buffer; + case tMATCH: + return ""; + case tBIGMATCH: + return ""; + case tSTR: + return p->s; + case tVEC: + return ""; + case tMAP: + return ""; + case tCMD: + if (p->vec.size > 0 && p->vec.data[0].type == tSTR) return p->vec.data[0].s; + return ""; + } + assert(false && "unknown value type"); + return ""; +} + +void free_value(value_t *p) { + switch (p->type) { + case tBIGINT: + VECTOR_fini(p->bigi); + break; + case 
tSTR: + free(p->s); + break; + case tVEC: + case tCMD: + VECTOR_foreach(p->vec, free_value); + VECTOR_fini(p->vec); + break; + case tMAP: + VECTOR_foreach(p->map, free_pair); + VECTOR_fini(p->map); + break; + default: + break; + } +} + +bool operator==(const struct value_t &a, const struct value_t &b) { + int i; + if (a.type != b.type) { + if (a.type == tINT && b.type == tBIGINT) { + if (a.i < 0 || (size_t)a.i != b.bigi.data[0]) return false; + for (i = 1; i < b.bigi.size; i++) + if (b.bigi.data[i]) return false; + return true; + } else if (a.type == tBIGINT && b.type == tINT) { + if (b.i < 0 || (size_t)b.i != a.bigi.data[0]) return false; + for (i = 1; i < a.bigi.size; i++) + if (a.bigi.data[i]) return false; + return true; + } + return false; + } + switch (a.type) { + case tINT: + return a.i == b.i; + case tBIGINT: + for (i = 0; i < a.bigi.size && i < b.bigi.size; i++) + if (a.bigi.data[i] != b.bigi.data[i]) return false; + for (; i < a.bigi.size; i++) + if (a.bigi.data[i]) return false; + for (; i < b.bigi.size; i++) + if (b.bigi.data[i]) return false; + return true; + case tRANGE: + return a.range.lo == b.range.lo && a.range.hi == b.range.hi; + case tSTR: + return !strcmp(a.s, b.s); + case tMATCH: + return a.m.word0 == b.m.word0 && a.m.word1 == b.m.word1; + case tVEC: + case tCMD: + if (a.vec.size != b.vec.size) return false; + for (int i = 0; i < a.vec.size; i++) + if (a.vec.data[i] != b.vec.data[i]) return false; + return true; + case tMAP: + if (a.map.size != b.map.size) return false; + for (int i = 0; i < a.map.size; i++) { + if (a.map.data[i].key != b.map.data[i].key) return false; + if (a.map.data[i].value != b.map.data[i].value) return false; + } + return true; + case tBIGMATCH: + default: + break; + } + assert(false && "unknown value type"); + return ""; +} +#pragma GCC diagnostic pop diff --git a/backends/tofino/bf-asm/asm-types.h b/backends/tofino/bf-asm/asm-types.h new file mode 100644 index 00000000000..994e5f74e10 --- /dev/null +++ 
b/backends/tofino/bf-asm/asm-types.h @@ -0,0 +1,494 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_ASM_TYPES_H_ +#define BACKENDS_TOFINO_BF_ASM_ASM_TYPES_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "backends/tofino/bf-asm/json.h" +#include "backends/tofino/bf-asm/map.h" +#include "bfas.h" +#include "lib/bitops.h" +#include "lib/bitvec.h" +#include "mask_counter.h" +#include "vector.h" + +enum gress_t { INGRESS, EGRESS, GHOST, NUM_GRESS_T }; + +/* All timing related uses combine the INGRESS and GHOST threads (they run in lockstep), so + * we remap GHOST->INGRESS when dealing with timing */ +inline gress_t timing_thread(gress_t gress) { return gress == GHOST ? INGRESS : gress; } +/* imem similarly shares color between INGRESS and GHOST */ +inline gress_t imem_thread(gress_t gress) { return gress == GHOST ? 
INGRESS : gress; } + +struct match_t { + uint64_t word0, word1; +#ifdef __cplusplus + operator bool() const { return (word0 | word1) != 0; } + bool operator==(const match_t &a) const { return word0 == a.word0 && word1 == a.word1; } + bool matches(uint64_t v) const { + return (v | word1) == word1 && ((~v & word1) | word0) == word0; + } + bool matches(const match_t &v) const { + assert(0); + return false; + } + unsigned dirtcam(unsigned width, unsigned bit); +#endif /* __cplusplus */ +}; + +DECLARE_VECTOR(match_t); + +struct wmatch_t { + bitvec word0, word1; +#ifdef __cplusplus + wmatch_t() = default; + wmatch_t(const wmatch_t &) = default; + wmatch_t(wmatch_t &&) = default; + wmatch_t &operator=(const wmatch_t &) = default; + wmatch_t &operator=(wmatch_t &&) = default; + wmatch_t(const match_t &v) : word0(v.word0), word1(v.word1) {} // NOLINT(runtime/explicit) + wmatch_t(const VECTOR(match_t) & v) { // NOLINT(runtime/explicit) + for (int i = 0; i < v.size; ++i) { + word0.putrange(i * 64, 64, v.data[i].word0); + word1.putrange(i * 64, 64, v.data[i].word1); + } + } + operator bool() const { return word0 || word1; } + bool operator==(const wmatch_t &a) const { return word0 == a.word0 && word1 == a.word1; } + bool matches(bitvec v) const { return (v | word1) == word1 && ((word1 - v) | word0) == word0; } + bool matches(const wmatch_t &v) const { + assert(0); + return false; + } + unsigned dirtcam(unsigned width, unsigned bit); +#endif /* __cplusplus */ +}; + +enum value_type { tINT, tBIGINT, tRANGE, tSTR, tMATCH, tBIGMATCH, tVEC, tMAP, tCMD }; +extern const char *value_type_desc[]; + +struct value_t; +struct pair_t; +#ifdef __cplusplus +DECLARE_VECTOR( + value_t, value_t &operator[](int) const; value_t & back() const; + value_t * begin() const { return data; } value_t * end() const; value_t & front() const; + VECTOR(value_t) & add(value_t &&); VECTOR(value_t) & add(int); + VECTOR(value_t) & add(const char *);) +DECLARE_VECTOR( + pair_t, void push_back(const char *, 
value_t &&); // NOLINT(whitespace/operators) + pair_t & operator[](int) const; pair_t * operator[](const char *) const; pair_t & back() const; + pair_t * begin() const { return data; } pair_t * end() const; pair_t & front() const;) +#else +DECLARE_VECTOR(value_t) +DECLARE_VECTOR(pair_t) +#endif /* __cplusplus */ +DECLARE_VECTOR(uintptr_t); + +struct value_t { + enum value_type type; + int lineno; + union { + int64_t i; + VECTOR(uintptr_t) bigi; + struct { + int lo; + int hi; + } range; + char *s; + match_t m; + VECTOR(match_t) bigm; + VECTOR(value_t) vec; + VECTOR(pair_t) map; + }; +#ifdef __cplusplus + value_t &operator[](int i) const { + assert(type == tVEC || type == tCMD); + return vec[i]; + } + bool startsWith(const char *pfx) const { + if (type == tSTR) return strncmp(s, pfx, strlen(pfx)) == 0; + if (type == tCMD && vec.size > 0 && vec[0].type == tSTR) + return strncmp(vec[0].s, pfx, strlen(pfx)) == 0; + return false; + } + bool checkSize() const { + if (type == tVEC) return (vec.size > 0); + if (type == tMAP) return (map.size > 0); + if (type == tCMD) return (vec.size > 0); + return true; + } +#endif /* __cplusplus */ +}; + +struct pair_t { + struct value_t key, value; +#ifdef __cplusplus + pair_t() = default; + pair_t(const value_t &k, const value_t &v) : key(k), value(v) {} +#endif /* __cplusplus */ +}; + +void free_value(value_t *p); +const char *value_desc(const value_t *v); +static inline void free_pair(pair_t *p) { + free_value(&p->key); + free_value(&p->value); +} +bool get_bool(const value_t &v); + +// If max_bits is zero, no testing or masking is carried out. +// If error_message is set, values larger than max_bits will error, otherwise the value is masked. 
+bitvec get_bitvec(const value_t &v, unsigned max_bits = 0, const char *error_message = nullptr); +uint64_t get_int64(const value_t &v, unsigned max_bits = 0, const char *error_message = nullptr); + +#ifdef __cplusplus +bool operator==(const struct value_t &, const struct value_t &); +inline bool operator==(const struct value_t &a, const char *b) { + if (a.type == tCMD && a.vec.size > 0 && a[0].type == tSTR) return !strcmp(a[0].s, b); + return a.type == tSTR && !strcmp(a.s, b); +} +inline bool operator==(const char *a, const struct value_t &b) { + if (b.type == tCMD && b.vec.size > 0 && b[0].type == tSTR) return !strcmp(a, b[0].s); + return b.type == tSTR && !strcmp(a, b.s); +} +inline bool operator==(const struct value_t &a, int b) { return a.type == tINT && a.i == b; } +inline bool operator==(int a, const struct value_t &b) { return b.type == tINT && a == b.i; } + +inline const char *value_desc(const value_t &v) { return value_desc(&v); } + +template +inline bool operator!=(A a, B b) { + return !(a == b); +} + +inline value_t &VECTOR(value_t)::operator[](int i) const { + assert(i >= 0 && i < size); + return data[i]; +} +inline pair_t &VECTOR(pair_t)::operator[](int i) const { + assert(i >= 0 && i < size); + return data[i]; +} +inline pair_t *VECTOR(pair_t)::operator[](const char *k) const { + for (int i = 0; i < size; i++) + if (data[i].key == k) return &data[i]; + return 0; +} +inline value_t *VECTOR(value_t)::end() const { return data + size; } +inline value_t &VECTOR(value_t)::front() const { + assert(0 < size); + return data[0]; +} +inline value_t &VECTOR(value_t)::back() const { + assert(0 < size); + return data[size - 1]; +} +inline pair_t *VECTOR(pair_t)::end() const { return data + size; } +inline pair_t &VECTOR(pair_t)::front() const { + assert(0 < size); + return data[0]; +} +inline pair_t &VECTOR(pair_t)::back() const { + assert(0 < size); + return data[size - 1]; +} + +/* can't call VECTOR(pair_t)::push_back directly except from the compilation unit 
where + * it is defined, due to gcc bug. Workaround via global function */ +extern void push_back(VECTOR(pair_t) & m, const char *s, + value_t &&v); // NOLINT(whitespace/operators) + +inline void fini(value_t &v) { free_value(&v); } +inline void fini(pair_t &p) { free_pair(&p); } +inline void fini(VECTOR(value_t) & v) { + VECTOR_foreach(v, free_value); + VECTOR_fini(v); +} +inline void fini(VECTOR(pair_t) & v) { + VECTOR_foreach(v, free_pair); + VECTOR_fini(v); +} +void collapse_list_of_maps(value_t &, bool singleton_only = false); + +std::unique_ptr toJson(value_t &); +std::unique_ptr toJson(VECTOR(value_t) &); +std::unique_ptr toJson(pair_t &); +std::unique_ptr toJson(VECTOR(pair_t) &); + +#endif /* __cplusplus */ + +#define CHECKTYPE(V, T) \ + ((V).type == (T) || (error((V).lineno, "Syntax error, expecting %s", value_type_desc[T]), 0)) +#define CHECKTYPESIZE(V, T) \ + (CHECKTYPE(V, T) && \ + ((V).checkSize() || (error((V).lineno, "Syntax error, empty %s", value_type_desc[T]), 0))) +#define PCHECKTYPE(P, V, T) \ + (((P) && (V).type == (T)) || \ + (error((V).lineno, "Syntax error, expecting %s", value_type_desc[T]), 0)) +#define CHECKTYPEM(V, T, M) \ + ((V).type == (T) || (error((V).lineno, "Syntax error, expecting %s", M), 0)) +#define CHECKTYPEPM(V, T, P, M) \ + (((V).type == (T) && (P)) || (error((V).lineno, "Syntax error, expecting %s", M), 0)) +#define PCHECKTYPEM(P, V, T, M) \ + (((P) && (V).type == (T)) || (error((V).lineno, "Syntax error, expecting %s", M), 0)) +#define CHECKTYPE2(V, T1, T2) \ + ((V).type == (T1) || (V).type == (T2) || \ + (error((V).lineno, "Syntax error, expecting %s or %s but got %s", value_type_desc[T1], \ + value_type_desc[T2], value_desc(V)), \ + 0)) +#define CHECKTYPE3(V, T1, T2, T3) \ + ((V).type == (T1) || (V).type == (T2) || (V).type == (T3) || \ + (error((V).lineno, "Syntax error, expecting %s or %s or %s", value_type_desc[T1], \ + value_type_desc[T2], value_type_desc[T3]), \ + 0)) +#define PCHECKTYPE2(P, V, T1, T2) \ + (((P) && 
((V).type == (T1) || (V).type == (T2))) || \ + (error((V).lineno, "Syntax error, expecting %s or %s", value_type_desc[T1], \ + value_type_desc[T2]), \ + 0)) +#define CHECKTYPE2M(V, T1, T2, M) \ + ((V).type == (T1) || (V).type == (T2) || \ + (error((V).lineno, "Syntax error, expecting %s but got %s", M, value_desc(V)), 0)) +#define PCHECKTYPE2M(P, V, T1, T2, M) \ + (((P) && ((V).type == (T1) || (V).type == (T2))) || \ + (error((V).lineno, "Syntax error, expecting %s", M), 0)) +#define VALIDATE_RANGE(V) \ + ((V).type != tRANGE || (V).range.lo <= (V).range.hi || \ + (error((V).lineno, "Invalid range %d..%d", (V).range.lo, (V).range.hi), 0)) + +inline value_t *get(VECTOR(pair_t) & map, const char *key) { + for (auto &kv : map) + if (kv.key == key) return &kv.value; + return 0; +} +inline const value_t *get(const VECTOR(pair_t) & map, const char *key) { + for (auto &kv : map) + if (kv.key == key) return &kv.value; + return 0; +} + +#ifdef __cplusplus + +template +inline void parse_vector(std::vector &vec, const VECTOR(value_t) & data) { + for (auto &v : data) vec.emplace_back(v); +} +template <> +inline void parse_vector(std::vector &vec, const VECTOR(value_t) & data) { + for (auto &v : data) + if (CHECKTYPE(v, tINT)) vec.push_back(v.i); +} +template <> +inline void parse_vector(std::vector &vec, const VECTOR(value_t) & data) { + for (auto &v : data) + if (CHECKTYPE(v, tINT)) vec.push_back(v.i); +} +template <> +inline void parse_vector(std::vector &vec, const VECTOR(value_t) & data) { + for (auto &v : data) + if (CHECKTYPE(v, tSTR)) vec.emplace_back(v.s); +} +template +inline void parse_vector(std::vector &vec, const value_t &data) { + if (data.type == tVEC) + parse_vector(vec, data.vec); + else + vec.emplace_back(data); +} +template <> +inline void parse_vector(std::vector &vec, const value_t &data) { + if (CHECKTYPE2(data, tINT, tVEC)) { + if (data.type == tVEC) + parse_vector(vec, data.vec); + else + vec.push_back(data.i); + } +} +template <> +inline void 
parse_vector(std::vector &vec, const value_t &data) { + if (CHECKTYPE2(data, tINT, tVEC)) { + if (data.type == tVEC) + parse_vector(vec, data.vec); + else + vec.push_back(data.i); + } +} +template <> +inline void parse_vector(std::vector &vec, const value_t &data) { + if (CHECKTYPE2(data, tSTR, tVEC)) { + if (data.type == tVEC) + parse_vector(vec, data.vec); + else + vec.push_back(data.s); + } +} + +std::ostream &operator<<(std::ostream &out, match_t m); +void print_match(FILE *fp, match_t m); + +inline std::ostream &operator<<(std::ostream &out, gress_t gress) { + switch (gress) { + case INGRESS: + out << "ingress"; + break; + case EGRESS: + out << "egress"; + break; + case GHOST: + out << "ghost"; + break; + default: + out << "(invalid gress " << static_cast(gress) << ")"; + } + return out; +} + +template +inline std::string to_string(T val) { + std::stringstream tmp; + tmp << val; + return tmp.str(); +} + +class MapIterChecked { + /* Iterate through a map (VECTOR(pair_t)), giving errors for non-string and + * duplicate keys (and skipping them) */ + const VECTOR(pair_t) & map; + bool allow; // allow non-string keys + std::set duplicates_allowed; + std::map keys_seen; + class iter { + MapIterChecked *self; + pair_t *p; + void check() { + while (p != self->map.end()) { + if (self->allow && p->key.type != tSTR) break; + if (!CHECKTYPE(p->key, tSTR)) { + p++; + continue; + } + if (self->duplicates_allowed.count(p->key.s)) break; + if (self->keys_seen.count(p->key.s)) { + error(p->key.lineno, "Duplicate element %s", p->key.s); + warning(self->keys_seen[p->key.s], "previous element %s", p->key.s); + p++; + continue; + } + self->keys_seen[p->key.s] = p->key.lineno; + break; + } + } + + public: + iter(MapIterChecked *s, pair_t *p_) : self(s), p(p_) { check(); } + pair_t &operator*() const { return *p; } + pair_t *operator->() const { return p; } + bool operator==(iter &a) const { return p == a.p; } + iter &operator++() { + p++; + check(); + return *this; + } + }; + + 
public: + explicit MapIterChecked(const VECTOR(pair_t) & map_, bool o = false, + const std::set &dup = {}) + : map(map_), allow(o), duplicates_allowed(dup) {} + MapIterChecked(const VECTOR(pair_t) & map_, const std::set &dup) + : map(map_), allow(false), duplicates_allowed(dup) {} + iter begin() { return iter(this, map.begin()); } + iter end() { return iter(this, map.end()); } +}; + +class MatchIter { + /* Iterate through the integers that match a match_t */ + match_t m; + class iter : public MaskCounter { + MatchIter *self; + + public: + explicit iter(MatchIter *s) : MaskCounter(s->m.word0 & s->m.word1), self(s) { + if (!(self->m.word1 | self->m.word0)) overflow(); + } + unsigned operator*() const { + return this->operator unsigned() | (self->m.word1 & ~self->m.word0); + } + iter &end() { + overflow(); + return *this; + } + }; + + public: + explicit MatchIter(match_t m_) : m(m_) {} + iter begin() { return iter(this); } + iter end() { return iter(this).end(); } +}; + +class SrcInfo { + int lineno; + friend std::ostream &operator<<(std::ostream &, const SrcInfo &); + + public: + explicit SrcInfo(int l) : lineno(l) {} +}; + +struct RegisterSetBase { + virtual ~RegisterSetBase() = default; +}; + +struct ParserRegisterSet : public RegisterSetBase {}; + +/// An interface for parsing a section of a .bfa file +class Parsable { + public: + /// @param data entire map/sequence of elements + virtual void input(VECTOR(value_t) args, value_t data) = 0; + virtual ~Parsable() = default; +}; + +/// An interface for writing into registers +class Configurable { + public: + virtual void write_config(RegisterSetBase ®s, json::map &json, bool legacy = true) = 0; + virtual ~Configurable() = default; +}; + +/// An interface for generating context.json +class Contextable { + public: + virtual void output(json::map &ctxtJson) = 0; + virtual ~Contextable() = default; +}; + +#endif /* __cplusplus */ + +#endif /* BACKENDS_TOFINO_BF_ASM_ASM_TYPES_H_ */ diff --git 
a/backends/tofino/bf-asm/atcam_match.cpp b/backends/tofino/bf-asm/atcam_match.cpp new file mode 100644 index 00000000000..4b36885f439 --- /dev/null +++ b/backends/tofino/bf-asm/atcam_match.cpp @@ -0,0 +1,558 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "action_bus.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" +#include "lib/hex.h" +#include "misc.h" + +void AlgTcamMatchTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::MatchEntry); + for (auto &kv : MapIterChecked(data, {"meter", "stats", "stateful"})) { + if (common_setup(kv, data, P4Table::MatchEntry)) { + } else if (kv.key == "number_partitions") { + if (CHECKTYPE(kv.value, tINT)) number_partitions = kv.value.i; + } else if (kv.key == "partition_field_name") { + if (CHECKTYPE(kv.value, tSTR)) { + partition_field_name = kv.value.s; + if (auto *p = find_p4_param(partition_field_name)) + if (!p->key_name.empty()) partition_field_name = p->key_name; + } + } else if (kv.key == "subtrees_per_partition") { + if (CHECKTYPE(kv.value, tINT)) max_subtrees_per_partition = kv.value.i; + } else if (kv.key == "bins_per_partition") { + if (CHECKTYPE(kv.value, tINT)) bins_per_partition = kv.value.i; + } else if (kv.key == "atcam_subset_width") { + if 
(CHECKTYPE(kv.value, tINT)) atcam_subset_width = kv.value.i; + } else if (kv.key == "shift_granularity") { + if (CHECKTYPE(kv.value, tINT)) shift_granularity = kv.value.i; + } else if (kv.key == "search_bus" || kv.key == "result_bus") { + // already dealt with in Table::setup_layout via common_init_setup + } else { + common_sram_setup(kv, data); + } + } + common_sram_checks(); +} + +// TODO: This could probably be rewritten in a simpler way. Below +// function checks the ways extracted from assembly for atcam and assumes the +// way no's are not sorted with column priority. Therefore the code sorts the +// first ram column and sets the column priority based on this column. Then this +// ordering is used to check if column priority is maintained if the ways are +// traversed in this column priority order for all other columns +void AlgTcamMatchTable::setup_column_priority() { + int no_ways = ways.size(); + int no_entries_per_way = ways[0].rams.size(); + // FIXME-P4C: Ideally RAM's 6 & 7 can be on both left and right RAM Units. + // Brig currently does not support this behavior and RAM 6 is always on + // left, while RAM 7 on right. 
Once this supported is added below function + // must be modified accordingly to accommodate these rams in lrams and rrams + // and the traversal mechanism must be changed to determine column priority + std::set lrams = {2, 3, 4, 5, 6}; + std::set rrams = {7, 8, 9, 10, 11}; + // Check if column is on left(0) or right(1) RAMs + + std::vector> first_entry_priority; + // Determine the side and which way corresponds to which column + int side = -1; + for (int w = 0; w < no_ways; w++) { + int col = ways[w].rams[0].col; + int row = ways[w].rams[0].row; + if (side == 0) { + if (lrams.find(col) == lrams.end()) { + error(lineno, + "ram(%d, %d) is not on correct side compared to rest in column " + "priority", + row, col); + } + } else if (side == 1) { + if (rrams.find(col) == rrams.end()) { + error(lineno, + "ram(%d, %d) is not on correct side compare to rest of column " + "priority", + row, col); + } + } else if (lrams.find(col) != lrams.end()) { + side = 0; + } else if (rrams.find(col) != rrams.end()) { + side = 1; + } else { + error(lineno, "ram(%d, %d) invalid for ATCAM", row, col); + } + first_entry_priority.push_back(std::make_pair(w, col)); + } + + // Sort ways based on column priority for first column + std::sort(first_entry_priority.begin(), first_entry_priority.end(), + [side](const std::pair &a, const std::pair &b) { + return side == 0 ? 
a.second < b.second : a.second > b.second; + }); + + int index = 0; + for (auto &entry : first_entry_priority) { + col_priority_way[index] = entry.first; + index++; + } + + // Ensure that the remaining columns match up with the first column ram + for (int i = 1; i < no_entries_per_way; i++) { + auto way_it = col_priority_way.begin(); + side = -1; + int prev_col = -1; + int prev_row = -1; + while (way_it != col_priority_way.end()) { + int row = ways[way_it->second].rams[i].row; + int col = ways[way_it->second].rams[i].col; + if (way_it != col_priority_way.begin()) { + if (!(((side == 0 && prev_col < col && lrams.find(col) != lrams.end()) || + (side == 1 && prev_col > col && rrams.find(col) != rrams.end())) && + prev_row == row)) { + error(lineno, + "ram(%d, %d) and ram(%d, %d) column priority is not " + "compatible", + prev_row, prev_col, row, col); + } + } + way_it++; + prev_col = col; + prev_row = row; + if (lrams.find(col) != lrams.end()) + side = 0; + else if (rrams.find(col) != rrams.end()) + side = 1; + else + error(lineno, "ram(%d, %d) invalid for ATCAM", row, col); + } + } +} + +/** + * Guarantees that the order of the entries provided in the ATCAM table format are order + * in HW priority 0-4, (where in HW entry 4 will be favored). This is required to guarantee that + * the entries in the ATCAM pack format are in priority order for the driver. 
+ * + * @seealso bf-p4c/mau/table_format.cpp - no_overhead_atcam_result_bus_words + */ +void AlgTcamMatchTable::verify_entry_priority() { + int result_bus_word = -1; + for (int i = 0; i < static_cast(group_info.size()); i++) { + BUG_CHECK(group_info[i].result_bus_word >= 0); + if (result_bus_word == -1) { + result_bus_word = group_info[i].result_bus_word; + } else if (result_bus_word != group_info[i].result_bus_word) { + error(format->lineno, "ATCAM tables can at most have only one overhead word"); + return; + } + auto mg_it = group_info[i].match_group.find(result_bus_word); + if (mg_it == group_info[i].match_group.end() || mg_it->second != i) { + error(format->lineno, + "Each ATCAM entry must coordinate its entry with the " + "correct priority"); + return; + } + } + + if (word_info[result_bus_word].size() != group_info.size()) { + error(format->lineno, "ATCAM tables do not chain to the same overhead word"); + return; + } + + for (int i = 0; i < static_cast(word_info[result_bus_word].size()); i++) { + if (i != word_info[result_bus_word][i]) { + error(format->lineno, "ATCAM priority not correctly formatted in the compiler"); + return; + } + } +} + +/** + * @seealso bf-p4c/mau/table_format.cpp - no_overhead_atcam_result_bus_words. 
This matches + * this function exactly + */ +void AlgTcamMatchTable::no_overhead_determine_result_bus_usage() { + int result_bus_word = -1; + int shared_groups = 0; + for (int i = group_info.size() - 1; i >= 0; i--) { + if (result_bus_word == -1) { + result_bus_word = group_info[i].match_group.begin()->first; + } + bool is_shared_group = false; + + if (group_info[i].match_group.size() > 1) + is_shared_group = true; + else if (group_info[i].match_group.begin()->first != result_bus_word) + is_shared_group = true; + + if (is_shared_group) { + if (i > 1) error(format->lineno, "ATCAM chaining of shared groups is not correct"); + shared_groups++; + } + + group_info[i].result_bus_word = result_bus_word; + group_info[i].match_group[result_bus_word] = i; + } + + word_info[result_bus_word].clear(); + for (int i = 0; i < static_cast(group_info.size()); i++) { + word_info[result_bus_word].push_back(i); + } + + if (shared_groups > 2) + error(format->lineno, "ATCAM cannot safely send hit signals to same result bus"); +} + +void AlgTcamMatchTable::verify_format(Target::Tofino targ) { + SRamMatchTable::verify_format(targ); + if (!error_count) verify_entry_priority(); +} + +void AlgTcamMatchTable::pass1() { + LOG1("### ATCAM match table " << name() << " pass1 " << loc()); + SRamMatchTable::pass1(); + if (format) { + setup_column_priority(); + find_tcam_match(); + } +} + +void AlgTcamMatchTable::setup_nibble_mask(Table::Format::Field *match, int group, + std::map &elems, bitvec &mask) { + for (auto &el : Values(elems)) { + int bit = match->bit(el.offset); + if (match->hi(bit) < bit + el.width - 1) + error(el.field->lineno, "match bits for %s not contiguous in match(%d)", + el.field->desc().c_str(), group); + // Determining the nibbles dedicated to s0q1 or s1q0 + int start_bit = bit; + int end_bit = start_bit + el.width - 1; + int start_nibble = start_bit / 4U; + int end_nibble = end_bit / 4U; + mask.setrange(start_nibble, end_nibble - start_nibble + 1); + } +} + +void 
AlgTcamMatchTable::find_tcam_match() { + std::map exact; + std::map> tcam; + unsigned off = 0; + /* go through the match fields and find duplicates -- those are the tcam matches */ + for (auto match_field : match) { + auto phv_p = dynamic_cast(match_field); + if (phv_p == nullptr) { + BUG(); + continue; + } + auto phv_ref = *phv_p; + auto sl = *phv_ref; + if (!sl) continue; + if (exact.count(sl)) { + if (tcam.count(sl)) + error(phv_ref.lineno, "%s appears more than twice in atcam match", + phv_ref.desc().c_str()); + if ((sl.size() % 4U) != 0) { + if ((sl.size() == 1) && (phv_ref.desc().find("$valid") != std::string::npos)) { + } else + warning(phv_ref.lineno, "tcam match field %s not a multiple of 4 bits", + phv_ref.desc().c_str()); + } + tcam.emplace(sl, std::make_pair(exact.at(sl), + match_element{new Phv::Ref(phv_ref), off, sl->size()})); + exact.erase(sl); + } else { + exact.emplace(sl, match_element{new Phv::Ref(phv_ref), off, sl->size()}); + } + off += sl.size(); + } + for (auto e : exact) + for (auto t : tcam) + if (e.first.overlaps(t.first)) + error(e.second.field->lineno, "%s overlaps %s in atcam match", + e.second.field->desc().c_str(), t.second.first.field->desc().c_str()); + if (error_count > 0) return; + + /* for the tcam pairs, treat first as s0q1 and second as s1q0 */ + for (auto &el : Values(tcam)) { + s0q1[el.first.offset] = el.first; + s1q0[el.second.offset] = el.second; + } + /* now find the bits in each group that match with the tcam pairs, ensure that they + * are nibble-aligned, and setup the nibble masks */ + for (unsigned i = 0; i < format->groups(); i++) { + if (Format::Field *match = format->field("match", i)) { + setup_nibble_mask(match, i, s0q1, s0q1_nibbles); + setup_nibble_mask(match, i, s1q0, s1q0_nibbles); + if (!(s0q1_nibbles & s1q0_nibbles).empty()) + error(format->lineno, "Cannot determine if a ternary nibble is s0q1 or s1q0"); + } else { + error(format->lineno, "no 'match' field in format group %d", i); + } + } +} + +void 
AlgTcamMatchTable::pass2() { + LOG1("### ATCAM match table " << name() << " pass2 " << loc()); + if (logical_id < 0) choose_logical_id(); + for (auto &ixb : input_xbar) ixb->pass2(); + setup_word_ixbar_group(); + ixbar_subgroup.resize(word_ixbar_group.size()); + ixbar_mask.resize(word_ixbar_group.size()); + // FIXME -- need a method of specifying these things in the asm code? + // FIXME -- should at least check that these are sane + for (unsigned i = 0; i < word_ixbar_group.size(); ++i) { + if (word_ixbar_group[i] < 0) { + // Word with no match data, only version/valid; used for direct lookup + // tables -- can it happen with an atcam table? + continue; + } + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + bitvec ixbar_use = input_xbar[0]->hash_group_bituse(word_ixbar_group[i]); + // Which 10-bit address group to use for this word -- use the lowest one with + // a bit set in the hash group. Can it be different for different words? + ixbar_subgroup[i] = ixbar_use.min().index() / EXACT_HASH_ADR_BITS; + // Assume that any hash bits usuable for select are used for select + ixbar_mask[i] = ixbar_use.getrange(EXACT_HASH_FIRST_SELECT_BIT, EXACT_HASH_SELECT_BITS); + } + if (actions) actions->pass2(this); + if (action_bus) action_bus->pass2(this); + if (gateway) gateway->pass2(); + if (idletime) idletime->pass2(); + if (format) format->pass2(this); + for (auto &hd : hash_dist) hd.pass2(this); +} + +void AlgTcamMatchTable::pass3() { + LOG1("### ATCAM match table " << name() << " pass3 " << loc()); + SRamMatchTable::pass3(); + if (action_bus) action_bus->pass3(this); +} + +template +void AlgTcamMatchTable::write_regs_vt(REGS ®s) { + LOG1("### ATCAM match table " << name() << " write_regs " << loc()); + SRamMatchTable::write_regs(regs); + + for (auto &row : layout) { + auto &rams_row = regs.rams.array.row[row.row]; + for (auto &ram : row.memunits) { + auto &way = way_map[ram]; + BUG_CHECK(ram.stage == INT_MIN && ram.row == row.row, "bogus %s in 
row %d", ram.desc(), + row.row); + auto &ram_cfg = rams_row.ram[ram.col]; + ram_cfg.match_nibble_s0q1_enable = version_nibble_mask.getrange(way.word * 32U, 32) & + ~s1q0_nibbles.getrange(way.word * 32U, 32); + ram_cfg.match_nibble_s1q0_enable = + 0xffffffffUL & ~s0q1_nibbles.getrange(way.word * 32U, 32); + } + } +} + +std::unique_ptr AlgTcamMatchTable::gen_memory_resource_allocation_tbl_cfg() const { + if (col_priority_way.size() == 0) + error(lineno, "No column priority determined for table %s", name()); + unsigned fmt_width = format ? (format->size + 127) / 128 : 0; + json::vector mras; + for (auto &entry : col_priority_way) { + json::map mra; + mra["column_priority"] = entry.first; + json::vector mem_units; + json::vector &mem_units_and_vpns = mra["memory_units_and_vpns"] = json::vector(); + auto &way = ways[entry.second]; + unsigned vpn_ctr = 0; + for (auto &ram : way.rams) { + if (mem_units.empty()) + vpn_ctr = layout_get_vpn(ram); + else + BUG_CHECK(vpn_ctr == layout_get_vpn(ram)); + mem_units.push_back(json_memunit(ram)); + if (mem_units.size() == fmt_width) { + json::map tmp; + tmp["memory_units"] = std::move(mem_units); + mem_units = json::vector(); + json::vector vpns; + // Because the entries in the context JSON are reversed, the VPNs have to + // be reversed as well + for (unsigned i = 0; i < format->groups(); i++) { + vpns.push_back(vpn_ctr + format->groups() - 1 - i); + } + vpn_ctr += format->groups(); + tmp["vpns"] = std::move(vpns); + mem_units_and_vpns.push_back(std::move(tmp)); + } + } + BUG_CHECK(mem_units.empty()); + mras.push_back(std::move(mra)); + } + return json::mkuniq(std::move(mras)); +} + +std::string AlgTcamMatchTable::get_match_mode(const Phv::Ref &pref, int offset) const { + for (auto &p : s0q1) { + if ((p.first == offset) && (*p.second.field == pref)) return "s0q1"; + } + for (auto &p : s1q0) { + if ((p.first == offset) && (*p.second.field == pref)) return "s1q0"; + } + return "unused"; +} + +void 
AlgTcamMatchTable::gen_unit_cfg(json::vector &units, int size) const { + json::map tbl; + tbl["direction"] = P4Table::direction_name(gress); + tbl["handle"] = + p4_table ? is_alpm() ? p4_table->get_alpm_atcam_table_handle() : p4_table->get_handle() : 0; + tbl["name"] = name(); + tbl["size"] = size; + tbl["table_type"] = "match"; + json::map &stage_tbl = + *add_common_sram_tbl_cfgs(tbl, "algorithmic_tcam_unit", "algorithmic_tcam_match"); + // Assuming atcam next hit table cannot be multiple tables + stage_tbl["default_next_table"] = + !hit_next.empty() ? hit_next[0].next_table_id() : Target::END_OF_PIPE(); + stage_tbl["memory_resource_allocation"] = gen_memory_resource_allocation_tbl_cfg(); + // Hash functions not necessary currently for ATCAM matches, as the result comes from + // the partition_field_name + stage_tbl["hash_functions"] = json::vector(); + add_pack_format(stage_tbl, format.get(), false); + units.push_back(std::move(tbl)); +} + +bool AlgTcamMatchTable::has_directly_attached_synth2port() const { + auto mt = this; + if (auto a = mt->get_attached()) { + if (a->selector && is_directly_referenced(a->selector)) return true; + for (auto &m : a->meters) { + if (is_directly_referenced(m)) return true; + } + for (auto &s : a->stats) { + if (is_directly_referenced(s)) return true; + } + for (auto &s : a->statefuls) { + if (is_directly_referenced(s)) return true; + } + } + return false; +} + +void AlgTcamMatchTable::gen_alpm_cfg(json::map &tbl) const { + tbl["default_action_handle"] = get_default_action_handle(); + tbl["action_profile"] = action_profile(); + // FIXME -- setting next_table_mask unconditionally only works because we process the + // stage table in stage order (so we'll end up with the value from the last stage table, + // which is what we want.) Should we check in case the ordering ever changes? 
+ tbl["default_next_table_mask"] = next_table_adr_mask; + // FIXME -- the driver currently always assumes this is 0, so we arrange for it to be + // when choosing the action encoding. But we should be able to choose something else + tbl["default_next_table_default"] = 0; + // FIXME-JSON: PD related, check glass examples for false (ALPM) + tbl["is_resource_controllable"] = true; + tbl["uses_range"] = false; + if (p4_table && p4_table->disable_atomic_modify) tbl["disable_atomic_modify"] = true; + tbl["ap_bind_indirect_res_to_match"] = json::vector(); + tbl["static_entries"] = json::vector(); + if (context_json) { + add_json_node_to_table(tbl, "ap_bind_indirect_res_to_match"); + } + LOG1("populate alpm " << name()); + // FIXME-DRIVER + // 'actions' and 'table_refs' on the alpm are redundant as they are + // already present in the atcam table. These should probably be cleaned + // up from the context json and driver parsing. + if (actions) { + actions->gen_tbl_cfg(tbl["actions"]); + } else if (action && action->actions) { + action->actions->gen_tbl_cfg(tbl["actions"]); + } + add_all_reference_tables(tbl); + json::map &alpm_match_attributes = tbl["match_attributes"]; + alpm_match_attributes["max_subtrees_per_partition"] = max_subtrees_per_partition; + alpm_match_attributes["partition_field_name"] = get_partition_field_name(); + alpm_match_attributes["lpm_field_name"] = get_lpm_field_name(); + alpm_match_attributes["bins_per_partition"] = bins_per_partition; + alpm_match_attributes["atcam_subset_width"] = atcam_subset_width; + alpm_match_attributes["shift_granularity"] = shift_granularity; + if (context_json) { + add_json_node_to_table(alpm_match_attributes, "excluded_field_msb_bits"); + } + auto pa_hdl = get_partition_action_handle(); + // Throw an error if partition action handle is not set. The alpm + // pre-classifier should have a single action which sets the partition + // handle. 
If no handle is present, it is either not generated by the + // compiler or assembler is not able to find it within actions. In + // either case this is a problem as driver will error out + if (pa_hdl.empty()) + error(lineno, "Cannot find partition action handle for ALPM table %s", name()); + // backward-compatible mode + if (pa_hdl.size() == 1) { + alpm_match_attributes["set_partition_action_handle"] = *pa_hdl.begin(); + } else { + json::vector &action_handles = alpm_match_attributes["set_partition_action_handle"] = + json::vector(); + for (auto hdl : pa_hdl) action_handles.push_back(hdl); + } + alpm_match_attributes["stage_tables"] = json::vector(); +} + +void AlgTcamMatchTable::gen_tbl_cfg(json::vector &out) const { + json::map *atcam_tbl_ptr; + unsigned number_entries = get_number_entries(); + if (is_alpm()) { + // Add ALPM ATCAM config to ALPM table (generated by pre-classifier in + // previous ostage) + json::map *alpm_tbl_ptr = base_tbl_cfg(out, "match", number_entries); + if (!alpm_tbl_ptr) { + error(lineno, "No alpm table generated by alpm pre-classifier"); + return; + } + json::map &alpm_tbl = *alpm_tbl_ptr; + gen_alpm_cfg(alpm_tbl); + json::map &alpm_match_attributes = alpm_tbl["match_attributes"]; + json::map &atcam_tbl = alpm_match_attributes["atcam_table"]; + base_alpm_atcam_tbl_cfg(atcam_tbl, "match", number_entries); + atcam_tbl_ptr = &atcam_tbl; + } else { + atcam_tbl_ptr = base_tbl_cfg(out, "match", number_entries); + } + json::map &tbl = *atcam_tbl_ptr; + common_tbl_cfg(tbl); + json::map &match_attributes = tbl["match_attributes"]; + match_attributes["match_type"] = "algorithmic_tcam"; + if (actions) { + actions->gen_tbl_cfg(tbl["actions"]); + } else if (action && action->actions) { + action->actions->gen_tbl_cfg(tbl["actions"]); + } + json::vector &units = match_attributes["units"]; + gen_unit_cfg(units, number_entries); + match_attributes["number_partitions"] = number_partitions; + match_attributes["partition_field_name"] = 
partition_field_name; + add_all_reference_tables(tbl); + if (units.size() > 1 && has_directly_attached_synth2port()) + error(lineno, + "The ability to split directly addressed counters/meters/stateful " + "resources across multiple logical tables of an algorithmic tcam match table " + "is not currently supported."); + // Empty stage table node in atcam. These are moved inside the + // units->MatchTable->stage_table node + match_attributes["stage_tables"] = json::vector(); +} + +DEFINE_TABLE_TYPE(AlgTcamMatchTable) diff --git a/backends/tofino/bf-asm/attached_table.cpp b/backends/tofino/bf-asm/attached_table.cpp new file mode 100644 index 00000000000..921bd97f4f5 --- /dev/null +++ b/backends/tofino/bf-asm/attached_table.cpp @@ -0,0 +1,524 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "action_bus.h" +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" +#include "misc.h" + +void AttachedTable::pass1() { + if (default_action.empty()) default_action = get_default_action(); + // Per Flow Enable - Validate and Set pfe and address bits + if (per_flow_enable_param == "false") per_flow_enable = false; + + if (!Target::SUPPORT_OVERFLOW_BUS() && stage->overflow_bus_use[7]) + error(layout[0].lineno, "table %s, %s has no overflow bus between logical row 7 and 8", + name(), Target::name()); +} + +unsigned AttachedTable::per_flow_enable_bit(MatchTable *m) const { + if (!per_flow_enable || per_flow_enable_param.empty()) return 0; + unsigned pfe_bit = 0; + if (m) { + auto addr = m->find_address_field(this); + auto address_bits = addr ? addr->size : 0; + if (auto f = m->lookup_field(per_flow_enable_param)) { + // Get pfe bit position from format entry + // This value is then adjusted based on address + if (f->size == 1) + pfe_bit = f->bit(0); + else + error(lineno, "pfe bit %s is not a 1 bit in table %s format", + per_flow_enable_param.c_str(), m->name()); + if (addr) + pfe_bit -= addr->bit(0); + else + pfe_bit = 0; // we use the primary shift to get at the pfe bit + } else if (per_flow_enable_param == "true" && addr) { + pfe_bit = addr->bit(addr->size - 1) - addr->bit(0) + default_pfe_adjust(); + } else { + // FIXME -- should be an error, but the compiler can hit this for a shared attached + // table that is defaulted in one match table and in the overhead in another. We + // should no longer be generating code that tries to set per_flow_enable: in the + // attached table (it should be in the call in the match table) at all, but we still + // have issues? 
Comments in the compiler indicate those should go away + // and this can be an error again. + warning(lineno, "can't find per_flow_enable param %s in format for %s", + per_flow_enable_param.c_str(), m->name()); + } + } else { + for (auto mt : match_tables) { + auto bit = per_flow_enable_bit(mt); + if (bit && pfe_bit && bit != pfe_bit) { + // this should be ok, but the driver can't handle it currently + warning(lineno, + "pfe_bit %s at different locations in different match tables," + " which will cause driver problems", + per_flow_enable_param.c_str()); + } else { + pfe_bit = bit; + } + } + } + return pfe_bit; +} + +// --------------- +// Meter ALU | Row +// Used | +// --------------- +// 0 | 1 +// 1 | 3 +// 2 | 5 +// 3 | 7 +// --------------- +void AttachedTable::add_alu_index(json::map &stage_tbl, std::string alu_index) const { + if (layout.size() <= 0) + error(lineno, "Invalid meter alu setup. A meter ALU should be allocated for table %s", + name()); + stage_tbl[alu_index] = get_alu_index(); +} + +SelectionTable *AttachedTable::get_selector() const { + SelectionTable *rv = nullptr; + for (auto *mtab : match_tables) { + auto *sel = mtab->get_selector(); + if (sel && rv && rv != sel) return nullptr; // inconsistent + if (sel) rv = sel; + } + return rv; +} + +SelectionTable *AttachedTables::get_selector() const { + if (selector) return dynamic_cast(static_cast
(selector)); + return nullptr; +} + +StatefulTable *AttachedTable::get_stateful() const { + StatefulTable *rv = nullptr; + for (auto *mtab : match_tables) { + auto *s = mtab->get_stateful(); + if (s && rv && rv != s) return nullptr; // inconsistent + if (s) rv = s; + } + return rv; +} + +StatefulTable *AttachedTables::get_stateful(std::string name) const { + for (auto &s : statefuls) { + if (name == s->name() || name.empty()) + return dynamic_cast(static_cast
(s)); + } + return nullptr; +} + +MeterTable *AttachedTable::get_meter() const { + MeterTable *rv = nullptr; + for (auto *mtab : match_tables) { + auto *m = mtab->get_meter(); + if (m && rv && rv != m) return nullptr; // inconsistent + if (m) rv = m; + } + return rv; +} + +MeterTable *AttachedTables::get_meter(std::string name) const { + for (auto &s : meters) { + if (name == s->name() || name.empty()) + return dynamic_cast(static_cast
(s)); + } + return nullptr; +} + +Table::Format::Field *AttachedTables::find_address_field(const AttachedTable *tbl) const { + if (selector == tbl && selector.args.size() > 0) return selector.args.at(0).field(); + for (auto &s : stats) + if (s == tbl && s.args.size() > 0) return s.args.at(0).field(); + for (auto &m : meters) + if (m == tbl && m.args.size() > 0) return m.args.at(0).field(); + for (auto &s : statefuls) + if (s == tbl) { + if (s.args.size() > 1) { + return s.args.at(1).field(); + } else if (s.args.size() > 0) { + // this special case is a hack in case we're calling this before + // pass1 has run on the match table with these attached tables + auto *f = s.args.at(0).field(); + if (f && f->size > 3) return f; + } + } + return nullptr; +} + +bool AttachedTables::run_at_eop() { + if (meters.size() > 0) return true; + for (auto &s : stats) + if (s->run_at_eop()) return true; + return false; +} + +bitvec AttachedTables::compute_reachable_tables() const { + bitvec rv; + if (selector) rv |= selector->reachable_tables(); + if (selector_length) rv |= selector->reachable_tables(); + for (auto &t : stats) rv |= t->reachable_tables(); + for (auto &t : meters) rv |= t->reachable_tables(); + for (auto &t : statefuls) rv |= t->reachable_tables(); + return rv; +} + +unsigned AttachedTable::determine_meter_shiftcount(Table::Call &call, int group, int word, + int tcam_shift) const { + if (call.args[0].name() && strcmp(call.args[0].name(), "$DIRECT") == 0) { + return direct_shiftcount() + tcam_shift; + } else if (auto f = call.args[0].field()) { + BUG_CHECK(int(f->by_group[group]->bit(0) / 128U) == word); + return f->by_group[group]->bit(0) % 128U + indirect_shiftcount(); + } else if (auto f = call.args[1].field()) { + return f->by_group[group]->bit(0) % 128U + METER_ADDRESS_ZERO_PAD; + } else if (auto f = call.args[2].field()) { + return f->by_group[group]->bit(0) % 128U + METER_ADDRESS_ZERO_PAD; + } else { + return 0; + } +} + +/** + * In match merge, addresses are 
generated from result buses containing match overhead. + * These buses (83 bits = 64 bits of RAM line + 19 bits of direct address) are sent through + * format and merge to potentially generate addresses for meters, counters, action data, + * etc. + * + * The addresses for meter/selector/stateful alu, counter, idletime, and action data + * have very similar setups, and will described in the section below. But generally + * the address can be formulated in 3 steps. + * + * 1. The 83 bit bus is right shifted to get the bits corresponding to the address. + * 2. This value is ANDed with a mask to pull only the relevant bits + * 3. The value is ORed with a default register to enable certain bits + * + * This is commonly referred to as shift-mask-default, and will happen for all of + * these addresses if necessary. + * + * The addresses are built up of 2 or 3 general pieces. + * + * 1. The RAM line location - which RAM/RAM line to look up the address. This will + * potentially contain a RAM line, a VPN, and Huffman bits. + * 2. A per flow enable bit - a bit to enable the associated table to run or not + * 3. A meter type - Specifically only for the meter_adr users (selectors, stateful, + * meter). Will indicate to the meter alu what particular instruction to run. + * + * The following portion will describe the registers required to build these addresses: + * + * 1. *_payload_shifter_en - will enable the address to be generated if set to true, i.e. + * if a match table does not have a counter, then the associated stats_payload_shifter_en + * will not be enabled. + * + * 2. *_exact/_tcam_shiftcount - the right shift per tind/exact match result bus. + * Addresses themselves can have a certain number of bits appended to the lsb, so + * the number of appended bits has to appear in the shiftcount + * + * 3. *_mask - the post shift AND mask of the relevant address bits from match overhead + * + * 4. *_default - the post mask OR value. 
Potentially useful for per flow enable bits/ + * meter types that are identical for every action + * + * 5. *_per_entry_mux_ctl - the post shift position of the per flow enable bit, if that + * bit is contained in overhead. This is always ORed in, separate from default + * + * 6. _type_position - only relevant for meter address users. This is the lsb of the + * meter type position if the meter position is in overhead. Note that if this register + * is used, then the meter type must be included in the mask. + * + * The purpose of the function of the determine_merge_regs is to look at the arguments of the + * call for an attached table, and use those to determine the values of these registers. + */ +void AttachedTable::determine_meter_merge_regs(MatchTable *match, int type, int bus, + const std::vector &args, + METER_ACCESS_TYPE default_type, unsigned &adr_mask, + unsigned &per_entry_en_mux_ctl, + unsigned &adr_default, + unsigned &meter_type_position) { + adr_mask = 0; + per_entry_en_mux_ctl = 0; + adr_default = 0; + meter_type_position = 0; + + int max_ptr_bits = EXACT_VPN_BITS + EXACT_WORD_BITS; + if (match->to()) max_ptr_bits = TCAM_VPN_BITS + TCAM_WORD_BITS; + + unsigned max_address = (1U << METER_ADDRESS_BITS) - 1; + BUG_CHECK((args.size() == 2 && default_type == METER_COLOR_ACCESS) || args.size() == 3, + "wrong size for meter args"); + if (args[0] == "$DIRECT") { + adr_mask |= (((1U << max_ptr_bits) - 1) << address_shift()) & max_address; + } else if (auto addr = args[0].field()) { + adr_mask |= (((1U << addr->size) - 1) << address_shift()) & max_address; + } + + if (args[1].name() && strcmp(args[1].name(), "$DEFAULT") == 0) { + adr_default |= (1 << METER_PER_FLOW_ENABLE_START_BIT); + } else if (auto pfe_field = args[1].field()) { + if (auto addr_field = args[0].field()) { + per_entry_en_mux_ctl = pfe_field->bit(0) - addr_field->bit(0) + address_shift(); + } else if (args[0].hash_dist() || args[0].count_mode()) { + per_entry_en_mux_ctl = 0; + } + } + + if 
(default_type == METER_COLOR_ACCESS) { + // meter color access -- has no meter type + } else if (args[2].name() && strcmp(args[2].name(), "$DEFAULT") == 0) { + adr_default |= default_type << METER_TYPE_START_BIT; + } else if (auto type_field = args[2].field()) { + if (auto addr_field = args[0].field()) { + meter_type_position = type_field->bit(0) - addr_field->bit(0) + address_shift(); + } else if (args[0].hash_dist() || args[0].count_mode()) { + if (auto pfe_field = args[1].field()) { + meter_type_position = type_field->bit(0) - pfe_field->bit(0); + } + } else { + meter_type_position = 0; + } + adr_mask |= ((1 << METER_TYPE_BITS) - 1) << METER_TYPE_START_BIT; + } +} + +const Table::Call *AttachedTables::get_call(const Table *tbl) const { + if (selector == tbl) return &selector; + for (auto &s : stats) + if (s == tbl) return &s; + for (auto &m : meters) + if (m == tbl) return &m; + for (auto &s : statefuls) + if (s == tbl) return &s; + return nullptr; +} + +/** + * Currently a call for an attached table (currently for counters/meters/stateful alus/selectors) + * is built up of a 2 part address/3 part address consisting of 3 parameters: + * + * 1. The location of the address + * 2. The location of the per flow enable bit + * 3. The location of the meter type (if necessary) + * + * Currently these locations can be: + * - Names that appear in the format of the table + * - For address location, a hash distribution unit + * - For address a $DIRECT keyword for a directly addressed table + * - For pfe and meter type, a $DEFAULT keyword indicating that the value is ORed in through + * the default register + * + * This function is responsible for validating this. 
Perhaps, in the future, we can have arguments + * both contain potential SHIFTs and ORs that can be interpreted by the registers + */ +bool AttachedTable::validate_call(Table::Call &call, MatchTable *self, size_t required_args, + int hash_dist_type, Table::Call &first_call) { + if (!self) return false; + if (call->stage != self->stage) { + error(call.lineno, "%s not in same stage as %s", call->name(), self->name()); + return false; + } else if (call->gress != self->gress) { + if (!(call->to() && + timing_thread(call->gress) == timing_thread(self->gress))) { + error(call.lineno, "%s not in same thread as %s", call->name(), self->name()); + return false; + } + } else if (call.args != first_call.args) { + error(call.lineno, + "All calls for the same address type must be identical, and " + "are not for %s and %s", + call->name(), first_call->name()); + } + + if (call.args.size() != required_args) { + error(call.lineno, "%s requires exactly %zu arguments", call->name(), required_args); + return false; + } + + if (call.args.size() == 0) return true; + if (call.args[0].name()) { + if (strcmp(call.args[0].name(), "$DIRECT") != 0) { + error(call.lineno, "Index %s for %s cannot be found", call.args[0].name(), + call->name()); + return false; + } + } else if (call.args[0].hash_dist()) { + call.args[0].hash_dist()->xbar_use |= hash_dist_type; + } else if (call.args[0].type == Table::Call::Arg::Counter) { + auto *salu = call->to(); + if (salu == nullptr) { + error(call.lineno, + "Index for %s cannot be a stateful counter, as it is not a " + "stateful alu", + call->name()); + return false; + } + salu->set_counter_mode(call.args[0].count_mode()); + + } else if (!call.args[0].field()) { + error(call.lineno, "Index for %s cannot be understood", call->name()); + } + + if (call.args.size() == 1) return true; + + if (call.args[1].name()) { + if (strcmp(call.args[1].name(), "$DEFAULT") != 0) { + error(call.lineno, "Per flow enable %s for %s cannot be found", call.args[1].name(), + 
call->name()); + return false; + } + } else if (!call.args[1].field()) { + error(call.lineno, "Per flow enable for %s cannot be understood", call->name()); + return false; + } + + if (call.args.size() == 2) return true; + + if (call.args[2].name()) { + if (strcmp(call.args[2].name(), "$DEFAULT") != 0) { + error(call.lineno, "Meter type %s for %s cannot be found", call.args[2].name(), + call->name()); + return false; + } + } else if (!call.args[2].field()) { + error(call.lineno, "Meter type for %s cannot be understood", call->name()); + return false; + } + return true; +} + +void AttachedTables::pass0(MatchTable *self) { + if (selector.check() && selector->set_match_table(self, true) != Table::SELECTION) + error(selector.lineno, "%s is not a selection table", selector->name()); + for (auto &s : stats) { + bool direct = false; + if (s.check() && s->set_match_table(self, !s.is_direct_call()) != Table::COUNTER) + error(s.lineno, "%s is not a counter table", s->name()); + } + for (auto &m : meters) + if (m.check()) { + auto type = m->set_match_table(self, !m.is_direct_call() > 0); + if (type != Table::METER && type != Table::STATEFUL) + error(m.lineno, "%s is not a meter table", m->name()); + } + for (auto &s : statefuls) { + if (!s.check()) continue; + if (s->set_match_table(self, !s.is_direct_call()) != Table::STATEFUL) + error(s.lineno, "%s is not a stateful table", s->name()); + } +} + +void AttachedTables::pass1(MatchTable *self) { + if (selector) { + selector->validate_call(selector, self, 3, HashDistribution::METER_ADDRESS, selector); + if (selector_length && selector_length->name() == selector->name()) { + selector_length->to()->validate_length_call(selector_length); + } else { + error(selector.lineno, + "Must provide selector length information when a selector " + "is called"); + } + } + for (auto &s : stats) { + if (s) { + s->validate_call(s, self, 2, HashDistribution::STATISTICS_ADDRESS, stats[0]); + } + } + + bool color_mapram_req = false; + for (auto &m : 
meters) { + if (m) { + m->validate_call(m, self, 3, HashDistribution::METER_ADDRESS, meters[0]); + if (m->uses_colormaprams()) color_mapram_req = true; + } + } + + if (color_mapram_req) { + if (meter_color) { + meter_color->validate_call(meter_color, self, 2, HashDistribution::STATISTICS_ADDRESS, + meter_color); + } else { + error(meters[0].lineno, + "Must provide a meter color mapram call when a meter " + "required color maprams is called"); + } + } + + for (auto &s : statefuls) { + if (s) { + s->validate_call(s, self, 3, HashDistribution::METER_ADDRESS, statefuls[0]); + } + } +} + +int AttachedTable::json_memunit(const MemUnit &r) const { + if (r.stage >= 0) { + return r.stage * Target::SRAM_STRIDE_STAGE() + r.row * Target::SRAM_STRIDE_ROW() + + r.col * Target::SRAM_STRIDE_COLUMN(); + } else if (r.row >= 0) { + // per-stage logical sram + return r.row * Target::SRAM_LOGICAL_UNITS_PER_ROW() + r.col; + } else { + // lamb + return r.col; + } +} + +template +void AttachedTables::write_merge_regs(REGS ®s, MatchTable *self, int type, int bus) { + for (auto &s : stats) s->write_merge_regs(regs, self, type, bus, s.args); + for (auto &m : meters) { + m->write_merge_regs(regs, self, type, bus, m.args); + if (m->uses_colormaprams()) { + if (meter_color) + m->to()->write_color_regs(regs, self, type, bus, meter_color.args); + else + m->to()->write_color_regs(regs, self, type, bus, m.args); + } + } + for (auto &s : statefuls) s->write_merge_regs(regs, self, type, bus, s.args); + if (auto s = get_selector()) s->write_merge_regs(regs, self, type, bus, selector.args); +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void AttachedTables::write_merge_regs, + mau_regs &, MatchTable *, int, int) + +template +void AttachedTables::write_tcam_merge_regs(REGS ®s, MatchTable *self, int bus, int tcam_shift) { + auto &merge = regs.rams.match.merge; + for (auto &st : stats) { + merge.mau_stats_adr_tcam_shiftcount[bus] = st->determine_shiftcount(st, 0, 0, tcam_shift); + break; + } + for 
(auto &m : meters) { + m->to()->setup_tcam_shift(regs, bus, tcam_shift, m, meter_color); + break; /* all must be the same, only config once */ + } + for (auto &s : statefuls) { + merge.mau_meter_adr_tcam_shiftcount[bus] = s->determine_shiftcount(s, 0, 0, tcam_shift); + break; /* all must be the same, only config once */ + } +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void AttachedTables::write_tcam_merge_regs, + mau_regs &, MatchTable *, int, int) diff --git a/backends/tofino/bf-asm/b2j.cpp b/backends/tofino/bf-asm/b2j.cpp new file mode 100644 index 00000000000..b6b345aa107 --- /dev/null +++ b/backends/tofino/bf-asm/b2j.cpp @@ -0,0 +1,48 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "bson.h" + +int main(int ac, char **av) { + if (ac != 3) { + std::cerr << "usage " << av[0] << " " << std::endl; + return 1; + } + std::ifstream in(av[1]); + if (!in) { + std::cerr << "failed to open " << av[1] << std::endl; + return 1; + } + json::obj *data = nullptr; + if (!(in >> json::binary(data))) { + std::cerr << "failed to read bson" << std::endl; + return 1; + } + std::ofstream out(av[2]); + if (!out) { + std::cerr << "failed to open " << av[2] << std::endl; + return 1; + } + if (!(out << data)) { + std::cerr << "failed to write json" << std::endl; + return 1; + } + return 0; +} diff --git a/backends/tofino/bf-asm/bfas.cpp b/backends/tofino/bf-asm/bfas.cpp new file mode 100644 index 00000000000..94bd4476ed6 --- /dev/null +++ b/backends/tofino/bf-asm/bfas.cpp @@ -0,0 +1,623 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "bfas.h" + +#include + +#include + +#include +#include +#include +#include + +#include "backends/tofino/bf-asm/target.h" +#include "backends/tofino/bf-p4c/git_sha_version.h" // for BF_P4C_GIT_SHA +#include "backends/tofino/bf-p4c/version.h" +#include "constants.h" +#include "lib/indent.h" +#include "misc.h" +#include "parser-tofino-jbay.h" +#include "sections.h" +#include "top_level.h" + +#define MAJOR_VERSION 1 +#define MINOR_VERSION 0 + +const std::string SCHEMA_VERSION = CONTEXT_SCHEMA_VERSION; // NOLINT(runtime/string) + +option_t options = { + .binary = PIPE0, + .condense_json = true, + .debug_info = false, + .disable_egress_latency_padding = false, + .disable_gfm_parity = true, + .disable_long_branch = false, + .disable_power_gating = false, + .gen_json = false, + .high_availability_enabled = true, + .match_compiler = false, + .multi_parsers = true, // TODO Remove option after testing + .partial_input = false, + .singlewrite = true, + .stage_dependency_pattern = "", + .target = NO_TARGET, + .tof2lab44_workaround = false, + .version = CONFIG_OLD, + .werror = false, + .nowarn = false, + .log_hashes = false, + .output_dir = ".", + .num_stages_override = 0, + .tof1_egr_parse_depth_checks_disabled = false, +}; + +std::string asmfile_name; // NOLINT(runtime/string) +std::string asmfile_dir; // NOLINT(runtime/string) +std::string gfm_log_file_name = "mau.gfm.log"; // NOLINT(runtime/string) + +std::unique_ptr gfm_out; + +int log_error = 0; +extern char *program_name; + +/** + * @brief Maximum handle offset which can be used for table and parser handles. + * + * Selected bits in parser and table handles are dedicated to distinguish handles + * for different pipes. + * See comments in bf-asm/parser.h and bf-asm/p4_table.cpp to get more information + * about format of parser and table handles. + * Currently 4 bits are dedicated for pipe id. 
+ */ +#define MAX_HANDLE_OFFSET 16 + +/** + * @brief Value OR-ed with table and parser handles to create unique handles. + * + * See comments in bf-asm/parser.h and bf-asm/p4_table.cpp to get more information + * about format of parser and table handles. + */ +unsigned unique_table_offset = 0; + +BaseAsmParser *asm_parser = nullptr; + +// Create target-specific section for parser +void createSingleAsmParser() { + if (asm_parser != nullptr) { + return; + } + asm_parser = new AsmParser; +} + +std::unique_ptr open_output(const char *name, ...) { + char namebuf[1024], *p = namebuf, *end = namebuf + sizeof(namebuf); + va_list args; + if (!options.output_dir.empty()) p += snprintf(p, end - p, "%s/", options.output_dir.c_str()); + va_start(args, name); + if (p < end) p += vsnprintf(p, end - p, name, args); + va_end(args); + if (p >= end) { + std::cerr << "File name too long: " << namebuf << "..." << std::endl; + snprintf(namebuf, sizeof(namebuf), "/dev/null"); + } + auto rv = std::unique_ptr(new std::ofstream(namebuf)); + if (!*rv) { + std::cerr << "Failed to open " << namebuf << " for writing: " << strerror(errno) + << std::endl; + } + return rv; +} + +std::string usage(std::string tfas) { + std::string u = "usage: "; + u.append(tfas); + u.append(" [-l:Mo:gqtvh] file..."); + return u; +} + +void output_all() { + auto targetName = "unknown"; + switch (options.target) { +#define SET_TOP_LEVEL(TARGET) \ + case Target::TARGET::tag: \ + new TopLevelRegs; \ + targetName = Target::TARGET::name; \ + break; + FOR_ALL_TARGETS(SET_TOP_LEVEL) + default: + std::cerr << "No target set" << std::endl; + error_count++; + return; + } + json::map ctxtJson; + const time_t now = time(NULL); + char build_date[1024]; + struct tm lt; + localtime_r(&now, <); + BUG_CHECK(<); + strftime(build_date, 1024, "%c", <); + ctxtJson["build_date"] = build_date; + ctxtJson["schema_version"] = SCHEMA_VERSION; + ctxtJson["compiler_version"] = BF_P4C_VERSION " (" BF_P4C_GIT_SHA ")"; + ctxtJson["target"] = 
targetName; + ctxtJson["program_name"] = asmfile_name; + ctxtJson["learn_quanta"] = json::vector(); + ctxtJson["parser"] = json::map(); + ctxtJson["phv_allocation"] = json::vector(); + ctxtJson["tables"] = json::vector(); + ctxtJson["mau_stage_characteristics"] = json::vector(); + ctxtJson["configuration_cache"] = json::vector(); + + Section::output_all(ctxtJson); + TopLevel::output_all(ctxtJson); + + json::map driver_options; + driver_options["hash_parity_enabled"] = !options.disable_gfm_parity; + driver_options["high_availability_enabled"] = options.high_availability_enabled; + if (options.target == TOFINO) + driver_options["tof1_egr_parse_depth_checks_disabled"] = + options.tof1_egr_parse_depth_checks_disabled; + ctxtJson["driver_options"] = std::move(driver_options); + + auto json_out = open_output("context.json"); + *json_out << &ctxtJson; + + delete TopLevel::all; +} + +void check_target_pipes(int pipe_id) { + if (pipe_id >= 0) { + if (pipe_id >= MAX_PIPE_COUNT) { + std::cerr << "Pipe number (" << pipe_id << ") exceeds implementation limit of pipes (" + << MAX_PIPE_COUNT << ")." << std::endl; + error_count++; + } else if (pipe_id < Target::NUM_PIPES()) { + options.binary = static_cast(PIPE0 + pipe_id); + } else { + std::cerr << "Pipe number (" << pipe_id << ") exceeds maximum number of pipes (" + << Target::NUM_PIPES() << ") for target " << Target::name() << "." + << std::endl; + error_count++; + } + } +} + +#define MATCH_TARGET_OPTION(TARGET, OPT) \ + if (!strcasecmp(OPT, Target::TARGET::name)) /* NOLINT(readability/braces) */ \ + options.target = Target::TARGET::tag; \ + else +#define OUTPUT_TARGET(TARGET) << " " << Target::TARGET::name + +// Do not build main() when BUILDING_FOR_GTEST. 
+#ifndef BUILDING_FOR_GTEST +int main(int ac, char **av) { + int srcfiles = 0; + const char *firstsrc = 0; + struct stat st; + bool asmfile = false; + bool disable_clog = true; + int pipe_id = -1; + extern void register_exit_signals(); + register_exit_signals(); + program_name = av[0]; + std::vector arguments(av, av + ac); + static std::set valid_noop_fill = {"and", "or", "alu_a", "alu_b", + "minu", "mins", "maxu", "maxs"}; + if (auto opt = getenv("BFAS_OPTIONS")) { + int add_at = 1; + while (auto p = strsep(&opt, " \t\r\n")) { + if (!*p) continue; + arguments.insert(arguments.begin() + add_at++, p); + } + av = &arguments[0]; + ac = arguments.size(); + } + for (int i = 1; i < ac; i++) { + int val, len; + if (av[i][0] == '-' && av[i][1] == 0) { + asm_parse_file("", stdin); + } else if (!strcmp(av[i], "--allpipes")) { + options.binary = FOUR_PIPE; + } else if (!strcmp(av[i], "--disable-egress-latency-padding")) { + options.disable_egress_latency_padding = true; + } else if (!strcmp(av[i], "--log-hashes")) { + options.log_hashes = true; + } else if (!strcmp(av[i], "--disable-longbranch")) { + options.disable_long_branch = true; + } else if (!strcmp(av[i], "--enable-longbranch")) { + if (options.target && Target::LONG_BRANCH_TAGS() == 0) { + error(-1, "target %s does not support --enable-longbranch", Target::name()); + options.disable_long_branch = true; + } else { + options.disable_long_branch = false; + } + } else if (!strcmp(av[i], "--gen_json")) { + options.gen_json = true; + options.binary = NO_BINARY; + } else if (!strcmp(av[i], "--high_availability_disabled")) { + options.high_availability_enabled = false; + } else if (!strcmp(av[i], "--no-condense")) { + options.condense_json = false; + } else if (!strcmp(av[i], "--no-bin")) { + options.binary = NO_BINARY; + } else if (!strcmp(av[i], "--no-warn")) { + options.nowarn = true; + } else if (!strcmp(av[i], "--old_json")) { + std::cerr << "Old context json is no longer supported" << std::endl; + error_count++; + } 
else if (!strcmp(av[i], "--partial")) { + options.partial_input = true; + } else if (sscanf(av[i], "--pipe%d%n", &val, &len) > 0 && !av[i][len] && val >= 0) { + pipe_id = val; + } else if (!strcmp(av[i], "--singlepipe")) { + options.binary = ONE_PIPE; + } else if (!strcmp(av[i], "--singlewrite")) { + options.singlewrite = true; + } else if (!strcmp(av[i], "--multi-parsers")) { + options.multi_parsers = true; + } else if (!strcmp(av[i], "--disable-tof2lab44-workaround")) { + options.tof2lab44_workaround = false; + } else if (!strcmp(av[i], "--tof2lab44-workaround")) { + options.tof2lab44_workaround = true; + } else if (!strcmp(av[i], "--stage_dependency_pattern")) { + ++i; + if (!av[i]) { + std::cerr << "No stage dependency pattern specified " << std::endl; + error_count++; + break; + } + options.stage_dependency_pattern = av[i]; + } else if (!strcmp(av[i], "--noop-fill-instruction")) { + ++i; + if (!av[i] || !valid_noop_fill.count(av[i])) { + std::cerr << "invalid fill instruction " << av[i] << std::endl; + } else { + options.fill_noop_slot = av[i]; + } + } else if (val = 0, sscanf(av[i], "--noop-fill-instruction=%n", &val), val > 0) { + if (!valid_noop_fill.count(av[i] + val)) { + std::cerr << "invalid fill instruction " << (av[i] + val) << std::endl; + } else { + options.fill_noop_slot = av[i] + val; + } + } else if (sscanf(av[i], "--table-handle-offset%d", &val) > 0 && val >= 0 && + val < MAX_HANDLE_OFFSET) { + unique_table_offset = val; + } else if (sscanf(av[i], "--num-stages-override%d", &val) > 0 && val >= 0) { + options.num_stages_override = val; + } else if (!strcmp(av[i], "--target")) { + ++i; + if (!av[i]) { + std::cerr << "No target specified '--target '" << std::endl; + error_count++; + break; + } + if (options.target != NO_TARGET) { + std::cerr << "Multiple target options" << std::endl; + error_count++; + break; + } + FOR_ALL_TARGETS(MATCH_TARGET_OPTION, av[i]) { + std::cerr << "Unknown target " << av[i] << std::endl; + error_count++; + std::cerr << 
"Supported targets:" FOR_ALL_TARGETS(OUTPUT_TARGET) << std::endl; + } + } else if (av[i][0] == '-' && av[i][1] == '-') { + FOR_ALL_TARGETS(MATCH_TARGET_OPTION, av[i] + 2) { + std::cerr << "Unrecognized option " << av[i] << std::endl; + error_count++; + } + } else if (av[i][0] == '-' || av[i][0] == '+') { + bool flag = av[i][0] == '+'; + for (char *arg = av[i] + 1; *arg;) switch (*arg++) { + case 'a': + options.binary = FOUR_PIPE; + break; + case 'C': + options.condense_json = true; + break; + case 'G': + options.gen_json = true; + options.binary = NO_BINARY; + break; + case 'g': + options.debug_info = true; + break; + case 'h': + std::cout << usage(av[0]) << std::endl; + return 0; + break; + case 'l': + ++i; + if (!av[i]) { + std::cerr << "No log file specified '-l '" << std::endl; + error_count++; + break; + } + disable_clog = false; + if (auto *tmp = new std::ofstream(av[i])) { + if (*tmp) { + /* FIXME -- tmp leaks, but if we delete it, the log + * redirect fails, and we crash on exit */ + std::clog.rdbuf(tmp->rdbuf()); + } else { + std::cerr << "Can't open " << av[i] << " for writing" << std::endl; + delete tmp; + } + } + break; + case 'M': + options.match_compiler = true; + options.condense_json = false; + break; + case 'o': + ++i; + if (!av[i]) { + std::cerr << "No output directory specified '-o '" + << std::endl; + error_count++; + break; + } + if (stat(av[i], &st)) { + if (mkdir(av[i], 0777) < 0) { + std::cerr << "Can't create output dir " << av[i] << ": " + << strerror(errno) << std::endl; + error_count++; + } + } else if (!S_ISDIR(st.st_mode)) { + std::cerr << av[i] << " exists and is not a directory" << std::endl; + error_count++; + } + options.output_dir = av[i]; + break; + case 'p': + options.disable_power_gating = true; + break; + case 'q': + std::clog.setstate(std::ios::failbit); + break; + case 's': + options.binary = ONE_PIPE; + break; + case 'T': + disable_clog = false; + if (*arg) { + Log::addDebugSpec(arg); + arg += strlen(arg); + } else if (++i 
< ac) { + Log::addDebugSpec(av[i]); + } + break; + case 't': + ++i; + if (!av[i]) { + std::cerr << "No target specified '-t '" << std::endl; + error_count++; + break; + } + if (options.target != NO_TARGET) { + std::cerr << "Multiple target options" << std::endl; + error_count++; + break; + } + FOR_ALL_TARGETS(MATCH_TARGET_OPTION, av[i]) { + std::cerr << "Unknown target " << av[i]; + error_count++; + } + break; + case 'v': + disable_clog = false; + Log::increaseVerbosity(); + break; + case 'W': + if (strcmp(arg, "error")) + options.werror = true; + else + std::cout << "Unknown warning option -W" << arg << std::endl; + arg += strlen(arg); + break; + default: + std::cerr << "Unknown option " << (flag ? '+' : '-') << arg[-1] + << std::endl; + error_count++; + } + } else if (FILE *fp = fopen(av[i], "r")) { + // asm_parse_file needs to know correct number of stages + if (options.num_stages_override) { + Target::OVERRIDE_NUM_MAU_STAGES(options.num_stages_override); + } + + createSingleAsmParser(); + + if (!srcfiles++) firstsrc = av[i]; + error_count += asm_parse_file(av[i], fp); + if (error_count > 0) return 1; + fclose(fp); + asmfile = true; + asmfile_name = get_filename(av[i]); + asmfile_dir = get_directory(av[i]); + } else { + std::cerr << "Can't read " << av[i] << ": " << strerror(errno) << std::endl; + error_count++; + } + } + + check_target_pipes(pipe_id); + + if (disable_clog) std::clog.setstate(std::ios_base::failbit); + if (!asmfile) { + std::cerr << "No assembly file specified" << std::endl; + error_count++; + } + if (error_count > 0) std::cerr << usage(av[0]) << std::endl; + + if (Log::verbosity() > 0) { + gfm_out = open_output("mau.gfm.log"); + } + + if (error_count == 0 && !options.partial_input) { + // Check if file has no sections + no_sections_error_exit(); + // Check if mandatory sections are present in assembly + bool no_section = false; + no_section |= no_section_error("deparser"); + no_section |= no_section_error("parser"); + no_section |= 
no_section_error("phv"); + no_section |= no_section_error("stage"); + if (no_section) exit(1); + } + if (error_count == 0) { + Section::process_all(); + } + if (error_count == 0) { + if (srcfiles == 1 && options.output_dir.empty()) { + if (const char *p = strrchr(firstsrc, '/')) + options.output_dir = p + 1; + else if (const char *p = strrchr(firstsrc, '\\')) + options.output_dir = p + 1; + else + options.output_dir = firstsrc; + if (const char *e = strrchr(&options.output_dir[0], '.')) + options.output_dir.resize(e - &options.output_dir[0]); + options.output_dir += ".out"; + if (stat(options.output_dir.c_str(), &st) ? mkdir(options.output_dir.c_str(), 0777) + : !S_ISDIR(st.st_mode)) + options.output_dir.clear(); + } + output_all(); + } + if (log_error > 0) warning(0, "%d config errors in log file", log_error); + return error_count > 0 || (options.werror && warn_count > 0) ? 1 : 0; +} +#endif /* !BUILDING_FOR_GTEST */ + +std::string toString(target_t target) { + switch (target) { + case TOFINO: + return "Tofino"; + case TOFINO2: + return "Tofino2"; + case TOFINO2H: + return "Tofino2H"; + case TOFINO2U: + return "Tofino2U"; + case TOFINO2M: + return "Tofino2M"; + case TOFINO2A0: + return "Tofino2A0"; + default: + BUG("Unexpected target value: 0x%x", target); + return ""; + } +} + +std::ostream &operator<<(std::ostream &out, target_t target) { return out << toString(target); } + +void no_sections_error_exit() { + if (Section::no_sections_in_assembly()) { + std::cerr << "No valid sections found in assembly file" << std::endl; + exit(1); + } +} + +bool no_section_error(const char *name) { + if (!Section::section_in_assembly(name)) { + std::cerr << "No '" << name << "' section found in assembly file" << std::endl; + return true; + } + return false; +} + +class Version : public Section { + Version() : Section("version") {} + + void input(VECTOR(value_t) args, value_t data) { + if (data.type == tINT || data.type == tVEC) { // version 1.0.0 + parse_version(data); + } else 
if (data.type == tMAP) { // version 1.0.1 + for (auto &kv : MapIterChecked(data.map, true)) { + if (kv.key == "version" && (kv.value.type == tVEC || kv.value.type == tINT)) { + parse_version(kv.value); + } else if (kv.key == "run_id" && kv.value.type == tSTR) { + _run_id = kv.value.s; + } else if (kv.key == "compiler") { + if (kv.value.type == tSTR) { + _compiler = kv.value.s; + } else if (kv.value.type == tINT) { + _compiler = std::to_string(kv.value.i); + } else if (kv.value.type == tVEC) { + const char *sep = ""; + for (auto &el : kv.value.vec) { + _compiler += sep; + if (el.type == tSTR) + _compiler += el.s; + else if (el.type == tINT) + _compiler += std::to_string(el.i); + else + error(el.lineno, "can't understand compiler version"); + sep = "."; + } + } + } else if (kv.key == "target") { + if (kv.value.type == tSTR) { + auto old = options.target; + FOR_ALL_TARGETS(MATCH_TARGET_OPTION, kv.value.s) { + error(kv.value.lineno, "Unknown target %s", kv.value.s); + } + if (old != NO_TARGET && old != options.target) { + options.target = old; + error(kv.value.lineno, "Inconsistent target %s (previously set to %s)", + kv.value.s, Target::name()); + } + createSingleAsmParser(); + } else { + error(kv.value.lineno, "Invalid target %s", value_desc(kv.value)); + } + } else { + warning(kv.key.lineno, "ignoring unknown item %s in version", + value_desc(kv.key)); + } + } + } else { + error(data.lineno, "Invalid version section"); + } + } + + void output(json::map &ctx_json) { + if (!_compiler.empty()) ctx_json["compiler_version"] = _compiler; + ctx_json["run_id"] = _run_id; + } + + private: + void parse_version(value_t data) { + if (data.type == tINT) { + if (data.i != MAJOR_VERSION) + error(data.lineno, "Version %" PRId64 " not supported", data.i); + } else if (data.vec.size >= 2) { + if (CHECKTYPE(data[0], tINT) && CHECKTYPE(data[1], tINT) && + (data[0].i != MAJOR_VERSION || data[1].i > MINOR_VERSION)) + error(data.lineno, "Version %" PRId64 ".%" PRId64 " not supported", 
data[0].i, + data[1].i); + } else { + error(data.lineno, "Version not understood"); + } + } + + std::string _run_id, _compiler; + static Version singleton_version; +} Version::singleton_version; diff --git a/backends/tofino/bf-asm/bfas.h b/backends/tofino/bf-asm/bfas.h new file mode 100644 index 00000000000..ed13b05d7b7 --- /dev/null +++ b/backends/tofino/bf-asm/bfas.h @@ -0,0 +1,182 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_BFAS_H_ +#define BACKENDS_TOFINO_BF_ASM_BFAS_H_ + +#include +#include +#include + +#include +#include +#include + +enum config_version_t { CONFIG_OLD = 1, CONFIG_NEW = 2, CONFIG_BOTH = 3 }; +enum target_t { + NO_TARGET = 0, + TOFINO, + TOFINO2, + JBAY = TOFINO2, + TOFINO2H, + TOFINO2U, + TOFINO2M, + TOFINO2A0, + TARGET_INDEX_LIMIT +}; +enum binary_type_t { + NO_BINARY = -3, + FOUR_PIPE = -2, // binary replicating to all 4 pipes + ONE_PIPE = -1, // binary for one pipe with pipe offset addresses + PIPE0 = 0, // binary with data just in pipe 0 + PIPE1, // binary with data just in pipe 1 + PIPE2, // binary with data just in pipe 2 + PIPE3, // binary with data just in pipe 3 + MAX_PIPE_COUNT, // Maximum number of pipes which bfas can create binary for +}; + +extern struct option_t { + binary_type_t binary; + bool condense_json; + bool debug_info; + bool disable_egress_latency_padding; + bool 
disable_gfm_parity; + bool disable_long_branch; + bool disable_power_gating; + bool gen_json; + bool high_availability_enabled; + bool match_compiler; + bool multi_parsers; + bool partial_input; + bool singlewrite; + std::string stage_dependency_pattern; + target_t target; + bool tof2lab44_workaround; + config_version_t version; + bool werror; + bool nowarn; + bool log_hashes; + std::string output_dir; + int num_stages_override; + bool tof1_egr_parse_depth_checks_disabled; + const char *fill_noop_slot; +} options; + +extern unsigned unique_action_handle; +struct value_t; + +extern std::string asmfile_name; +extern std::string asmfile_dir; +extern std::unique_ptr gfm_out; + +class BaseAsmParser; +extern BaseAsmParser *asm_parser; +void createSingleAsmParser(); + +std::string toString(target_t target); +std::ostream &operator<<(std::ostream &out, target_t target); + +int asm_parse_file(const char *name, FILE *in); +int asm_parse_string(const char *in); + +void no_sections_error_exit(); +bool no_section_error(const char *name); + +extern int error_count, warn_count; +extern void error(int lineno, const char *fmt, va_list); +void error(int lineno, const char *fmt, ...) __attribute__((format(printf, 2, 3))); +inline void error(int lineno, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + error(lineno, fmt, args); + va_end(args); +} +extern void warning(int lineno, const char *fmt, va_list); +void warning(int lineno, const char *fmt, ...) __attribute__((format(printf, 2, 3))); +inline void warning(int lineno, const char *fmt, ...) { +#ifdef BAREFOOT_INTERNAL + if (!options.nowarn) { + va_list args; + va_start(args, fmt); + warning(lineno, fmt, args); + va_end(args); + } +#endif /* BAREFOOT_INTERNAL */ +} + +inline const char *strip_prefix(const char *str, const char *pfx) { + if (const char *p = strstr(str, pfx)) return p + strlen(pfx); + return str; +} +void bug(const char *, int, const char * = 0, ...) 
__attribute__((format(printf, 3, 4))) +__attribute__((noreturn)); +inline void bug(const char *fname, int lineno, const char *fmt, ...) { +#ifdef NDEBUG + fprintf(stderr, "Assembler BUG"); +#else + fprintf(stderr, "%s:%d: Assembler BUG: ", fname, lineno); + if (fmt) { + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + } +#endif /* !NDEBUG */ + fprintf(stderr, "\n"); + fflush(stderr); + std::terminate(); +} + +extern std::unique_ptr open_output(const char *, ...) + __attribute__((format(printf, 1, 2))); + +#define SRCFILE strip_prefix(__FILE__, "bf-asm/") +#define BUG(...) \ + do { \ + bug(SRCFILE, __LINE__, ##__VA_ARGS__); \ + } while (0) +#define BUG_CHECK(e, ...) \ + do { \ + if (!(e)) BUG(__VA_ARGS__); \ + } while (0) + +class VersionIter { + unsigned left, bit; + void check() { + while (left && !(left & 1)) { + ++bit; + left >>= 1; + } + } + VersionIter() : left(0), bit(0) {} + + public: + explicit VersionIter(config_version_t v) : left(v), bit(0) { check(); } + VersionIter begin() { return *this; } + VersionIter end() { return VersionIter(); } + int operator*() const { return bit; } + bool operator==(VersionIter &a) { return (left << bit) == (a.left << a.bit); } + VersionIter &operator++() { + left &= ~1; + check(); + return *this; + } +}; + +extern unsigned unique_table_offset; + +#endif /* BACKENDS_TOFINO_BF_ASM_BFAS_H_ */ diff --git a/backends/tofino/bf-asm/bfdis.cpp b/backends/tofino/bf-asm/bfdis.cpp new file mode 100644 index 00000000000..bb675fb0d65 --- /dev/null +++ b/backends/tofino/bf-asm/bfdis.cpp @@ -0,0 +1,185 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include + +#include "bson.h" +#include "disasm.h" +#include "fdstream.h" + +Disasm *disasm = nullptr; + +int read_bin(std::istream &in) { + uint32_t atom_typ = 0; + while (in.read((char *)&atom_typ, 4)) { + if ((atom_typ >> 24) == 'H') { + json::map hdr; + if (!(in >> json::binary(hdr))) return -1; + if (auto target = hdr["target"]) { + disasm = Disasm::create(target.to()); + } else { + std::cerr << "no target specified in the binary" << std::endl; + delete disasm; + disasm = nullptr; + } + } else if ((atom_typ >> 24) == 'C') { + // future context json embedding in binary + std::unique_ptr ctxt_json; + if (!(in >> json::binary(ctxt_json))) return -1; + } else if ((atom_typ >> 24) == 'P') { + uint32_t prsr_hdl = 0; + if (!in.read((char *)&prsr_hdl, 4)) return -1; + } else if ((atom_typ >> 24) == 'R') { + // R block -- writing a single 32-bit register via 32-bit PCIe address + uint32_t reg_addr = 0, reg_data = 0; + if (!in.read((char *)®_addr, 4)) return -1; + if (!in.read((char *)®_data, 4)) return -1; + if (disasm) disasm->input_binary(reg_addr, 'R', ®_data, 1); + } else if ((atom_typ >> 24) == 'B') { + // B block -- write a range of 32-bit registers via 64-bit PCIe address + // size of the range is specified as count * width (in bits), which must + // always be a multiple of 32 + + uint64_t addr = 0; + uint32_t count = 0; + uint32_t width = 0; + + if (!in.read((char *)&addr, 8)) return -1; + if (!in.read((char *)&width, 4)) return -1; + if 
(!in.read((char *)&count, 4)) return -1; + // printf("B%08" PRIx64 ": %xx%x", addr, width, count); + count = (uint64_t)count * width / 32; + std::vector data(count); + if (!in.read((char *)&data[0], count * 4)) return -1; + if (disasm) disasm->input_binary(addr, 'B', &data[0], count); + } else if ((atom_typ >> 24) == 'D') { + // D block -- write a range of 128-bit memory via 64-bit chip address + // size of the range is specified as count * width (in bits), which must + // always be a multiple of 64 + + uint64_t addr = 0; + uint32_t count = 0; + uint32_t width = 0; + + if (!in.read((char *)&addr, 8)) return -1; + if (!in.read((char *)&width, 4)) return -1; + if (!in.read((char *)&count, 4)) return -1; + // printf("D%011" PRIx64 ": %xx%x", addr, width, count); + width /= 8; + std::vector data(count * width / 4); + if (!in.read((char *)&data[0], count * width)) return -1; + if (disasm) disasm->input_binary(addr, 'D', &data[0], count * width / 4); + } else if ((atom_typ >> 24) == 'S') { + // S block -- 'scanset' writing multiple data to a single 32-bit PCIE address + uint64_t sel_addr = 0, reg_addr = 0; + uint32_t sel_data = 0, width = 0, count = 0; + + if (!in.read((char *)&sel_addr, 8)) return -1; + if (!in.read((char *)&sel_data, 4)) return -1; + if (!in.read((char *)®_addr, 8)) return -1; + if (!in.read((char *)&width, 4)) return -1; + if (!in.read((char *)&count, 4)) return -1; + count = (uint64_t)count * width / 32; + std::vector data(count); + if (!in.read((char *)&data[0], count * 4)) return -1; + if (disasm) disasm->input_binary(reg_addr, 'S', &data[0], count); + } else { + fprintf(stderr, "\n"); + fprintf(stderr, "Parse error: atom_typ=%x (%c)\n", atom_typ, atom_typ >> 24); + fprintf(stderr, "fpos=%" PRIu64 " <%" PRIx64 "h>\n", (uint64_t)in.tellg(), + (uint64_t)in.tellg()); + fprintf(stderr, "\n"); + + return -1; + } + } + + return in.eof() ? 
0 : -1; +} + +int main(int ac, char **av) { + int error = 0; + for (int i = 1; i < ac; ++i) { + if (*av[i] == '-') { + for (char *arg = av[i] + 1; *arg;) switch (*arg++) { + case 'l': + ++i; + if (!av[i]) { + std::cerr << "No log file specified '-l '" << std::endl; + error_count++; + break; + } + if (auto *tmp = new std::ofstream(av[i])) { + if (*tmp) { + /* FIXME -- tmp leaks, but if we delete it, the log + * redirect fails, and we crash on exit */ + std::clog.rdbuf(tmp->rdbuf()); + } else { + std::cerr << "Can't open " << av[i] << " for writing" << std::endl; + delete tmp; + } + } + break; + case 'v': + Log::increaseVerbosity(); + break; + case 'T': + if (*arg) { + Log::addDebugSpec(arg); + arg += strlen(arg); + } else if (++i < ac) { + Log::addDebugSpec(av[i]); + } + break; + default: + fprintf(stderr, "ignoring argument -%c\n", *arg); + error = 1; + } + } else { + std::ifstream in(av[i], std::ios::binary); + if (!in) { + fprintf(stderr, "failed to open %s\n", av[i]); + error = 1; + continue; + } + unsigned char magic[4] = {}; + in.read((char *)magic, 4); + if (magic[0] == 0 && magic[3] && strchr("RDBH", magic[3])) { + in.seekg(0); + error |= read_bin(in); + } else if (magic[0] == 0x1f && magic[1] == 0x8b) { + if (auto *pipe = popen((std::string("zcat < ") + av[i]).c_str(), "r")) { + fdstream in(fileno(pipe)); + error |= read_bin(in); + pclose(pipe); + } else { + fprintf(stderr, "%s: Cannot open pipe to read\n", av[i]); + } + } else { + fprintf(stderr, "%s: Unknown file format\n", av[i]); + } + } + } + if (error == 1) fprintf(stderr, "usage: %s \n", av[0]); + return error; +} diff --git a/backends/tofino/bf-asm/bfdumpbin.cpp b/backends/tofino/bf-asm/bfdumpbin.cpp new file mode 100644 index 00000000000..078b9288758 --- /dev/null +++ b/backends/tofino/bf-asm/bfdumpbin.cpp @@ -0,0 +1,228 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the 
License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include + +#include "bson.h" +#include "fdstream.h" + +struct { + bool oneLine; + bool noHeader; + bool noCtxtJson; +} options; + +int dump_bin(std::istream &in) { + uint32_t atom_typ = 0; + while (in.read((char *)&atom_typ, 4)) { + if ((atom_typ >> 24) == 'H') { + json::map hdr; + if (!(in >> json::binary(hdr))) return -1; + if (!options.noHeader) + for (auto &el : hdr) std::cout << el.first << " = " << el.second << std::endl; + } else if ((atom_typ >> 24) == 'C') { + // future context json embedding in binary + std::unique_ptr ctxt_json; + if (!(in >> json::binary(ctxt_json))) return -1; + if (!options.noCtxtJson) std::cout << ctxt_json; + } else if ((atom_typ >> 24) == 'P') { + uint32_t prsr_hdl = 0; + if (!in.read((char *)&prsr_hdl, 4)) return -1; + printf("P: %08x (parser handle)\n", prsr_hdl); + } else if ((atom_typ >> 24) == 'R') { + // R block -- writing a single 32-bit register via 32-bit PCIe address + uint32_t reg_addr = 0, reg_data = 0; + if (!in.read((char *)®_addr, 4)) return -1; + if (!in.read((char *)®_data, 4)) return -1; + printf("R%08x: %08x\n", reg_addr, reg_data); + } else if ((atom_typ >> 24) == 'B') { + // B block -- write a range of 32-bit registers via 64-bit PCIe address + // size of the range is specified as count * width (in bits), which must + // always be a multiple of 32 + + uint64_t addr = 0; + uint32_t count = 0; + uint32_t width = 0; + + if (!in.read((char *)&addr, 8)) return -1; + if (!in.read((char 
*)&width, 4)) return -1; + if (!in.read((char *)&count, 4)) return -1; + printf("B%08" PRIx64 ": %xx%x", addr, width, count); + if ((uint64_t)count * width % 32 != 0) printf(" (not a multiple of 32 bits!)"); + count = (uint64_t)count * width / 32; + uint32_t data, prev; + int repeat = 0, col = 0; + for (unsigned i = 0; i < count; ++i) { + if (!in.read((char *)&data, 4)) return -1; + if (i != 0 && data == prev) { + repeat++; + continue; + } + if (repeat > 0) { + printf(" x%-7d", repeat + 1); + if (++col > 8) col = 0; + } + repeat = 0; + if (!options.oneLine && col++ % 8 == 0) printf("\n "); + printf(" %08x", prev = data); + } + if (repeat > 0) printf(" x%d", repeat + 1); + printf("\n"); + } else if ((atom_typ >> 24) == 'D') { + // D block -- write a range of 128-bit memory via 64-bit chip address + // size of the range is specified as count * width (in bits), which must + // always be a multiple of 64 + + uint64_t addr = 0; + uint32_t count = 0; + uint32_t width = 0; + + if (!in.read((char *)&addr, 8)) return -1; + if (!in.read((char *)&width, 4)) return -1; + if (!in.read((char *)&count, 4)) return -1; + printf("D%011" PRIx64 ": %xx%x", addr, width, count); + if ((uint64_t)count * width % 64 != 0) printf(" (not a multiple of 64 bits!)"); + + width /= 8; + + uint64_t chunk[2], prev_chunk[2]; + int repeat = 0, col = 0; + for (unsigned i = 0; i < count * width; i += 16) { + if (!in.read((char *)chunk, 16)) return -1; + if (i != 0 && chunk[0] == prev_chunk[0] && chunk[1] == prev_chunk[1]) { + repeat++; + continue; + } + if (repeat > 0) { + printf(" x%d", repeat + 1); + col = 0; + } + repeat = 0; + if (!options.oneLine && col++ % 2 == 0) printf("\n "); + printf(" %016" PRIx64 "%016" PRIx64, prev_chunk[1] = chunk[1], + prev_chunk[0] = chunk[0]); + } + + if (repeat > 0) { + printf(" x%d", repeat + 1); + col = 0; + } + + if (count * width % 16 == 8) { + if (!in.read((char *)chunk, 8)) return -1; + if (!options.oneLine && col % 2 == 0) printf("\n "); + printf(" %016" 
PRIx64, chunk[0]); + } + printf("\n"); + } else if ((atom_typ >> 24) == 'S') { + // S block -- 'scanset' writing multiple data to a single 32-bit PCIE address + uint64_t sel_addr = 0, reg_addr = 0; + uint32_t sel_data = 0, width = 0, count = 0; + + if (!in.read((char *)&sel_addr, 8)) return -1; + if (!in.read((char *)&sel_data, 4)) return -1; + if (!in.read((char *)®_addr, 8)) return -1; + if (!in.read((char *)&width, 4)) return -1; + if (!in.read((char *)&count, 4)) return -1; + printf("S%011" PRIx64 ": %x, %011" PRIx64 ": %xx%x", sel_addr, sel_data, reg_addr, + width, count); + if (width % 32 != 0) printf(" (not a multiple of 32 bits!)"); + count = (uint64_t)count * width / 32; + uint32_t data, prev; + int repeat = 0, col = 0; + for (unsigned i = 0; i < count; ++i) { + if (!in.read((char *)&data, 4)) return -1; + if (i != 0 && data == prev) { + repeat++; + continue; + } + if (repeat > 0) { + printf(" x%-7d", repeat + 1); + if (++col > 8) col = 0; + } + repeat = 0; + if (!options.oneLine && col++ % 8 == 0) printf("\n "); + printf(" %08x", prev = data); + } + if (repeat > 0) printf(" x%d", repeat + 1); + printf("\n"); + } else { + fprintf(stderr, "\n"); + fprintf(stderr, "Parse error: atom_typ=%x (%c)\n", atom_typ, atom_typ >> 24); + fprintf(stderr, "fpos=%" PRIu64 " <%" PRIx64 "h>\n", (uint64_t)in.tellg(), + (uint64_t)in.tellg()); + fprintf(stderr, "\n"); + + return -1; + } + } + + return in.eof() ? 
0 : -1; +} + +int main(int ac, char **av) { + int error = 0; + for (int i = 1; i < ac; ++i) { + if (*av[i] == '-') { + for (char *arg = av[i] + 1; *arg;) switch (*arg++) { + case 'C': + options.noCtxtJson = true; + break; + case 'H': + options.noHeader = true; + break; + case 'L': + options.oneLine = true; + break; + default: + fprintf(stderr, "ignoring argument -%c\n", *arg); + error = 1; + } + } else { + std::ifstream in(av[i], std::ios::binary); + if (!in) { + fprintf(stderr, "failed to open %s\n", av[i]); + error = 1; + continue; + } + unsigned char magic[4] = {}; + in.read((char *)magic, 4); + if (magic[0] == 0 && magic[3] && strchr("RDBH", magic[3])) { + in.seekg(0); + error |= dump_bin(in); + } else if (magic[0] == 0x1f && magic[1] == 0x8b) { + if (auto *pipe = popen((std::string("zcat < ") + av[i]).c_str(), "r")) { + fdstream in(fileno(pipe)); + error |= dump_bin(in); + pclose(pipe); + } else { + fprintf(stderr, "%s: Cannot open pipe to read\n", av[i]); + } + } else { + fprintf(stderr, "%s: Unknown file format\n", av[i]); + } + } + } + if (error == 1) fprintf(stderr, "usage: %s \n", av[0]); + return error; +} diff --git a/backends/tofino/bf-asm/bflink b/backends/tofino/bf-asm/bflink new file mode 100755 index 00000000000..3195414ba5c --- /dev/null +++ b/backends/tofino/bf-asm/bflink @@ -0,0 +1,182 @@ +#!/bin/sh + +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +# +# SPDX-License-Identifier: Apache-2.0 + +WALLE="" +CHIP="" +OUT="" +objs="" +object_files="" +base_program="" +debug_info=false +tmpdir="" +pipe_args="" +READLINK_COMMAND=$(which greadlink || which readlink) +execdir=$(dirname $($READLINK_COMMAND -f $0)) + +if [ x"$BFAS_OPTIONS" = x"-g" ]; then + debug_info=true +fi + +tempfile() { + file=$(basename $1 $2) + orig=file + ctr=1 + while [ -r $tmpdir/$file ]; do + file=$ctr-$file + ctr=$((ctr + 1)) + done + echo $file +} + +while [ $# -gt 0 ]; do + case $1 in + -b) + base_program="$2" + shift ; shift;; + -g) + debug_info=true + shift;; + -o) + OUT="$2" + shift ; shift ;; + --walle|-w) + WALLE="$2" + shift ; shift ;; + --target|-t) + CHIP="$2" + OUT="$2.bin" + shift ; shift ;; + --singlepipe|-s) + pipe_args="--top memories.pipe --top regs.pipe" + shift ;; + --allpipes|-a) + pipe_args="" + shift ;; + *.json.Z) + if [ -z "$tmpdir" ]; then + tmpdir=$(mktemp -d) + fi + file=$(tempfile $1 .Z) + gunzip -c $1 >$tmpdir/$file + objs="$objs $tmpdir/$file" + object_files="$object_files $1" + shift ;; + *.json.gz) + if [ -z "$tmpdir" ]; then + tmpdir=$(mktemp -d) + fi + file=$(tempfile $1 .gz) + gunzip -c $1 >$tmpdir/$file + objs="$objs $tmpdir/$file" + object_files="$object_files $1" + shift ;; + *.json.bz) + if [ -z "$tmpdir" ]; then + tmpdir=$(mktemp -d) + fi + file=$(tempfile $1 .bz) + bzcat $1 >$tmpdir/$file + objs="$objs $tmpdir/$file" + object_files="$object_files $1" + shift ;; + *.json.bz2) + if [ -z "$tmpdir" ]; then + tmpdir=$(mktemp -d) + fi + file=$(tempfile $1 .bz2) + bzcat $1 >$tmpdir/$file + objs="$objs $tmpdir/$file" + object_files="$object_files $1" + shift ;; + *.json) + objs="$objs $1" + object_files="$object_files $1" + shift ;; + *) + echo >&2 "Unknown argument $1" + shift ;; + esac +done + +if [ ! 
-x "$WALLE" ]; then + if [ -f $execdir/walle -a -x $execdir/walle ]; then + WALLE=$execdir/walle + elif [ -x $execdir/walle.py ]; then + WALLE=$execdir/walle.py + elif [ -x $execdir/walle/walle.py ]; then + WALLE=$execdir/walle/walle.py + elif [ -e "$WALLE" ]; then + echo "$WALLE must be executable" + exit 1 + else + echo "4: $WALLE" + echo >&2 "Can't find walle" + exit 1 + fi +fi + +if [ -z "$CHIP" ]; then + for jf in $objs; do + if [ $(basename $jf) = regs.top.cfg.json ]; then + CHIP=$(grep '"_type"' $jf | sed -e 's/.*"regs\.//' -e 's/[_"].*//') + break + fi + done + if [ -z "$CHIP" ]; then + echo >&2 "Can't find target, assuming tofino" + CHIP=tofino + fi + if [ -z "$OUT" ]; then + OUT=$CHIP.bin + fi +fi + +schema_arg="" +if [ -r $CHIP/chip.schema ]; then + schema_arg="--schema $CHIP/chip.schema" +elif [ -r $execdir/$CHIP/chip.schema ]; then + schema_arg="--schema $execdir/$CHIP/chip.schema" +fi + +#echo "$WALLE --target $CHIP $schema_arg -o $OUT $objs $pipe_args" +$WALLE --target $CHIP $schema_arg -o $OUT $objs $pipe_args +rc=$? + +# cleanup +output_dir=$(dirname $OUT) +if [ -z "$output_dir" ]; then + output_dir="./" +fi +if ! $debug_info; then + rm -f $object_files +fi +if [ ! -z "$base_program" ] ; then + pp=$output_dir/${base_program}.p4i + if ! $debug_info && test -e $pp ; then rm -f $pp; fi +fi +if ! $debug_info && test -e $output_dir/bfas.config.log ; then + rm -f $output_dir/bfas.config.log +fi +# if we uncompressed, remove the directory +if [ -d "$tmpdir" ]; then + rm -rf $tmpdir +fi + +# exit with a return code if walle failed +exit $rc diff --git a/backends/tofino/bf-asm/binary_output.h b/backends/tofino/bf-asm/binary_output.h new file mode 100644 index 00000000000..23d56608a8d --- /dev/null +++ b/backends/tofino/bf-asm/binary_output.h @@ -0,0 +1,72 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_BINARY_OUTPUT_H_ +#define BACKENDS_TOFINO_BF_ASM_BINARY_OUTPUT_H_ + +#include +#include + +namespace binout { + +class tag { + char data[4] = {0, 0, 0, 0}; + + public: + tag(char ch) { data[3] = ch; } // NOLINT(runtime/explicit) + friend std::ostream &operator<<(std::ostream &out, const tag &e) { + return out.write(e.data, 4); + } +}; + +class byte4 { + char data[4]; + + public: + byte4(uint32_t v) { // NOLINT(runtime/explicit) + data[0] = v & 0xff; + data[1] = (v >> 8) & 0xff; + data[2] = (v >> 16) & 0xff; + data[3] = (v >> 24) & 0xff; + } + friend std::ostream &operator<<(std::ostream &out, const byte4 &e) { + return out.write(e.data, 4); + } +}; + +class byte8 { + char data[8]; + + public: + byte8(uint64_t v) { // NOLINT(runtime/explicit) + data[0] = v & 0xff; + data[1] = (v >> 8) & 0xff; + data[2] = (v >> 16) & 0xff; + data[3] = (v >> 24) & 0xff; + data[4] = (v >> 32) & 0xff; + data[5] = (v >> 40) & 0xff; + data[6] = (v >> 48) & 0xff; + data[7] = (v >> 56) & 0xff; + } + friend std::ostream &operator<<(std::ostream &out, const byte8 &e) { + return out.write(e.data, 8); + } +}; + +} // end namespace binout + +#endif /* BACKENDS_TOFINO_BF_ASM_BINARY_OUTPUT_H_ */ diff --git a/backends/tofino/bf-asm/bson.cpp b/backends/tofino/bf-asm/bson.cpp new file mode 100644 index 00000000000..b6f0d8d261a --- /dev/null +++ b/backends/tofino/bf-asm/bson.cpp @@ -0,0 +1,320 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "bson.h" + +#include + +#include "lib/hex.h" + +namespace { +uint8_t get8(std::istream &in) { + char data; + in.read(&data, sizeof(data)); + return data & 0xffU; +} + +int32_t get32(std::istream &in) { + char data[4]; + in.read(data, sizeof(data)); + return (data[0] & 0xffU) | ((data[1] & 0xffU) << 8) | ((data[2] & 0xffU) << 16) | + ((data[3] & 0xffU) << 24); +} +int64_t get64(std::istream &in) { + char data[8]; + in.read(data, sizeof(data)); + return (data[0] & 0xffULL) | ((data[1] & 0xffULL) << 8) | ((data[2] & 0xffULL) << 16) | + ((data[3] & 0xffULL) << 24) | ((data[4] & 0xffULL) << 32) | ((data[5] & 0xffULL) << 40) | + ((data[6] & 0xffULL) << 48) | ((data[7] & 0xffULL) << 56); +} + +std::string out32(int32_t val) { + char data[4]; + data[0] = val & 0xff; + data[1] = (val >> 8) & 0xff; + data[2] = (val >> 16) & 0xff; + data[3] = (val >> 24) & 0xff; + return std::string(data, sizeof(data)); +} + +std::string out64(int64_t val) { + char data[8]; + data[0] = val & 0xff; + data[1] = (val >> 8) & 0xff; + data[2] = (val >> 16) & 0xff; + data[3] = (val >> 24) & 0xff; + data[4] = (val >> 32) & 0xff; + data[5] = (val >> 40) & 0xff; + data[6] = (val >> 48) & 0xff; + data[7] = (val >> 56) & 0xff; + return std::string(data, sizeof(data)); +} + +} // end anonymous namespace + +namespace json { + +std::istream &operator>>(std::istream &in, bson_wrap o) { + json::vector &out = o.o; + std::streamoff start = in.tellg(); + 
std::streamoff end = start + get32(in); + out.clear(); + while (uint8_t type = get8(in)) { + if (!in) break; + if (in.tellg() >= end) { + std::cerr << "truncated array" << std::endl; + in.setstate(std::ios::failbit); + break; + } + std::string key; + getline(in, key, '\0'); + if (key != std::to_string(out.size())) std::cerr << "incorrect key in array" << std::endl; + switch (type) { + case 0x02: { + uint32_t len = get32(in) - 1; + std::string val; + val.resize(len); + in.read(&val[0], len); + out.push_back(val.c_str()); + if (in.get() != 0) { + std::cerr << "missing NUL in bson string" << std::endl; + in.setstate(std::ios::failbit); + } + break; + } + case 0x03: { + json::map obj; + in >> binary(obj); + out.push_back(std::move(obj)); + break; + } + case 0x04: { + json::vector obj; + in >> binary(obj); + out.push_back(std::move(obj)); + break; + } + case 0x08: + switch (get8(in)) { + case 0: + out.push_back(false); + break; + case 1: + out.push_back(true); + break; + default: + std::cerr << "invalid boolean value" << std::endl; + in.setstate(std::ios::failbit); + break; + } + break; + case 0x0a: + out.push_back(nullptr); + break; + case 0x10: + out.push_back(get32(in)); + break; + case 0x12: + out.push_back(get64(in)); + break; + case 0x7f: + case 0xff: + break; + default: + std::cerr << "unhandled bson tag " << hex(type) << std::endl; + break; + } + } + if (start != -1 && in && in.tellg() != end) { + std::cerr << "incorrect length for object" << std::endl; + } + return in; +} + +std::istream &operator>>(std::istream &in, bson_wrap o) { + json::map &out = o.o; + std::streamoff start = in.tellg(); + std::streamoff end = start + get32(in); + out.clear(); + while (uint8_t type = get8(in)) { + if (!in) break; + if (in.tellg() >= end) { + std::cerr << "truncated object" << std::endl; + in.setstate(std::ios::failbit); + break; + } + std::string key; + getline(in, key, '\0'); + if (out.count(key.c_str())) std::cerr << "duplicate key in map" << std::endl; + switch (type) { 
+ case 0x02: { + uint32_t len = get32(in) - 1; + std::string val; + val.resize(len); + in.read(&val[0], len); + out[key] = val; + if (in.get() != 0) { + std::cerr << "missing NUL in bson string" << std::endl; + in.setstate(std::ios::failbit); + } + break; + } + case 0x03: { + json::map obj; + in >> binary(obj); + out[key] = mkuniq(std::move(obj)); + break; + } + case 0x04: { + json::vector obj; + in >> binary(obj); + out[key] = mkuniq(std::move(obj)); + break; + } + case 0x08: + switch (get8(in)) { + case 0: + out[key] = mkuniq(False()); + break; + case 1: + out[key] = mkuniq(True()); + break; + default: + std::cerr << "invalid boolean value" << std::endl; + in.setstate(std::ios::failbit); + break; + } + break; + case 0x0a: + out[key] = std::unique_ptr(); + break; + case 0x10: + out[key] = get32(in); + break; + case 0x12: + out[key] = get64(in); + break; + case 0x7f: + case 0xff: + break; + default: + std::cerr << "unhandled bson tag " << hex(type) << std::endl; + break; + } + } + if (start != -1 && in && in.tellg() != end) { + std::cerr << "incorrect length for object" << std::endl; + } + return in; +} + +static std::unique_ptr map_is_vector(json::map &m) { + int idx = 0; + for (auto &el : m) { + if (*el.first != std::to_string(idx).c_str()) return nullptr; + ++idx; + } + if (idx == 0) return nullptr; + auto rv = mkuniq(); + for (auto &el : m) rv->push_back(std::move(el.second)); + // return std::move(rv); + return rv; +} + +std::istream &operator>>(std::istream &in, bson_wrap> json) { + json::map rv; + in >> binary(rv); + if (auto asvec = map_is_vector(rv)) + json.o = std::move(asvec); + else + json.o = mkuniq(std::move(rv)); + return in; +} + +std::string bson_encode(const json::vector &v); +std::string bson_encode(const json::map &m); + +std::string bson_encode_element(const std::string &key, const json::obj *o) { + if (!o) return '\x0A' + key + '\0'; + if (o->is()) return '\x08' + key + '\0' + '\1'; + if (o->is()) return '\x08' + key + '\0' + '\0'; + if 
(o->is()) { + auto &n = o->to(); + if (static_cast(n.val) == n.val) + return '\x10' + key + '\0' + out32(n.val); + else + return '\x12' + key + '\0' + out64(n.val); + } + if (o->is()) { + auto &s = o->to(); + return '\x02' + key + '\0' + out32(s.size() + 1) + s + '\0'; + } + if (o->is()) { + auto doc = bson_encode(o->to()); + return '\x04' + key + '\0' + out32(doc.size() + 4) + doc; + } + if (o->is()) { + auto doc = bson_encode(o->to()); + return '\x03' + key + '\0' + out32(doc.size() + 4) + doc; + } + assert(0); + return ""; // quiet warning +} + +std::string bson_encode(const json::vector &v) { + std::string rv; + int idx = 0; + for (auto &el : v) { + rv += bson_encode_element(std::to_string(idx), el.get()); + ++idx; + } + rv += '\0'; + return rv; +} +std::string bson_encode(const json::map &m) { + std::string rv; + for (auto &el : m) { + if (auto key = el.first->as_string()) + rv += bson_encode_element(*key, el.second.get()); + else + std::cerr << "Can't encode non-string key in bson object" << std::endl; + } + rv += '\0'; + return rv; +} + +std::ostream &operator<<(std::ostream &out, bson_wrap v) { + auto data = bson_encode(v.o); + out.write(out32(data.size() + 4).c_str(), 4); + out.write(data.data(), data.size()); + return out; +} +std::ostream &operator<<(std::ostream &out, bson_wrap m) { + auto data = bson_encode(m.o); + out.write(out32(data.size() + 4).c_str(), 4); + out.write(data.data(), data.size()); + return out; +} + +std::ostream &operator<<(std::ostream &out, bson_wrap json) { + if (auto m = json.o.as_map()) return out << binary(*m); + if (auto v = json.o.as_vector()) return out << binary(*v); + std::cerr << "object not map or vector can't be output as bson" << std::endl; + return out; +} + +} // end namespace json diff --git a/backends/tofino/bf-asm/bson.h b/backends/tofino/bf-asm/bson.h new file mode 100644 index 00000000000..d193123c322 --- /dev/null +++ b/backends/tofino/bf-asm/bson.h @@ -0,0 +1,74 @@ +/** + * Copyright (C) 2024 Intel Corporation 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_BSON_H_ +#define BACKENDS_TOFINO_BF_ASM_BSON_H_ + +#include + +#include "backends/tofino/bf-asm/json.h" + +namespace json { + +template +struct bson_wrap { + T &o; + bson_wrap(T &o) : o(o) {} // NOLINT(runtime/explicit) + template + bson_wrap(U &o) : o(o) {} // NOLINT(runtime/explicit) +}; + +template +bson_wrap binary(T &o) { + return bson_wrap(o); +} + +std::istream &operator>>(std::istream &in, bson_wrap> json); +std::istream &operator>>(std::istream &in, bson_wrap json); +std::istream &operator>>(std::istream &in, bson_wrap json); +inline std::istream &operator>>(std::istream &in, bson_wrap json) { + std::unique_ptr p; + in >> binary(p); + if (in) json.o = p.release(); + return in; +} + +std::ostream &operator<<(std::ostream &out, bson_wrap); +std::ostream &operator<<(std::ostream &out, bson_wrap); +std::ostream &operator<<(std::ostream &out, bson_wrap json); +inline std::ostream &operator<<(std::ostream &out, bson_wrap json) { + return operator<<(out, bson_wrap(json.o)); +} +inline std::ostream &operator<<(std::ostream &out, bson_wrap json) { + return operator<<(out, bson_wrap(json.o)); +} +inline std::ostream &operator<<(std::ostream &out, bson_wrap json) { + return operator<<(out, bson_wrap(json.o)); +} +inline std::ostream &operator<<(std::ostream &out, bson_wrap json) { + return out << binary(*json.o); +} +inline 
std::ostream &operator<<(std::ostream &out, bson_wrap json) { + return out << binary(*json.o); +} +inline std::ostream &operator<<(std::ostream &out, bson_wrap> json) { + return out << binary(*json.o.get()); +} + +} // end namespace json + +#endif /* BACKENDS_TOFINO_BF_ASM_BSON_H_ */ diff --git a/backends/tofino/bf-asm/checked_array.h b/backends/tofino/bf-asm/checked_array.h new file mode 100644 index 00000000000..0c047fd0474 --- /dev/null +++ b/backends/tofino/bf-asm/checked_array.h @@ -0,0 +1,146 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_CHECKED_ARRAY_H_ +#define BACKENDS_TOFINO_BF_ASM_CHECKED_ARRAY_H_ + +#include + +#include "bfas.h" // to get at the options +#include "lib/log.h" + +void print_regname(std::ostream &out, const void *addr, const void *end); + +template +class checked_array; +template +std::ostream &operator<<(std::ostream &out, checked_array *arr); + +template +class checked_array_base { + public: + virtual T &operator[](size_t) = 0; + virtual const T &operator[](size_t) const = 0; + virtual size_t size() const = 0; + virtual T *begin() = 0; + virtual T *end() = 0; + virtual bool modified() const = 0; + virtual void set_modified(bool v = true) = 0; + virtual bool disabled() const = 0; + virtual bool disable() = 0; + virtual bool disable_if_zero() = 0; + virtual void enable() = 0; +}; + +template +class checked_array : public checked_array_base { + bool disabled_; + T data[S]; + + public: + checked_array() : disabled_(false) {} + template + explicit checked_array(U v) : disabled_(false) { + for (auto &e : data) new (&e) T(v); + } + template + checked_array(const std::initializer_list &v) : disabled_(false) { + auto it = v.begin(); + for (auto &e : data) { + if (it == v.end()) break; + new (&e) T(*it++); + } + } + T &operator[](size_t idx) { + if (idx >= S) { + LOG1("ERROR: array index " << idx << " out of bounds " << this); + BUG("array index %zu out of bounds (%zu)", idx, S); + } + return data[idx]; + } + const T &operator[](size_t idx) const { + if (idx >= S) { + LOG1("ERROR: array index " << idx << " out of bounds " << this); + BUG("array index %zu out of bounds (%zu)", idx, S); + } + return data[idx]; + } + size_t size() const { return S; } + T *begin() { return data; } + T *end() { return data + S; } + bool modified() const { + for (size_t i = 0; i < S; i++) + if (data[i].modified()) return true; + return false; + } + void set_modified(bool v = true) { + for (size_t i = 0; i < S; i++) 
data[i].set_modified(v); + } + bool disabled() const { return disabled_; } + bool disable() { + bool rv = true; + for (size_t i = 0; i < S; i++) + if (!data[i].disable()) rv = false; + if (rv) disabled_ = true; + return rv; + } + void enable() { + disabled_ = false; + for (size_t i = 0; i < S; i++) data[i].enable(); + } + bool disable_if_unmodified() { + bool rv = true; + for (size_t i = 0; i < S; i++) + if (!data[i].disable_if_unmodified()) rv = false; + if (rv && !options.gen_json) { + /* Can't actually disable arrays when generating json, as walle doesn't like it, + * but allow containing object to be disabled */ + disabled_ = true; + } + return rv; + } + bool disable_if_zero() { + bool rv = true; + for (size_t i = 0; i < S; i++) + if (!data[i].disable_if_zero()) rv = false; + if (rv && !options.gen_json) { + /* Can't actually disable arrays when generating json, as walle doesn't like it, + * but allow containing object to be disabled */ + disabled_ = true; + } + return rv; + } + bool disable_if_reset_value() { + bool rv = true; + for (size_t i = 0; i < S; i++) + if (!data[i].disable_if_reset_value()) rv = false; + if (rv && !options.gen_json) { + /* Can't actually disable arrays when generating json, as walle doesn't like it, + * but allow containing object to be disabled */ + disabled_ = true; + } + return rv; + } +}; + +template +inline std::ostream &operator<<(std::ostream &out, checked_array *arr) { + print_regname(out, arr, arr + 1); + return out; +} + +#endif /* BACKENDS_TOFINO_BF_ASM_CHECKED_ARRAY_H_ */ diff --git a/backends/tofino/bf-asm/cmake/config.h.cmake b/backends/tofino/bf-asm/cmake/config.h.cmake new file mode 100644 index 00000000000..3e8ae79d73f --- /dev/null +++ b/backends/tofino/bf-asm/cmake/config.h.cmake @@ -0,0 +1,17 @@ +#ifndef __BFASM_CONFIG_H__ +#define __BFASM_CONFIG_H__ + +/* Define to 1 if you have the execinfo.h header */ +#cmakedefine HAVE_EXECINFO_H @HAVE_EXECINFO_H@ + +/* Define to 1 if you have the ucontext.h header */ 
+#cmakedefine HAVE_UCONTEXT_H @HAVE_UCONTEXT_H@ + +/* Schema version */ +#cmakedefine CONTEXT_SCHEMA_VERSION "@CONTEXT_SCHEMA_VERSION@" + +/* define the version */ +#cmakedefine TFAS_VERSION "@BFN_P4C_VERSION@" + + +#endif // __BFASM_CONFIG_H__ diff --git a/backends/tofino/bf-asm/constants.h b/backends/tofino/bf-asm/constants.h new file mode 100644 index 00000000000..374ef837287 --- /dev/null +++ b/backends/tofino/bf-asm/constants.h @@ -0,0 +1,242 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CONSTANTS_H_ +#define CONSTANTS_H_ + +enum { + /* global constants related to MAU stage */ + LOGICAL_TABLES_PER_STAGE = 16, + PHYSICAL_TABLES_PER_STAGE = 16, + TCAM_TABLES_PER_STAGE = 8, + SRAM_ROWS = 8, + LOGICAL_SRAM_ROWS = 16, + SRAM_UNITS_PER_ROW = 12, + MAPRAM_UNITS_PER_ROW = 6, + MEM_WORD_WIDTH = 128, + SRAM_DEPTH_BITS = 10, + SRAM_DEPTH = 1 << SRAM_DEPTH_BITS, + LAMB_DEPTH_BITS = 6, + LAMB_DEPTH = 1 << LAMB_DEPTH_BITS, + TCAM_ROWS = 12, + TCAM_UNITS_PER_ROW = 2, + TCAM_XBAR_GROUPS = 12, + TCAM_XBAR_GROUP_SIZE = 44, + TCAM_XBAR_INPUT_BYTES = 68, + TCAM_VPN_BITS = 6, + TCAM_WORD_BITS = 9, + TCAM_FORMAT_WIDTH = 47, + TCAM_PAYLOAD_BITS = 1, + TCAM_PAYLOAD_BITS_START = 0, + TCAM_MATCH_BITS_START = TCAM_PAYLOAD_BITS_START + TCAM_PAYLOAD_BITS, + TCAM_PARITY_BITS = 2, + TCAM_PARITY_BITS_START = 45, + TCAM_VERSION_BITS = 2, + TCAM_VERSION_BITS_START = 43, + EXACT_XBAR_GROUPS = 8, + EXACT_XBAR_GROUP_SIZE = 128, + BYTE_XBAR_GROUPS = 8, + BYTE_XBAR_GROUP_SIZE = 8, + GALOIS_FIELD_MATRIX_COLUMNS = 52, + EXACT_HASH_GROUP_SIZE = 52, + EXACT_HASH_ADR_BITS = 10, + EXACT_HASH_ADR_GROUPS = 5, + EXACT_HASH_SELECT_BITS = 12, + EXACT_HASH_FIRST_SELECT_BIT = EXACT_HASH_GROUP_SIZE - EXACT_HASH_SELECT_BITS, + EXACT_VPN_BITS = 9, + EXACT_WORD_BITS = 10, + NEXT_TABLE_MAX_RAM_EXTRACT_BITS = 8, + MAX_LONGBRANCH_TAGS = 8, + MAX_IMMED_ACTION_DATA = 32, + ACTION_DATA_8B_SLOTS = 16, + ACTION_DATA_16B_SLOTS = 24, + ACTION_DATA_32B_SLOTS = 16, + ACTION_DATA_BUS_SLOTS = ACTION_DATA_8B_SLOTS + ACTION_DATA_16B_SLOTS + ACTION_DATA_32B_SLOTS, + ACTION_DATA_BUS_BYTES = + ACTION_DATA_8B_SLOTS + 2 * ACTION_DATA_16B_SLOTS + 4 * ACTION_DATA_32B_SLOTS, + ACTION_HV_XBAR_SLICES = 8, + ACTION_HV_XBAR_SLICE_SIZE = 16, + ACTION_INSTRUCTION_SUCCESSOR_TABLE_DEPTH = 8, + ACTION_INSTRUCTION_ADR_ENABLE = 0x40, + ACTION_IMEM_SLOTS = 32, + ACTION_IMEM_COLORS = 2, + ACTION_IMEM_ADDR_MAX = ACTION_IMEM_SLOTS * ACTION_IMEM_COLORS, + 
ACTION_ALWAYS_RUN_IMEM_ADDR = 63, + SELECTOR_PORTS_PER_WORD = 120, + STATEFUL_PREDICATION_ENCODE_NOOP = 0, + STATEFUL_PREDICATION_ENCODE_NOTCMPHI = 3, + STATEFUL_PREDICATION_ENCODE_NOTCMPLO = 5, + STATEFUL_PREDICATION_ENCODE_CMPLO = 0xaaaa, + STATEFUL_PREDICATION_ENCODE_CMPHI = 0xcccc, + STATEFUL_PREDICATION_ENCODE_CMP0 = 0xaaaa, + STATEFUL_PREDICATION_ENCODE_CMP1 = 0xcccc, + STATEFUL_PREDICATION_ENCODE_CMP2 = 0xf0f0, + STATEFUL_PREDICATION_ENCODE_CMP3 = 0xff00, + STATEFUL_PREDICATION_ENCODE_UNCOND = 0xffff, + STATEFUL_PREDICATION_OUTPUT = 6, + // See bf-drivers/include/pipe_mgr/pipe_mgr_intf.h for the definitions + TYPE_ENUM_SHIFT = 24, + PIPE_ID_SHIFT = 28, + REGISTER_PARAM_HANDLE_START = (0x08 << TYPE_ENUM_SHIFT), + ACTION_HANDLE_START = (0x20 << TYPE_ENUM_SHIFT), + FIELD_HANDLE_START = (0x9 << TYPE_ENUM_SHIFT), + PER_FLOW_ENABLE_BITS = 1, + METER_TYPE_BITS = 3, + // Order is METER_TYPE, METER_PFE, METER_ADDRESS + METER_TYPE_START_BIT = 24, + METER_LOWER_HUFFMAN_BITS = 7, + METER_ADDRESS_BITS = 23, + METER_FULL_ADDRESS_BITS = METER_ADDRESS_BITS + PER_FLOW_ENABLE_BITS + METER_TYPE_BITS, + METER_ADDRESS_ZERO_PAD = 23, + METER_PER_FLOW_ENABLE_START_BIT = 23, + IDLETIME_BUSSES = 20, + IDLETIME_BUSSES_PER_HALF = IDLETIME_BUSSES / 2, + IDLETIME_ADDRESS_PER_FLOW_ENABLE_START_BIT = 20, + IDLETIME_ADDRESS_BITS = 20, + IDLETIME_FULL_ADDRESS_BITS = IDLETIME_ADDRESS_BITS + PER_FLOW_ENABLE_BITS, + IDLETIME_ADDRESS_ZERO_PAD = 4, + IDLETIME_HUFFMAN_BITS = 4, + SELECTOR_METER_TYPE_START_BIT = METER_TYPE_START_BIT, + SELECTOR_LOWER_HUFFMAN_BITS = METER_LOWER_HUFFMAN_BITS, + SELECTOR_METER_ADDRESS_BITS = METER_ADDRESS_BITS, + SELECTOR_PER_FLOW_ENABLE_START_BIT = METER_PER_FLOW_ENABLE_START_BIT, + SELECTOR_VHXBAR_HASH_BUS_INDEX = 3, + SELECTOR_LENGTH_MOD_BITS = 5, + STAT_ADDRESS_BITS = 19, + STAT_FULL_ADDRESS_BITS = STAT_ADDRESS_BITS + PER_FLOW_ENABLE_BITS, + STAT_ADDRESS_ZERO_PAD = 7, + STAT_METER_COLOR_LOWER_HUFFMAN_BITS = 3, + STATISTICS_PER_FLOW_ENABLE_START_BIT = 19, + 
STATISTICS_PER_FLOW_SHIFT_COUNT = 7, + ACTION_ADDRESS_ZERO_PAD = 5, + ACTION_ADDRESS_BITS = 22, + ACTION_FULL_ADDRESS_BITS = 23, + ACTION_DATA_PER_FLOW_ENABLE_START_BIT = ACTION_ADDRESS_BITS, + ACTION_DATA_LOWER_HUFFMAN_BITS = 5, + ACTION_DATA_UPPER_HUFFMAN_BITS = 2, + ACTION_DATA_HUFFMAN_BITS = ACTION_DATA_LOWER_HUFFMAN_BITS + ACTION_DATA_UPPER_HUFFMAN_BITS, + ACTION_DATA_HUFFMAN_DIFFERENCE = 10, + MAX_PORTS = 288, + MAX_LRT_ENTRIES = 3, + UPPER_MATCH_CENTRAL_FIRST_ROW = SRAM_ROWS / 2, + UPPER_MATCH_CENTRAL_FIRST_LOGICAL_ROW = UPPER_MATCH_CENTRAL_FIRST_ROW * 2, + CHECKSUM_ENGINE_PHVID_TOFINO_LOW = 224, + CHECKSUM_ENGINE_PHVID_TOFINO_HIGH = 235, + CHECKSUM_ENGINE_PHVID_TOFINO_PER_GRESS = 6, + CONSTANTS_PHVID_JBAY_LOW = 224, + CONSTANTS_PHVID_JBAY_HIGH = 232, +}; + +enum METER_ACCESS_TYPE { + NOP = 0, + METER_LPF_COLOR_BLIND = 2, + METER_SELECTOR = 4, + METER_COLOR_AWARE = 6, + STATEFUL_INSTRUCTION_0 = 1, + STATEFUL_INSTRUCTION_1 = 3, + STATEFUL_INSTRUCTION_2 = 5, + STATEFUL_INSTRUCTION_3 = 7, + METER_COLOR_ACCESS = -1 // special for color mapram access +}; + +/* constants for various config params */ +#include +#undef OVERFLOW /* get rid of global preproc define from math.h */ +namespace UnitRam { +enum { + MATCH = 1, + ACTION = 2, + STATISTICS = 3, + METER = 4, + STATEFUL = 5, + TERNARY_INDIRECTION = 6, + SELECTOR = 7, + HASH_ACTION = 8, +}; +namespace DataMux { +enum { + STATISTICS = 0, + METER = 1, + OVERFLOW = 2, + OVERFLOW2 = 3, + ACTION = 4, + NONE = 7, +}; +} // namespace DataMux +namespace AdrMux { +enum { + ACTION = 1, + TERNARY_INDIRECTION = 2, + OVERFLOW = 4, + STATS_METERS = 5, + SELECTOR_ALU = 6, + SELECTOR_OVERFLOW = 7, + SELECTOR_ACTION_OVERFLOW = 8, +}; +} // namespace AdrMux +} // namespace UnitRam +namespace AdrDist { +enum { + ACTION = 0, + STATISTICS = 1, + METER = 2, + OVERFLOW = 3, +}; +} // namespace AdrDist +namespace MapRam { +enum { + STATISTICS = 1, + METER = 2, + STATEFUL = 3, + IDLETIME = 4, + COLOR = 5, + SELECTOR_SIZE = 6, +}; 
+namespace Mux { +enum { + SYSTEM = 0, + SYNTHETIC_TWO_PORT = 1, + IDLETIME = 2, + COLOR = 3, +}; +} // namespace Mux +namespace ColorBus { +enum { + NONE = 0, + COLOR = 1, + OVERFLOW = 2, + OVERFLOW_2 = 3, +}; +} // namespace ColorBus +} // namespace MapRam +namespace BusHashGroup { +enum { + SELECTOR_MOD = 0, + METER_ADDRESS = 1, + STATISTICS_ADDRESS = 2, + ACTION_DATA_ADDRESS = 3, + IMMEDIATE_DATA = 4, +}; +} // namespace BusHashGroup +namespace MoveReg { +enum { + STATS = 0, + METER = 1, + IDLE = 2, +}; +} // namespace MoveReg +#endif /* CONSTANTS_H_ */ diff --git a/backends/tofino/bf-asm/counter.cpp b/backends/tofino/bf-asm/counter.cpp new file mode 100644 index 00000000000..04c497f990c --- /dev/null +++ b/backends/tofino/bf-asm/counter.cpp @@ -0,0 +1,404 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "data_switchbox.h" +#include "input_xbar.h" +#include "lib/algorithm.h" +#include "misc.h" + +// target specific template specializations +#include "jbay/counter.h" +#include "tofino/counter.h" + +void CounterTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::Statistics); + if (!format) error(lineno, "No format specified in table %s", name()); + for (auto &kv : MapIterChecked(data, true)) { + if (common_setup(kv, data, P4Table::Statistics)) { + } else if (kv.key == "count") { + if (kv.value == "bytes") + type = BYTES; + else if (kv.value == "packets") + type = PACKETS; + else if (kv.value == "both" || kv.value == "packets_and_bytes") + type = BOTH; + else + error(kv.value.lineno, "Unknown counter type %s", value_desc(kv.value)); + } else if (kv.key == "teop") { + if (gress != EGRESS) error(kv.value.lineno, "tEOP can only be used in EGRESS"); + if (!Target::SUPPORT_TRUE_EOP()) + error(kv.value.lineno, "tEOP is not available on device"); + if (CHECKTYPE(kv.value, tINT)) { + teop = kv.value.i; + if (teop < 0 || teop > 3) + error(kv.value.lineno, "Invalid tEOP bus %d, valid values are 0-3", teop); + BUG_CHECK(!stage->teop[teop].first, + "previously used tEOP bus %d used again in stage %d", teop, + stage->stageno); + stage->teop[teop] = {true, stage->stageno}; + } + } else if (kv.key == "lrt") { + if (!CHECKTYPE2(kv.value, tVEC, tMAP)) continue; + collapse_list_of_maps(kv.value, true); + if (kv.value.type == tVEC) { + for (auto &el : kv.value.vec) lrt.emplace_back(el); + } else if (kv.value.map.size >= 1 && kv.value.map[0].key.type == tSTR) { + lrt.emplace_back(kv.value); + } else { + for (auto &el : kv.value.map) { + if (CHECKTYPE2(el.key, tINT, tBIGINT) && CHECKTYPE(el.value, tINT)) { + lrt.emplace_back(el.key.lineno, + get_int64(el.key, 64, "Threshold too large"), el.value.i); + } + } + } 
+ } else if (kv.key == "bytecount_adjust") { + if (CHECKTYPE(kv.value, tINT)) { + bytecount_adjust = kv.value.i; + } + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (teop >= 0 && type != BYTES && type != BOTH) + error(lineno, "tEOP bus can only used when counting bytes"); + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_srams(); + else + alloc_rams(true, stage->sram_use); +} + +CounterTable::lrt_params::lrt_params(const value_t &m) + : lineno(m.lineno), threshold(-1), interval(-1) { + if (CHECKTYPE(m, tMAP)) { + for (auto &kv : MapIterChecked(m.map, true)) { + if (kv.key == "threshold") { + if (CHECKTYPE2(kv.value, tINT, tBIGINT)) + threshold = get_int64(kv.value, 64, "Threshold too large"); + } else if (kv.key == "interval") { + if (CHECKTYPE(kv.value, tINT)) interval = kv.value.i; + } else { + warning(kv.key.lineno, "ignoring unknown item %s in lrt params", + value_desc(kv.key)); + } + } + if (threshold < 0) error(m.lineno, "No threshold in lrt params"); + if (interval < 0) error(m.lineno, "No interval in lrt params"); + } +} + +void CounterTable::pass1() { + LOG1("### Counter table " << name() << " pass1 " << loc()); + if (!p4_table) + p4_table = P4Table::alloc(P4Table::Statistics, this); + else + p4_table->check(this); + alloc_vpns(); + alloc_maprams(); + std::sort(layout.begin(), layout.end(), + [](const Layout &a, const Layout &b) -> bool { return a.row > b.row; }); + // stage->table_use[timing_thread(gress)] |= Stage::USE_SELECTOR; + int prev_row = -1; + for (auto &row : layout) { + if (home_rows.count(row.row)) prev_row = -1; + + if (prev_row >= 0) + need_bus(lineno, stage->overflow_bus_use, row.row, "Overflow"); + else + need_bus(lineno, stage->stats_bus_use, row.row, "Statistics data"); + for (int r = (row.row + 1) | 1; r < prev_row; r += 2) + need_bus(lineno, stage->overflow_bus_use, r, "Overflow"); + prev_row = row.row; + } + Synth2Port::pass1(); + int update_interval_bits = 29; + 
// Tofino didn't have enough bits to cover all possible values of + // the update interval. The compiler should have saturated it to + // the max value. Check that has been done here. + if (options.target == TOFINO) update_interval_bits = 28; + for (auto &l : lrt) { + if (l.interval >= (1 << update_interval_bits)) + error(l.lineno, "lrt update interval too large"); + } + if (lrt.size() > MAX_LRT_ENTRIES) + error(lrt[0].lineno, "Too many lrt entries (max %d)", MAX_LRT_ENTRIES); +} + +void CounterTable::pass2() { + LOG1("### Counter table " << name() << " pass2 " << loc()); + if (logical_id < 0) warning(lineno, "counter %s appears unused by any table", name()); +} + +void CounterTable::pass3() { LOG1("### Counter table " << name() << " pass3 " << loc()); } + +static int counter_size[] = {0, 0, 1, 2, 3, 0, 4}; +static int counter_masks[] = {0, 7, 3, 4, 1, 0, 0}; +static int counter_shifts[] = {0, 3, 2, 3, 1, 0, 2}; +static int counter_hole_swizzle[] = {0, 0, 0, 1, 0, 0, 2}; + +int CounterTable::direct_shiftcount() const { + return 64 + STAT_ADDRESS_ZERO_PAD - counter_shifts[format->groups()]; +} + +int CounterTable::indirect_shiftcount() const { + return STAT_ADDRESS_ZERO_PAD - counter_shifts[format->groups()]; +} + +int CounterTable::address_shift() const { return counter_shifts[format->groups()]; } + +unsigned CounterTable::determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) const { + if (call.args[0].name() && strcmp(call.args[0].name(), "$DIRECT") == 0) { + return direct_shiftcount() + tcam_shift; + } else if (call.args[0].field()) { + BUG_CHECK(unsigned(call.args[0].field()->by_group[group]->bit(0) / 128) == word); + return call.args[0].field()->by_group[group]->bit(0) % 128 + indirect_shiftcount(); + } else if (call.args[1].field()) { + return call.args[1].field()->by_group[group]->bit(0) % 128 + STAT_ADDRESS_ZERO_PAD; + } + return 0; +} + +template +void CounterTable::write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int 
bus, + const std::vector &args) { + auto &merge = regs.rams.match.merge; + unsigned adr_mask = 0; + unsigned per_entry_en_mux_ctl = 0; + unsigned adr_default = 0; + + if (args[0].type == Table::Call::Arg::Name && args[0].name() != nullptr && + strcmp(args[0].name(), "$DIRECT") == 0) { + adr_mask |= ((1U << STAT_ADDRESS_BITS) - 1) & ~counter_masks[format->groups()]; + } else if (args[0].type == Table::Call::Arg::Field && args[0].field() != nullptr) { + auto addr = args[0].field(); + auto address_bits = addr->size; + adr_mask |= ((1U << address_bits) - 1) << (counter_shifts[format->groups()]); + } + + if (args[1].type == Table::Call::Arg::Name && args[1].name() != nullptr && + strcmp(args[1].name(), "$DEFAULT") == 0) { + adr_default = (1U << STATISTICS_PER_FLOW_ENABLE_START_BIT); + } else if (args[1].type == Table::Call::Arg::Field) { + if (args[0].type == Table::Call::Arg::Field) { + per_entry_en_mux_ctl = args[1].field()->bit(0) - args[0].field()->bit(0); + per_entry_en_mux_ctl += counter_shifts[format->groups()]; + } else if (args[0].type == Table::Call::Arg::HashDist) { + per_entry_en_mux_ctl = 0; + } + } + + merge.mau_stats_adr_mask[type][bus] = adr_mask; + merge.mau_stats_adr_default[type][bus] = adr_default; + merge.mau_stats_adr_per_entry_en_mux_ctl[type][bus] = per_entry_en_mux_ctl; + merge.mau_stats_adr_hole_swizzle_mode[type][bus] = counter_hole_swizzle[format->groups()]; +} + +template +void CounterTable::write_regs_vt(REGS ®s) { + LOG1("### Counter table " << name() << " write_regs " << loc()); + // FIXME -- factor common AttachedTable::write_regs + // FIXME -- factor common Synth2Port::write_regs + // FIXME -- factor common MeterTable::write_regs + Layout *home = nullptr; + bool push_on_overflow = false; + auto &map_alu = regs.rams.map_alu; + auto &adrdist = regs.rams.match.adrdist; + DataSwitchboxSetup *swbox = nullptr; + std::vector stats_groups; + int minvpn, maxvpn; + + layout_vpn_bounds(minvpn, maxvpn, true); + for (Layout &logical_row : layout) { 
+ unsigned row = logical_row.row / 2U; + unsigned side = logical_row.row & 1; /* 0 == left 1 == right */ + BUG_CHECK(side == 1); /* no map rams or alus on left side anymore */ + /* FIXME factor vpn/mapram stuff with selection.cpp */ + auto vpn = logical_row.vpns.begin(); + auto mapram = logical_row.maprams.begin(); + auto &map_alu_row = map_alu.row[row]; + auto home_it = home_rows.find(logical_row.row); + if (home_it != home_rows.end()) { + home = &logical_row; + swbox = new DataSwitchboxSetup(regs, this, logical_row.row, + (++home_it == home_rows.end()) ? -1 : *home_it); + + stats_groups.push_back(swbox->get_home_row() / 2); + + if (swbox->get_home_row() != row) swbox->setup_row(swbox->get_home_row()); + } + BUG_CHECK(home != nullptr); + LOG2("# DataSwitchbox.setup(" << row << ") home=" << home->row / 2U); + swbox->setup_row(row); + for (auto &memunit : logical_row.memunits) { + int logical_col = memunit.col; + unsigned col = logical_col + 6 * side; + swbox->setup_row_col(row, col, *vpn); + write_mapram_regs(regs, row, *mapram, *vpn, MapRam::STATISTICS); + if (gress) regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row); + ++mapram, ++vpn; + } + if (&logical_row == home) { + int stats_group_index = swbox->get_home_row() / 2; + auto &stats = map_alu.stats_wrap[stats_group_index].stats; + auto &stat_ctl = stats.statistics_ctl; + stat_ctl.stats_entries_per_word = format->groups(); + if (type & BYTES) stat_ctl.stats_process_bytes = 1; + if (type & PACKETS) stat_ctl.stats_process_packets = 1; + // The configuration values for threshold and interval are passed + // in directly to the assembler. Any adjustment required based + // on the counter type has already been done. 
+ if (lrt.size() > 0) { + stat_ctl.lrt_enable = 1; + int idx = 0; + for (auto &l : lrt) { + stats.lrt_threshold[idx] = l.threshold; + stats.lrt_update_interval[idx] = l.interval; + ++idx; + } + } + stat_ctl.stats_alu_egress = timing_thread(gress); + if (type == BYTES || type == BOTH) { + auto stats_bytecount_adjust_size = stat_ctl.stats_bytecount_adjust.size(); + auto stats_bytecount_adjust_mask = ((1U << stats_bytecount_adjust_size) - 1); + int bytecount_adjust_max = (1U << (stats_bytecount_adjust_size - 1)) - 1; + int bytecount_adjust_min = -1 * (1U << (stats_bytecount_adjust_size - 1)); + if (bytecount_adjust > bytecount_adjust_max || + bytecount_adjust < bytecount_adjust_min) { + error(lineno, + "The bytecount adjust value of %d on counter %s " + "does not fit within allowed range for %d bits - { %d, %d }", + bytecount_adjust, name(), stats_bytecount_adjust_size, + bytecount_adjust_min, bytecount_adjust_max); + } + stat_ctl.stats_bytecount_adjust = bytecount_adjust & stats_bytecount_adjust_mask; + } + stat_ctl.stats_alu_error_enable = 0; // TODO + if (logical_id >= 0) regs.cfg_regs.mau_cfg_stats_alu_lt[stats_group_index] = logical_id; + // setup_muxctl(adrdist.stats_alu_phys_to_logical_ixbar_ctl[row/2], logical_id); + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_base = minvpn; + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_limit = maxvpn; + } else { + auto &adr_ctl = map_alu_row.vh_xbars.adr_dist_oflo_adr_xbar_ctl[side]; + if (swbox->get_home_row_logical() >= 8 && logical_row.row < 8) { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = 0; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::OVERFLOW; + push_on_overflow = true; + BUG_CHECK(options.target == TOFINO); + } else { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = swbox->get_home_row_logical() % 8; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::STATISTICS; + } + adr_ctl.adr_dist_oflo_adr_xbar_enable = 1; + } + } + bool run_at_eop = this->run_at_eop(); + if 
(home_rows.size() > 1) write_alu_vpn_range(regs); + + BUG_CHECK(stats_groups.size() == home_rows.size()); + bool first_stats_group = true; + for (int &idx : stats_groups) { + auto &movereg_stats_ctl = adrdist.movereg_stats_ctl[idx]; + for (MatchTable *m : match_tables) { + run_at_eop = run_at_eop || m->run_at_eop(); + adrdist.adr_dist_stats_adr_icxbar_ctl[m->logical_id] |= 1U << idx; + auto &dump_ctl = regs.cfg_regs.stats_dump_ctl[m->logical_id]; + dump_ctl.stats_dump_entries_per_word = format->groups(); + if (type == BYTES || type == BOTH) dump_ctl.stats_dump_has_bytes = 1; + if (type == PACKETS || type == BOTH) dump_ctl.stats_dump_has_packets = 1; + dump_ctl.stats_dump_offset = minvpn; + dump_ctl.stats_dump_size = maxvpn; + if (direct) { + adrdist.movereg_ad_direct[MoveReg::STATS] |= 1U << m->logical_id; + if (m->is_ternary()) movereg_stats_ctl.movereg_stats_ctl_tcam = 1; + } + movereg_stats_ctl.movereg_stats_ctl_lt = m->logical_id; + // The first ALU will drive this xbar register + if (first_stats_group) { + adrdist.movereg_ad_stats_alu_to_logical_xbar_ctl[m->logical_id / 8U].set_subfield( + 4 + idx, 3 * (m->logical_id % 8U), 3); + } + adrdist.mau_ad_stats_virt_lt[idx] |= 1U << m->logical_id; + } + movereg_stats_ctl.movereg_stats_ctl_size = counter_size[format->groups()]; + movereg_stats_ctl.movereg_stats_ctl_direct = direct; + if (run_at_eop) { + if (teop >= 0) { + setup_teop_regs(regs, idx); + } else { + adrdist.deferred_ram_ctl[MoveReg::STATS][idx].deferred_ram_en = 1; + adrdist.deferred_ram_ctl[MoveReg::STATS][idx].deferred_ram_thread = gress; + if (gress) regs.cfg_regs.mau_cfg_dram_thread |= 1 << idx; + movereg_stats_ctl.movereg_stats_ctl_deferred = 1; + } + adrdist.stats_bubble_req[timing_thread(gress)].bubble_req_1x_class_en |= 1 << (4 + idx); + } else { + adrdist.packet_action_at_headertime[0][idx] = 1; + adrdist.stats_bubble_req[timing_thread(gress)].bubble_req_1x_class_en |= 1 << idx; + } + if (push_on_overflow) { + adrdist.deferred_oflo_ctl = 1 << 
((home->row - 8) / 2U); + adrdist.oflo_adr_user[0] = adrdist.oflo_adr_user[1] = AdrDist::STATISTICS; + } + first_stats_group = false; + } +} + +void CounterTable::gen_tbl_cfg(json::vector &out) const { + // FIXME -- factor common Synth2Port stuff + auto spare_mems = determine_spare_bank_memory_units(); + int size = (layout_size() - spare_mems.size()) * SRAM_DEPTH * format->groups(); + json::map &tbl = *base_tbl_cfg(out, "statistics", size); + json::map &stage_tbl = *add_stage_tbl_cfg(tbl, "statistics", size); + if (home_rows.size() > 1) + add_alu_indexes(stage_tbl, "stats_alu_index"); + else + add_alu_index(stage_tbl, "stats_alu_index"); + tbl["enable_pfe"] = per_flow_enable; + tbl["pfe_bit_position"] = per_flow_enable_bit(); + if (auto *f = lookup_field("bytes")) + tbl["byte_counter_resolution"] = f->size; + else + tbl["byte_counter_resolution"] = INT64_C(0); + if (auto *f = lookup_field("packets")) + tbl["packet_counter_resolution"] = f->size; + else + tbl["packet_counter_resolution"] = INT64_C(0); + switch (type) { + case PACKETS: + tbl["statistics_type"] = "packets"; + break; + case BYTES: + tbl["statistics_type"] = "bytes"; + break; + case BOTH: + tbl["statistics_type"] = "packets_and_bytes"; + break; + default: + break; + } + if (context_json) stage_tbl.merge(*context_json); +} + +DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(CounterTable, TARGET_CLASS) +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void CounterTable::write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + { write_merge_regs_vt(regs, match, type, bus, args); }) diff --git a/backends/tofino/bf-asm/crash.cpp b/backends/tofino/bf-asm/crash.cpp new file mode 100644 index 00000000000..a092bce2696 --- /dev/null +++ b/backends/tofino/bf-asm/crash.cpp @@ -0,0 +1,281 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "backends/tofino/bf-asm/config.h" +#if HAVE_EXECINFO_H +#include +#endif +#include +#include +#include +#include + +#include +#if HAVE_UCONTEXT_H +#include +#endif +#include + +#include + +#include "bfas.h" +#include "exename.h" +#include "lib/hex.h" +#include "lib/log.h" + +using namespace P4; + +static const char *signames[] = { + "NONE", "HUP", "INT", "QUIT", "ILL", "TRAP", "ABRT", "BUS", "FPE", "KILL", "USR1", + "SEGV", "USR2", "PIPE", "ALRM", "TERM", "STKFLT", "CHLD", "CONT", "STOP", "TSTP", "TTIN", + "TTOU", "URG", "XCPU", "XFSZ", "VTALRM", "PROF", "WINCH", "POLL", "PWR", "SYS"}; + +char *program_name = nullptr; + +#ifdef MULTITHREAD +#include + +#include +std::vector thread_ids; +__thread int my_id; + +void register_thread() { + static std::mutex lock; + std::lock_guard acquire(lock); + my_id = thread_ids.size(); + thread_ids.push_back(pthread_self()); +} +#define MTONLY(...) __VA_ARGS__ +#else +#define MTONLY(...) 
+#endif // MULTITHREAD + +static MTONLY(__thread) int shutdown_loop = 0; // avoid infinite loop if shutdown crashes + +static void sigint_shutdown(int sig, siginfo_t *, void *) { + if (shutdown_loop++) _exit(-1); + LOG1("Exiting with SIG" << signames[sig]); + _exit(sig + 0x80); +} + +/* + * call external program addr2line WITHOUT using malloc or stdio or anything + * else that might be problematic if there's memory corruption or exhaustion + */ +const char *addr2line(void *addr, const char *text) { + MTONLY(static std::mutex lock; std::lock_guard acquire(lock);) + static pid_t child = 0; + static int to_child, from_child; + static char binary[PATH_MAX]; + static char buffer[PATH_MAX]; + const char *t; + + if (!text || !(t = strchr(text, '('))) { + text = exename(program_name); + t = text + strlen(text); + } + memcpy(buffer, text, t - text); + buffer[t - text] = 0; + if (child && strcmp(binary, buffer)) { + child = 0; + close(to_child); + close(from_child); + } + memcpy(binary, buffer, (t - text) + 1); + text = binary; + if (!child) { + int pfd1[2], pfd2[2]; + char *p = buffer; + const char *argv[4] = {"/bin/sh", "-c", buffer, 0}; + strcpy(p, "addr2line "); // NOLINT + p += strlen(p); + strcpy(p, " -Cfspe "); // NOLINT + p += strlen(p); + t = text + strlen(text); + if (!memchr(text, '/', t - text)) { + strcpy(p, "$(which "); // NOLINT + p += strlen(p); + } + memcpy(p, text, t - text); + p += t - text; + if (!memchr(text, '/', t - text)) *p++ = ')'; + *p = 0; + child = -1; +#if HAVE_PIPE2 + if (pipe2(pfd1, O_CLOEXEC) < 0) return 0; + if (pipe2(pfd2, O_CLOEXEC) < 0) return 0; +#else + if (pipe(pfd1) < 0) return 0; + if (pipe(pfd2) < 0) return 0; + fcntl(pfd1[0], F_SETFD, FD_CLOEXEC | fcntl(pfd1[0], F_GETFL)); + fcntl(pfd1[1], F_SETFD, FD_CLOEXEC | fcntl(pfd1[1], F_GETFL)); + fcntl(pfd2[0], F_SETFD, FD_CLOEXEC | fcntl(pfd2[0], F_GETFL)); + fcntl(pfd2[1], F_SETFD, FD_CLOEXEC | fcntl(pfd2[1], F_GETFL)); +#endif + while ((child = fork()) == -1 && errno == EAGAIN) { + } + 
if (child == -1) return 0; + if (child == 0) { + dup2(pfd1[1], 1); + dup2(pfd1[1], 2); + dup2(pfd2[0], 0); + execvp(argv[0], (char *const *)argv); + _exit(-1); + } + close(pfd1[1]); + from_child = pfd1[0]; + close(pfd2[0]); + to_child = pfd2[1]; + } + if (child == -1) return 0; + char *p = buffer; + uintptr_t a = (uintptr_t)addr; + int shift = (CHAR_BIT * sizeof(uintptr_t) - 1) & ~3; + while (shift > 0 && (a >> shift) == 0) shift -= 4; + while (shift >= 0) { + *p++ = "0123456789abcdef"[(a >> shift) & 0xf]; + shift -= 4; + } + *p++ = '\n'; + auto _unused = write(to_child, buffer, p - buffer); + (void)_unused; + p = buffer; + int len; + while (p < buffer + sizeof(buffer) - 1 && + (len = read(from_child, p, buffer + sizeof(buffer) - p - 1)) > 0 && (p += len) && + !memchr(p - len, '\n', len)) { + } + *p = 0; + if ((p = strchr(buffer, '\n'))) *p = 0; + if (buffer[0] == 0 || buffer[0] == '?') return 0; + return buffer; +} + +#if HAVE_UCONTEXT_H +static void dumpregs(mcontext_t *mctxt) { +#if defined(REG_EAX) + LOG1(" eax=" << P4::hex(mctxt->gregs[REG_EAX], 8, '0') + << " ebx=" << P4::hex(mctxt->gregs[REG_EBX], 8, '0') + << " ecx=" << P4::hex(mctxt->gregs[REG_ECX], 8, '0') + << " edx=" << P4::hex(mctxt->gregs[REG_EDX], 8, '0')); + LOG1(" edi=" << P4::hex(mctxt->gregs[REG_EDI], 8, '0') + << " esi=" << P4::hex(mctxt->gregs[REG_ESI], 8, '0') + << " ebp=" << P4::hex(mctxt->gregs[REG_EBP], 8, '0') + << " esp=" << P4::hex(mctxt->gregs[REG_ESP], 8, '0')); +#elif defined(REG_RAX) + LOG1(" rax=" << P4::hex(mctxt->gregs[REG_RAX], 16, '0') + << " rbx=" << P4::hex(mctxt->gregs[REG_RBX], 16, '0') + << " rcx=" << P4::hex(mctxt->gregs[REG_RCX], 16, '0')); + LOG1(" rdx=" << P4::hex(mctxt->gregs[REG_RDX], 16, '0') + << " rdi=" << P4::hex(mctxt->gregs[REG_RDI], 16, '0') + << " rsi=" << P4::hex(mctxt->gregs[REG_RSI], 16, '0')); + LOG1(" rbp=" << P4::hex(mctxt->gregs[REG_RBP], 16, '0') + << " rsp=" << P4::hex(mctxt->gregs[REG_RSP], 16, '0') + << " r8=" << P4::hex(mctxt->gregs[REG_R8], 16, 
'0')); + LOG1(" r9=" << P4::hex(mctxt->gregs[REG_R9], 16, '0') + << " r10=" << P4::hex(mctxt->gregs[REG_R10], 16, '0') + << " r11=" << P4::hex(mctxt->gregs[REG_R11], 16, '0')); + LOG1(" r12=" << P4::hex(mctxt->gregs[REG_R12], 16, '0') + << " r13=" << P4::hex(mctxt->gregs[REG_R13], 16, '0') + << " r14=" << P4::hex(mctxt->gregs[REG_R14], 16, '0')); + LOG1(" r15=" << P4::hex(mctxt->gregs[REG_R15], 16, '0')); +#elif defined(__i386__) + LOG1(" eax=" << P4::hex(mctxt->mc_eax, 8, '0') << " ebx=" << P4::hex(mctxt->mc_ebx, 8, '0') + << " ecx=" << P4::hex(mctxt->mc_ecx, 8, '0') + << " edx=" << P4::hex(mctxt->mc_edx, 8, '0')); + LOG1(" edi=" << P4::hex(mctxt->mc_edi, 8, '0') << " esi=" << P4::hex(mctxt->mc_esi, 8, '0') + << " ebp=" << P4::hex(mctxt->mc_ebp, 8, '0') + << " esp=" << P4::hex(mctxt->mc_esp, 8, '0')); +#elif defined(__amd64__) + LOG1(" rax=" << P4::hex(mctxt->mc_rax, 16, '0') << " rbx=" << P4::hex(mctxt->mc_rbx, 16, '0') + << " rcx=" << P4::hex(mctxt->mc_rcx, 16, '0')); + LOG1(" rdx=" << P4::hex(mctxt->mc_rdx, 16, '0') << " rdi=" << P4::hex(mctxt->mc_rdi, 16, '0') + << " rsi=" << P4::hex(mctxt->mc_rsi, 16, '0')); + LOG1(" rbp=" << P4::hex(mctxt->mc_rbp, 16, '0') << " rsp=" << P4::hex(mctxt->mc_rsp, 16, '0') + << " r8=" << P4::hex(mctxt->mc_r8, 16, '0')); + LOG1(" r9=" << P4::hex(mctxt->mc_r9, 16, '0') << " r10=" << P4::hex(mctxt->mc_r10, 16, '0') + << " r11=" << P4::hex(mctxt->mc_r11, 16, '0')); + LOG1(" r12=" << P4::hex(mctxt->mc_r12, 16, '0') << " r13=" << P4::hex(mctxt->mc_r13, 16, '0') + << " r14=" << P4::hex(mctxt->mc_r14, 16, '0')); + LOG1(" r15=" << P4::hex(mctxt->mc_r15, 16, '0')); +#else +#warning "unknown machine type" +#endif +} +#endif + +static void crash_shutdown(int sig, siginfo_t *info, void *uctxt) { + if (shutdown_loop++) _exit(-1); + MTONLY(static std::recursive_mutex lock; static int threads_dumped = 0; + static bool killed_all_threads = false; lock.lock(); if (!killed_all_threads) { + killed_all_threads = true; + for (int i = 0; i < 
int(thread_ids.size()); i++) + if (i != my_id - 1) { + pthread_kill(thread_ids[i], SIGABRT); + } + }) + LOG1(MTONLY("Thread #" << my_id << " " <<) "exiting with SIG" << signames[sig] << ", trace:"); + if (sig == SIGILL || sig == SIGFPE || sig == SIGSEGV || sig == SIGBUS || sig == SIGTRAP) + LOG1(" address = " << hex(info->si_addr)); +#if HAVE_UCONTEXT_H + dumpregs(&(static_cast(uctxt)->uc_mcontext)); +#else + (void)uctxt; // Suppress unused parameter warning. +#endif +#if HAVE_EXECINFO_H + if (LOGGING(1)) { + static void *buffer[64]; + int size = backtrace(buffer, 64); + char **strings = backtrace_symbols(buffer, size); + for (int i = 1; i < size; i++) { + if (strings) LOG1(" " << strings[i]); + if (const char *line = addr2line(buffer[i], strings ? strings[i] : 0)) + LOG1(" " << line); + } + if (size < 1) LOG1("backtrace failed"); + free(strings); + } +#endif + MTONLY( + if (++threads_dumped < int(thread_ids.size())) { + lock.unlock(); + pthread_exit(0); + } else { lock.unlock(); }) + if (sig != SIGABRT) BUG("Exiting with SIG%s", signames[sig]); + _exit(sig + 0x80); +} + +void register_exit_signals() { + struct sigaction sigact; + sigact.sa_sigaction = sigint_shutdown; + sigact.sa_flags = SA_SIGINFO; + sigemptyset(&sigact.sa_mask); + sigaction(SIGHUP, &sigact, 0); + sigaction(SIGINT, &sigact, 0); + sigaction(SIGQUIT, &sigact, 0); + sigaction(SIGTERM, &sigact, 0); + sigact.sa_sigaction = crash_shutdown; + sigaction(SIGILL, &sigact, 0); + sigaction(SIGABRT, &sigact, 0); + sigaction(SIGFPE, &sigact, 0); + sigaction(SIGSEGV, &sigact, 0); + sigaction(SIGBUS, &sigact, 0); + sigaction(SIGTRAP, &sigact, 0); + signal(SIGPIPE, SIG_IGN); +} diff --git a/backends/tofino/bf-asm/data_switchbox.h b/backends/tofino/bf-asm/data_switchbox.h new file mode 100644 index 00000000000..d69d952f899 --- /dev/null +++ b/backends/tofino/bf-asm/data_switchbox.h @@ -0,0 +1,168 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_DATA_SWITCHBOX_H_ +#define BACKENDS_TOFINO_BF_ASM_DATA_SWITCHBOX_H_ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" + +/* + * Code to handle programming of the Ram Data Bus Horizontal/Vertical Switchbox + * see section 6.2.4.4 of the MAU uArch docs + */ + +template +class DataSwitchboxSetup { + REGS ®s; + Table *tbl; + unsigned home_row, home_row_logical, prev_row, top_ram_row, bottom_ram_row; + + public: + unsigned get_home_row() { return home_row; } + unsigned get_home_row_logical() { return home_row_logical; } + DataSwitchboxSetup(REGS ®s, Table *t, int home = -1, int next_home = -1) + : regs(regs), tbl(t) { + if (home >= 0) + top_ram_row = prev_row = home_row = home / 2U; + else + top_ram_row = prev_row = home_row = tbl->layout[0].row / 2U; + bottom_ram_row = tbl->layout.back().row / 2U; + if (next_home >= 0) { + for (auto it = tbl->layout.rbegin(); it != tbl->layout.rend(); ++it) { + if (it->row > next_home) { + bottom_ram_row = it->row / 2U; + break; + } + } + } + + // Counter ALU's are on even rows on right side of RAM array. Set + // home_row to the correct ALU + if (tbl->table_type() == Table::COUNTER) + prev_row = home_row = prev_row % 2 ? prev_row + 1 : prev_row; + // Stateful/Selection/Meter ALU's are on odd rows on right side of RAM + // array. 
Set home_row to the correct ALU + else if (tbl->table_type() == Table::STATEFUL || tbl->table_type() == Table::SELECTION || + tbl->table_type() == Table::METER) + prev_row = home_row = prev_row % 2 ? prev_row : prev_row + 1; + home_row_logical = home_row * 2 + 1; + } + /** + * Responsible for the data hv switch box per row, as well as the fabric_ctl. At a high + * level, the fabric ctl is an optimized version of the fabric_ctl in order to manage + * some of the timing issues. + * + * Operates under the assumption that all rows in the layout are numerically highest to lowest. + * Information has to flow up to the home row, and flow down to the lowest row. Should not + * flow above the homerow and below the lowest row + */ + void setup_row(unsigned row) { + auto &map_alu = regs.rams.map_alu; + auto &swbox = regs.rams.array.switchbox.row; + auto &map_alu_row = map_alu.row[row]; + int side = 1; // always -- currently no maprams on left side + auto &syn2port_ctl = map_alu_row.i2portctl.synth2port_fabric_ctl[0][side]; + map_alu_row.i2portctl.synth2port_ctl.synth2port_enable = 1; + while (prev_row != row) { + auto &prev_syn2port_ctl = map_alu.row[prev_row].i2portctl.synth2port_fabric_ctl[0]; + if (prev_row == home_row) { + swbox[prev_row].ctl.r_stats_alu_o_mux_select.r_stats_alu_o_sel_oflo_rd_b_i = 1; + swbox[prev_row].ctl.b_oflo_wr_o_mux_select.b_oflo_wr_o_sel_stats_wr_r_i = 1; + prev_syn2port_ctl[side].stats_to_vbus_below = 1; + } else { + // If a row is in the middle of possible rows, must program the switchbox + // to have data pass through the bottom of the switch box to the top of + // the switchbox + swbox[prev_row].ctl.t_oflo_rd_o_mux_select.t_oflo_rd_o_sel_oflo_rd_b_i = 1; + swbox[prev_row].ctl.b_oflo_wr_o_mux_select.b_oflo_wr_o_sel_oflo_wr_t_i = 1; + // below2above only means that there is no synth2port RAMs on this row, but + // the signal needs to pass between the rows + prev_syn2port_ctl[side].synth2port_connect_below2above = 1; + /* need to also program left 
side below2above connections + * see ram_bus_path.py:254 -- 'Mike F.' comment */ + prev_syn2port_ctl[0].synth2port_connect_below2above = 1; + prev_syn2port_ctl[side].oflo_to_vbus_below = 1; + } + auto &next_syn2port_ctl = + map_alu.row[prev_row - 1].i2portctl.synth2port_fabric_ctl[0][side]; + // From RTL, it only appears that oflo_to_vbus_below/above should be programmed + // when RAMs appear on the RAM line, but the model asserts if these are not enabled. + // Keeping this, as it is what is DV'ed against + next_syn2port_ctl.oflo_to_vbus_above = 1; + prev_row--; + } + // FIXME: Should this be top_ram_row? + if (row == home_row) { + swbox[row].ctl.r_stats_alu_o_mux_select.r_stats_alu_o_sel_stats_rd_r_i = 1; + } else { + // The oflo signal of this row must go through the overflow bus + swbox[row].ctl.t_oflo_rd_o_mux_select.t_oflo_rd_o_sel_oflo_rd_r_i = 1; + swbox[row].ctl.r_oflo_wr_o_mux_select = 1; + syn2port_ctl.synth2port_connect_above = 1; + } + + if (row != bottom_ram_row) { + // To determine whether data flows back down. Doesn't flow down on the lowest row + syn2port_ctl.synth2port_connect_below = 1; + } + } + void setup_row_col(unsigned row, unsigned col, int vpn) { + int side = col >= 6; + unsigned logical_col = col % 6U; + auto &ram = regs.rams.array.row[row].ram[col]; + auto &map_alu = regs.rams.map_alu; + auto &map_alu_row = map_alu.row[prev_row]; + auto &unitram_config = map_alu_row.adrmux.unitram_config[side][logical_col]; + unitram_config.unitram_type = tbl->unitram_type(); + unitram_config.unitram_logical_table = tbl->logical_id; + if (!options.match_compiler) // FIXME -- compiler doesn't set this? 
+ unitram_config.unitram_vpn = vpn; + if (tbl->gress == INGRESS || tbl->gress == GHOST) + unitram_config.unitram_ingress = 1; + else + unitram_config.unitram_egress = 1; + unitram_config.unitram_enable = 1; + + auto &ram_address_mux_ctl = map_alu_row.adrmux.ram_address_mux_ctl[side][logical_col]; + ram_address_mux_ctl.ram_unitram_adr_mux_select = UnitRam::AdrMux::STATS_METERS; + if (row == home_row) { + ram.unit_ram_ctl.match_ram_write_data_mux_select = UnitRam::DataMux::STATISTICS; + ram.unit_ram_ctl.match_ram_read_data_mux_select = UnitRam::DataMux::STATISTICS; + if (tbl->adr_mux_select_stats()) + ram_address_mux_ctl.ram_stats_meter_adr_mux_select_stats = 1; + else + ram_address_mux_ctl.ram_stats_meter_adr_mux_select_meter = 1; + ram_address_mux_ctl.ram_ofo_stats_mux_select_statsmeter = 1; + ram_address_mux_ctl.synth2port_radr_mux_select_home_row = 1; + } else { + ram.unit_ram_ctl.match_ram_write_data_mux_select = UnitRam::DataMux::OVERFLOW; + ram.unit_ram_ctl.match_ram_read_data_mux_select = UnitRam::DataMux::OVERFLOW; + ram_address_mux_ctl.ram_oflo_adr_mux_select_oflo = 1; + ram_address_mux_ctl.ram_ofo_stats_mux_select_oflo = 1; + ram_address_mux_ctl.synth2port_radr_mux_select_oflo = 1; + } + ram_address_mux_ctl.map_ram_wadr_mux_select = MapRam::Mux::SYNTHETIC_TWO_PORT; + ram_address_mux_ctl.map_ram_wadr_mux_enable = 1; + ram_address_mux_ctl.map_ram_radr_mux_select_smoflo = 1; + int syn2port_bus = prev_row == top_ram_row ? 
0 : 1; + auto &syn2port_members = map_alu_row.i2portctl.synth2port_hbus_members[syn2port_bus][side]; + syn2port_members |= 1U << logical_col; + } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_DATA_SWITCHBOX_H_ */ diff --git a/backends/tofino/bf-asm/deparser.cpp b/backends/tofino/bf-asm/deparser.cpp new file mode 100644 index 00000000000..922b07baf11 --- /dev/null +++ b/backends/tofino/bf-asm/deparser.cpp @@ -0,0 +1,819 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "deparser.h" + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/target.h" +#include "constants.h" +#include "lib/range.h" +#include "misc.h" +#include "parser-tofino-jbay.h" +#include "phv.h" +#include "top_level.h" +#include "ubits.h" + +unsigned Deparser::unique_field_list_handle; +Deparser Deparser::singleton_object; + +Deparser::Deparser() : Section("deparser") {} +Deparser::~Deparser() {} + +struct Deparser::FDEntry { + struct Base { + virtual ~Base() {} + virtual void check(bitvec &phv_use) = 0; + virtual unsigned encode() = 0; + virtual unsigned size() = 0; // size in bytes; + virtual void dbprint(std::ostream &) const = 0; + template + bool is() const { + return dynamic_cast(this) != nullptr; + } + template + T *to() { + return dynamic_cast(this); + } + friend std::ostream &operator<<(std::ostream &out, const Base &b) { + b.dbprint(out); + return out; + } + }; + struct Phv : Base { + ::Phv::Ref val; + Phv(gress_t g, const value_t &v) : val(g, DEPARSER_STAGE, v) {} + void check(bitvec &phv_use) override { + if (val.check()) { + phv_use[val->reg.uid] = 1; + if (val->lo != 0 || val->hi != val->reg.size - 1) + error(val.lineno, + "Can only output full phv registers, not slices, " + "in deparser"); + } + } + unsigned encode() override { return val->reg.deparser_id(); } + unsigned size() override { return val->reg.size / 8; } + const ::Phv::Register *reg() { return &val->reg; } + void dbprint(std::ostream &out) const override { out << val.desc(); } + }; + struct Checksum : Base { + gress_t gress; + int unit; + Checksum(gress_t gr, const value_t &v) : gress(gr) { + if (CHECKTYPE(v, tINT)) { + if ((unit = v.i) < 0 || v.i >= Target::DEPARSER_CHECKSUM_UNITS()) + error(v.lineno, "Invalid deparser checksum unit %" PRId64 "", v.i); + } + } + void check(bitvec &phv_use) override {} + template + unsigned encode(); + unsigned encode() override; + unsigned size() override { return 
2; } + void dbprint(std::ostream &out) const override { out << gress << " checksum " << unit; } + }; + struct Constant : Base { + int lineno; + gress_t gress; + int val; + Constant(gress_t g, const value_t &v) : gress(g), val(v.i) { + lineno = v.lineno; + if (v.i < 0 || v.i >> 8) + error(lineno, + "Invalid deparser constant %" PRId64 ", valid constant range is 0-255", v.i); + bool ok = Deparser::add_constant(gress, val); + if (!ok) error(lineno, "Ran out of deparser constants"); + } + void check(bitvec &phv_use) override {} + template + unsigned encode(); + unsigned encode() override; + unsigned size() override { return 1; } + void dbprint(std::ostream &out) const override { out << val; } + }; + struct Clot : Base { + int lineno; + gress_t gress; + std::string tag; + int length = -1; + std::map phv_replace; + std::map csum_replace; + Clot(gress_t gr, const value_t &tag, const value_t &data, ordered_set<::Phv::Ref> &pov) + : lineno(tag.lineno), gress(gr) { + if (CHECKTYPE2(tag, tINT, tSTR)) { + if (tag.type == tSTR) + this->tag = tag.s; + else + this->tag = std::to_string(tag.i); + } + if (data.type == tMAP) { + for (auto &kv : data.map) { + if (kv.key == "pov") { + pov.emplace(gress, DEPARSER_STAGE, kv.value); + } else if (kv.key == "max_length" || kv.key == "length") { + if (length >= 0) error(kv.value.lineno, "Duplicate length"); + if (CHECKTYPE(kv.value, tINT) && ((length = kv.value.i) < 0 || length > 64)) + error(kv.value.lineno, "Invalid clot length"); + } else if (kv.key.type == tINT) { + if (phv_replace.count(kv.key.i) || csum_replace.count(kv.key.i)) + error(kv.value.lineno, "Duplicate value at offset %" PRId64 "", + kv.key.i); + if (kv.value.type == tCMD && kv.value.vec.size == 2 && + kv.value == "full_checksum") + csum_replace.emplace(kv.key.i, Checksum(gress, kv.value.vec[1])); + else + phv_replace.emplace(kv.key.i, + ::Phv::Ref(gress, DEPARSER_STAGE, kv.value)); + } else { + error(kv.value.lineno, "Unknown key for clot: %s", value_desc(kv.key)); + } + } 
+ } else { + pov.emplace(gress, DEPARSER_STAGE, data); + } + if (pov.size() > Target::DEPARSER_MAX_POV_PER_USE()) + error(data.lineno, "Too many POV bits for CLOT"); + } + void check(bitvec &phv_use) override { + if (length < 0) length = Parser::clot_maxlen(gress, tag); + if (length < 0) error(lineno, "No length for clot %s", tag.c_str()); + if (Parser::clot_tag(gress, tag) < 0) error(lineno, "No tag for clot %s", tag.c_str()); + unsigned next = 0; + ::Phv::Ref *prev = nullptr; + for (auto &r : phv_replace) { + if (r.first < next) { + error(r.second.lineno, "Overlapping phvs in clot"); + error(prev->lineno, "%s and %s", prev->name(), r.second.name()); + } + if (r.second.check()) { + phv_use[r.second->reg.uid] = 1; + if (r.second->lo != 0 || r.second->hi != r.second->reg.size - 1) + error(r.second.lineno, + "Can only output full phv registers, not slices," + " in deparser"); + next = r.first + r.second->reg.size / 8U; + prev = &r.second; + } + } + } + unsigned size() override { return length; } + unsigned encode() override { + BUG(); + return -1; + } + void dbprint(std::ostream &out) const override { + out << "clot " << tag; + if (length > 0) out << " [len " << length << "]"; + } + }; + + int lineno; + std::unique_ptr what; + ordered_set<::Phv::Ref> pov; + FDEntry(gress_t gress, const value_t &v, const value_t &p) { + lineno = v.lineno; + if (v.type == tCMD && v.vec.size == 2 && v == "clot") { + what.reset(new Clot(gress, v.vec[1], p, pov)); + } else if (v.type == tCMD && v.vec.size == 2 && v == "full_checksum") { + what.reset(new Checksum(gress, v.vec[1])); + pov.emplace(gress, DEPARSER_STAGE, p); + } else if (v.type == tINT) { + what.reset(new Constant(gress, v)); + pov.emplace(gress, DEPARSER_STAGE, p); + } else { + what.reset(new Phv(gress, v)); + pov.emplace(gress, DEPARSER_STAGE, p); + } + } + void check(bitvec &phv_use) { what->check(phv_use); } +}; + +struct Deparser::Intrinsic::Type { + target_t target; + gress_t gress; + std::string name; + int max; + 
static std::map all[TARGET_INDEX_LIMIT][2]; + + protected: + Type(target_t t, gress_t gr, const char *n, int m) : target(t), gress(gr), name(n), max(m) { + BUG_CHECK(!all[t][gr].count(name)); + all[target][gress][name] = this; + } + ~Type() { all[target][gress].erase(name); } + + public: +#define VIRTUAL_TARGET_METHODS(TARGET) \ + virtual void setregs(Target::TARGET::deparser_regs ®s, Deparser &deparser, \ + Intrinsic &vals) { \ + BUG_CHECK(!"target mismatch"); \ + } + FOR_ALL_REGISTER_SETS(VIRTUAL_TARGET_METHODS) +#undef VIRTUAL_TARGET_METHODS +}; + +#define DEPARSER_INTRINSIC(TARGET, GR, NAME, MAX) \ + static struct TARGET##INTRIN##GR##NAME : public Deparser::Intrinsic::Type { \ + TARGET##INTRIN##GR##NAME() \ + : Deparser::Intrinsic::Type(Target::TARGET::tag, GR, #NAME, MAX) {} \ + void setregs(Target::TARGET::deparser_regs &, Deparser &, Deparser::Intrinsic &) override; \ + } TARGET##INTRIN##GR##NAME##_singleton; \ + void TARGET##INTRIN##GR##NAME::setregs(Target::TARGET::deparser_regs ®s, \ + Deparser &deparser, Deparser::Intrinsic &intrin) + +std::map + Deparser::Intrinsic::Type::all[TARGET_INDEX_LIMIT][2]; + +Deparser::Digest::Digest(Deparser::Digest::Type *t, int l, VECTOR(pair_t) & data) { + type = t; + lineno = l; + for (auto &l : data) { + if (l.key == "select") { + if (l.value.type == tMAP && l.value.map.size == 1) { + select = Val(t->gress, l.value.map[0].key, l.value.map[0].value); + } else { + select = Val(t->gress, l.value); + } + } else if (t->can_shift && l.key == "shift") { + if (CHECKTYPE(l.value, tINT)) shift = l.value.i; + } else if (l.key == "context_json") { + if (CHECKTYPE(l.value, tMAP)) context_json = toJson(l.value.map); + } else if (!CHECKTYPE(l.key, tINT)) { + continue; + } else if (l.key.i < 0 || l.key.i >= t->count) { + error(l.key.lineno, "%s index %" PRId64 " out of range", t->name.c_str(), l.key.i); + } else if (l.value.type != tVEC) { + layout[l.key.i].emplace_back(t->gress, DEPARSER_STAGE, l.value); + } else { + // TODO : Need an 
empty layout entry if no values are present to + // set the config registers correctly + layout.emplace(l.key.i, std::vector()); + for (auto &v : l.value.vec) layout[l.key.i].emplace_back(t->gress, DEPARSER_STAGE, v); + } + } + if (!select && t->name != "pktgen") error(lineno, "No select key in %s spec", t->name.c_str()); +} + +#define DEPARSER_DIGEST(TARGET, GRESS, NAME, CNT, ...) \ + static struct TARGET##GRESS##NAME##Digest : public Deparser::Digest::Type { \ + TARGET##GRESS##NAME##Digest() \ + : Deparser::Digest::Type(Target::TARGET::tag, GRESS, #NAME, CNT) { \ + __VA_ARGS__ \ + } \ + void setregs(Target::TARGET::deparser_regs &, Deparser &, Deparser::Digest &) override; \ + } TARGET##GRESS##NAME##Digest##_singleton; \ + void TARGET##GRESS##NAME##Digest::setregs(Target::TARGET::deparser_regs ®s, \ + Deparser &deparser, Deparser::Digest &data) + +std::map Deparser::Digest::Type::all[TARGET_INDEX_LIMIT][2]; + +void Deparser::start(int lineno, VECTOR(value_t) args) { + if (args.size == 0) { + this->lineno[INGRESS] = this->lineno[EGRESS] = lineno; + return; + } + if (args.size != 1 || (args[0] != "ingress" && args[0] != "egress")) + error(lineno, "deparser must specify ingress or egress"); + gress_t gress = args[0] == "egress" ? 
EGRESS : INGRESS; + if (!this->lineno[gress]) this->lineno[gress] = lineno; +} + +void Deparser::input(VECTOR(value_t) args, value_t data) { + if (!CHECKTYPE(data, tMAP)) return; + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (args.size > 0) { + if (args[0] == "ingress" && gress != INGRESS) continue; + if (args[0] == "egress" && gress != EGRESS) continue; + } else if (error_count > 0) { + break; + } + for (auto &kv : MapIterChecked(data.map, true)) { + if (kv.key == "dictionary") { + if (kv.value.type == tVEC && kv.value.vec.size == 0) continue; + collapse_list_of_maps(kv.value); + if (!CHECKTYPE(kv.value, tMAP)) continue; + for (auto &ent : kv.value.map) + dictionary[gress].emplace_back(gress, ent.key, ent.value); + } else if (kv.key == "pov") { + if (kv.value.type != tVEC) { + /// The check for correct type is done in Phv::Ref constructor + pov_order[gress].emplace_back(gress, DEPARSER_STAGE, kv.value); + } else { + for (auto &ent : kv.value.vec) + pov_order[gress].emplace_back(gress, DEPARSER_STAGE, ent); + } + } else if (kv.key == "partial_checksum") { + if (kv.key.type != tCMD || kv.key.vec.size != 2 || kv.key[1].type != tINT || + kv.key[1].i < 0 || kv.key[1].i >= Target::DEPARSER_CHECKSUM_UNITS()) { + error(kv.key.lineno, "Invalid deparser checksum unit number"); + } else if (CHECKTYPE2(kv.value, tVEC, tMAP)) { + collapse_list_of_maps(kv.value); + int unit = kv.key[1].i; + if (unit < 0) error(kv.key.lineno, "Invalid checksum unit %d", unit); + for (auto &ent : kv.value.map) { + checksum_entries[gress][unit].emplace_back(gress, ent.key, ent.value); + } + } + } else if (kv.key == "full_checksum") { + if (kv.key.type != tCMD || kv.key.vec.size != 2 || kv.key[1].type != tINT || + kv.key[1].i < 0 || kv.key[1].i >= Target::DEPARSER_CHECKSUM_UNITS()) { + error(kv.key.lineno, "Invalid deparser checksum unit number"); + } else if (CHECKTYPE2(kv.value, tVEC, tMAP)) { + collapse_list_of_maps(kv.value); + int unit = kv.key[1].i; + if (unit < 0) 
error(kv.key.lineno, "Invalid checksum unit %d", unit); + for (auto &ent : kv.value.map) { + if (ent.key == "partial_checksum") { + full_checksum_unit[gress][unit].entries[ent.key[1].i] = + checksum_entries[gress][ent.key[1].i]; + collapse_list_of_maps(ent.value); + for (auto &a : ent.value.map) { + if (a.key == "pov") { + full_checksum_unit[gress][unit].pov[ent.key[1].i] = + ::Phv::Ref(gress, DEPARSER_STAGE, a.value); + } else if (a.key == "invert") { + full_checksum_unit[gress][unit].checksum_unit_invert.insert( + ent.key[1].i); + } + } + } else if (ent.key == "clot") { + collapse_list_of_maps(ent.value); + for (auto &a : ent.value.map) { + if (a.key == "pov") { + full_checksum_unit[gress][unit].clot_entries.emplace_back( + gress, ent.key[1].i, a.value); + } else if (a.key == "invert") { + full_checksum_unit[gress][unit].clot_tag_invert.insert( + a.value.i); + } + } + } else if (ent.key == "zeros_as_ones") { + full_checksum_unit[gress][unit].zeros_as_ones_en = ent.value.i; + } + } + } + } else if (auto *itype = ::get(Intrinsic::Type::all[Target::register_set()][gress], + value_desc(&kv.key))) { + intrinsics.emplace_back(itype, kv.key.lineno); + auto &intrin = intrinsics.back(); + collapse_list_of_maps(kv.value); + if (kv.value.type == tVEC) { + for (auto &val : kv.value.vec) intrin.vals.emplace_back(gress, val); + } else if (kv.value.type == tMAP) { + for (auto &el : kv.value.map) intrin.vals.emplace_back(gress, el.key, el.value); + } else { + intrin.vals.emplace_back(gress, kv.value); + } + } else if (auto *digest = ::get(Digest::Type::all[Target::register_set()][gress], + value_desc(&kv.key))) { + if (CHECKTYPE(kv.value, tMAP)) + digests.emplace_back(digest, kv.value.lineno, kv.value.map); + } else { + error(kv.key.lineno, "Unknown deparser tag %s", value_desc(&kv.key)); + } + } + } +} + +template +static void write_checksum_entry(ENTRIES &entry, unsigned mask, int swap, int id, + const char *name = "entry") { + BUG_CHECK(swap == 0 || swap == 1); + 
BUG_CHECK(mask == 0 || mask & 3); + if (entry.modified()) error(1, "%s appears multiple times in checksum %d", name, id); + entry.swap = swap; + // CSR: The order of operation: data is swapped or not and then zeroed or not + if (swap) mask = (mask & 0x2) >> 1 | (mask & 0x1) << 1; + switch (mask) { + case 0: + entry.zero_m_s_b = 1; + entry.zero_l_s_b = 1; + break; + case 1: + entry.zero_m_s_b = 1; + entry.zero_l_s_b = 0; + break; + case 2: + entry.zero_m_s_b = 0; + entry.zero_l_s_b = 1; + break; + case 3: + entry.zero_m_s_b = 0; + entry.zero_l_s_b = 0; + break; + default: + break; + } +} + +// Used for field dictionary logging and deparser resoureces. +// Using fd entry and pov, a json::map is filled with appropriate field names +void write_field_name_in_json(const Phv::Register *phv, const Phv::Register *pov, int povBit, + json::map &chunk_byte, json::map &fd_entry_chunk_byte, int stageno, + gress_t gress) { + auto povName_ = Phv::get_pov_name(pov->mau_id(), povBit); + std::string povName = povName_; + std::string headerName; + size_t pos = 0; + if ((pos = povName.find("$valid")) != std::string::npos) { + headerName = povName.substr(0, pos); + } + std::string fieldNames; + auto allFields = Phv::aliases(phv, stageno); + for (auto fieldName : allFields) { + if (fieldName.find(headerName) != std::string::npos) fieldNames += (fieldName + ", "); + } + fd_entry_chunk_byte["phv_container"] = phv->uid; + chunk_byte["PHV"] = phv->uid; + chunk_byte["Field"] = fieldNames; + return; +} + +void write_pov_resources_in_json(ordered_map &pov, + json::map &pov_resources) { + unsigned pov_size = 0; + json::vector pov_bits; + // ent will be tuple of (register ref, pov position start) + for (auto const &ent : pov) { + // Go through all the bits + unsigned used_bits = 0; + for (unsigned i = 0; i < ent.first->size; i++) { + json::map pov_bit; + std::string pov_name = Phv::get_pov_name(ent.first->uid, i); + // Check if this POV bit is used + if (pov_name.compare(" ") != 0) { + 
pov_bit["pov_bit"] = ent.second + i; + pov_bit["phv_container"] = ent.first->uid; + pov_bit["phv_container_bit"] = i; + pov_bit["pov_name"] = pov_name; + pov_bits.push_back(std::move(pov_bit)); + used_bits++; + } + } + if (pov_size < (ent.second + used_bits)) pov_size = ent.second + used_bits; + } + pov_resources["size"] = pov_size; + pov_resources["pov_bits"] = std::move(pov_bits); +} + +// Used for field dictionary logging. Using fd entry and pov, a json::map +// is filled with appropriate checksum or constant +void write_csum_const_in_json(int deparserPhvIdx, json::map &chunk_byte, + json::map &fd_entry_chunk_byte, gress_t gress) { + if (options.target == Target::Tofino::tag) { + if (deparserPhvIdx >= CHECKSUM_ENGINE_PHVID_TOFINO_LOW && + deparserPhvIdx <= CHECKSUM_ENGINE_PHVID_TOFINO_HIGH) { + auto csum_id = deparserPhvIdx - CHECKSUM_ENGINE_PHVID_TOFINO_LOW - + (gress * CHECKSUM_ENGINE_PHVID_TOFINO_PER_GRESS); + chunk_byte["Checksum"] = csum_id; + fd_entry_chunk_byte["csum_engine"] = csum_id; + } + } else if (options.target == Target::JBay::tag) { + if (deparserPhvIdx > CONSTANTS_PHVID_JBAY_LOW && + deparserPhvIdx < CONSTANTS_PHVID_JBAY_HIGH) { + chunk_byte["Constant"] = + Deparser::get_constant(gress, deparserPhvIdx - CONSTANTS_PHVID_JBAY_LOW); + fd_entry_chunk_byte["phv_container"] = deparserPhvIdx; + } else { + auto csum_id = deparserPhvIdx - CONSTANTS_PHVID_JBAY_HIGH; + chunk_byte["Checksum"] = csum_id; + fd_entry_chunk_byte["csum_engine"] = csum_id; + } + } + return; +} + +/// Get JSON for deparser resources from digest of deparser table +/// @param tab_digest Digest for the deparser table, nullptr if the table does not exist +/// @return JSON node representation of the table for deparser resources +json::map deparser_table_digest_to_json(Deparser::Digest *tab_digest) { + json::map dep_table; + json::vector table_phv; + + // nullptr means the table is not used, create JSON node for empty table + // and return it + if (tab_digest == nullptr) { + 
dep_table["nTables"] = 0; + dep_table["maxBytes"] = 0; + dep_table["table_phv"] = std::move(table_phv); + return dep_table; + } + + unsigned int max_bytes = 0; + // Prepare tables of the deparser table type + for (auto &set : tab_digest->layout) { + json::map table; + table["table_id"] = set.first; + // TODO: field_list_name? + json::vector bytes; + unsigned byte_n = 0; + for (auto ® : set.second) { + json::map byte; + byte["byte_number"] = byte_n++; + byte["phv_container"] = reg->reg.uid; + bytes.push_back(std::move(byte)); + } + if (byte_n > max_bytes) max_bytes = byte_n; + table["bytes"] = std::move(bytes); + table_phv.push_back(std::move(table)); + } + dep_table["nTables"] = tab_digest->layout.size(); + dep_table["maxBytes"] = max_bytes; + dep_table["index_phv"] = tab_digest->select->reg.uid; + dep_table["table_phv"] = std::move(table_phv); + // Now we have a digest + return dep_table; +} + +/// Create resources_deparser.json with the deparser node +/// for resources.json +/// @param fde_entries_i JSON vector of field dictionary entries from Ingress +/// @param fde_entries_e JSON vector of field dictionary entries from Egress +void Deparser::report_resources_deparser_json(json::vector &fde_entries_i, + json::vector &fde_entries_e) { + json::map resources_deparser_ingress; + json::map resources_deparser_egress; + // Set gress property + resources_deparser_ingress["gress"] = "ingress"; + resources_deparser_egress["gress"] = "egress"; + // Fill out POV resource information for ingress + json::map pov_resources; + write_pov_resources_in_json(pov[INGRESS], pov_resources); + resources_deparser_ingress["pov"] = std::move(pov_resources); + // Fill out POV resoure information for egress + write_pov_resources_in_json(pov[EGRESS], pov_resources); + resources_deparser_egress["pov"] = std::move(pov_resources); + // Fill out field dictionaries + unsigned n_fde_entries = Target::DEPARSER_MAX_FD_ENTRIES(); + resources_deparser_ingress["nFdeEntries"] = n_fde_entries; + 
resources_deparser_ingress["fde_entries"] = std::move(fde_entries_i); + resources_deparser_egress["nFdeEntries"] = n_fde_entries; + resources_deparser_egress["fde_entries"] = std::move(fde_entries_e); + // Fill deparser tables + Digest *learning_table[2] = {nullptr, nullptr}; + Digest *resubmit_table[2] = {nullptr, nullptr}; + Digest *mirror_table[2] = {nullptr, nullptr}; + for (auto &digest : digests) { + // Check if this is egress/ingress + if (digest.type->gress != INGRESS && digest.type->gress != EGRESS) continue; + if (digest.type->name == "learning") + learning_table[digest.type->gress] = &digest; + else if (digest.type->name == "resubmit" || + digest.type->name == "resubmit_preserving_field_list") + resubmit_table[digest.type->gress] = &digest; + else if (digest.type->name == "mirror") + mirror_table[digest.type->gress] = &digest; + } + resources_deparser_ingress["mirror_table"] = + deparser_table_digest_to_json(mirror_table[INGRESS]); + resources_deparser_egress["mirror_table"] = deparser_table_digest_to_json(mirror_table[EGRESS]); + resources_deparser_ingress["resubmit_table"] = + deparser_table_digest_to_json(resubmit_table[INGRESS]); + resources_deparser_egress["resubmit_table"] = + deparser_table_digest_to_json(resubmit_table[EGRESS]); + resources_deparser_ingress["learning_table"] = + deparser_table_digest_to_json(learning_table[INGRESS]); + resources_deparser_egress["learning_table"] = + deparser_table_digest_to_json(learning_table[EGRESS]); + + // Create the main deparser resources node + json::vector resources_deparser; + resources_deparser.push_back(std::move(resources_deparser_ingress)); + resources_deparser.push_back(std::move(resources_deparser_egress)); + // Dump resources to file + auto deparser_json_dump = open_output("logs/resources_deparser.json"); + *deparser_json_dump << &resources_deparser; +} + +#include "jbay/deparser.cpp" // NOLINT(build/include) +#include "tofino/deparser.cpp" // NOLINT(build/include) + +std::vector 
Deparser::merge_csum_entries( + const std::vector &entries, int id) { + std::vector rv; + ordered_map merged_entries; + + for (auto &entry : entries) { + if (entry.is_clot()) { + rv.push_back(entry); + continue; + } + auto name = entry.val.name(); + int hi = entry.val.hibit(); + int lo = entry.val.lobit(); + bool is_hi = hi >= 16; + bool is_lo = lo < 16; + + if (!merged_entries.count(name)) { + auto reg = Phv::reg(name); + auto new_entry(entry); + if (lo != 0 && hi != reg->size - 1) { + new_entry.val = Phv::Ref(*reg, entry.val.gress(), 0, reg->size - 1); + } + merged_entries.emplace(name, new_entry); + } else { + auto &rv_entry = merged_entries[name]; + if (rv_entry.mask & entry.mask) + error(entry.lineno, "bytes within %s appear multiple times in checksum %d", name, + id); + if (is_hi) { + if ((rv_entry.mask & 0xc) && (rv_entry.swap & 2) != (entry.swap & 2)) + error(entry.lineno, "incompatible swap values for %s in checksum %d", name, id); + rv_entry.mask |= entry.mask & 0xc; + rv_entry.swap |= entry.swap & 2; + } + if (is_lo) { + if ((rv_entry.mask & 0x3) && (rv_entry.swap & 1) != (entry.swap & 1)) + error(entry.lineno, "incompatible swap values for %s in checksum %d", name, id); + rv_entry.mask |= entry.mask & 0x3; + rv_entry.swap |= entry.swap & 1; + } + } + } + + for (auto &[_, entry] : merged_entries) rv.push_back(entry); + + return rv; +} + +/* The following uses of specialized templates must be after the specialization... 
*/ +void Deparser::process() { + bitvec pov_use[2]; + for (gress_t gress : Range(INGRESS, EGRESS)) { + for (auto &ent : pov_order[gress]) + if (ent.check()) { + pov_use[gress][ent->reg.uid] = 1; + phv_use[gress][ent->reg.uid] = 1; + } + for (auto &ent : dictionary[gress]) { + ent.check(phv_use[gress]); + for (auto &pov : ent.pov) { + if (!pov.check()) continue; + phv_use[gress][pov->reg.uid] = 1; + if (pov->lo != pov->hi) error(pov.lineno, "POV bits should be single bits"); + if (!pov_use[gress][pov->reg.uid]) { + pov_order[gress].emplace_back(pov->reg, gress); + pov_use[gress][pov->reg.uid] = 1; + } + } + } + for (int i = 0; i < MAX_DEPARSER_CHECKSUM_UNITS; i++) + for (auto &ent : full_checksum_unit[gress][i].entries) { + for (const auto &entry : ent.second) { + if (!entry.check()) error(entry.lineno, "Invalid checksum entry"); + } + ent.second = merge_csum_entries(ent.second, i); + } + } + for (auto &intrin : intrinsics) { + for (auto &el : intrin.vals) { + if (el.check()) phv_use[intrin.type->gress][el->reg.uid] = 1; + for (auto &pov : el.pov) { + if (pov.check()) { + phv_use[intrin.type->gress][pov->reg.uid] = 1; + if (pov->lo != pov->hi) error(pov.lineno, "POV bits should be single bits"); + if (!pov_use[intrin.type->gress][pov->reg.uid]) { + pov_order[intrin.type->gress].emplace_back(pov->reg, intrin.type->gress); + pov_use[intrin.type->gress][pov->reg.uid] = 1; + } + } + } + } + if (intrin.vals.size() > (size_t)intrin.type->max) + error(intrin.lineno, "Too many values for %s", intrin.type->name.c_str()); + } + if (phv_use[INGRESS].intersects(phv_use[EGRESS])) + error(lineno[INGRESS], "Registers used in both ingress and egress in deparser: %s", + Phv::db_regset(phv_use[INGRESS] & phv_use[EGRESS]).c_str()); + for (auto &digest : digests) { + if (digest.select.check()) { + phv_use[digest.type->gress][digest.select->reg.uid] = 1; + if (digest.select->lo > 0 && !digest.type->can_shift) + error(digest.select.lineno, "%s digest selector must be in bottom bits of 
phv", + digest.type->name.c_str()); + } + for (auto &pov : digest.select.pov) { + if (pov.check()) { + phv_use[digest.type->gress][pov->reg.uid] = 1; + if (pov->lo != pov->hi) error(pov.lineno, "POV bits should be single bits"); + if (!pov_use[digest.type->gress][pov->reg.uid]) { + pov_order[digest.type->gress].emplace_back(pov->reg, digest.type->gress); + pov_use[digest.type->gress][pov->reg.uid] = 1; + } + } + } + for (auto &set : digest.layout) + for (auto ® : set.second) + if (reg.check()) phv_use[digest.type->gress][reg->reg.uid] = 1; + } + SWITCH_FOREACH_REGISTER_SET(Target::register_set(), TARGET *t = nullptr; + // process(t); + process((TARGET *)nullptr);) + + if (options.match_compiler || 1) { /* FIXME -- need proper liveness analysis */ + Phv::setuse(INGRESS, phv_use[INGRESS]); + Phv::setuse(EGRESS, phv_use[EGRESS]); + } + for (gress_t gress : Range(INGRESS, EGRESS)) { + int pov_byte = 0, pov_size = 0; + for (auto &ent : pov_order[gress]) + if (pov[gress].count(&ent->reg) == 0) { + pov[gress][&ent->reg] = pov_size; + pov_size += ent->reg.size; + } + if (pov_size > 8 * Target::DEPARSER_MAX_POV_BYTES()) + error(lineno[gress], "Ran out of space in POV in deparser"); + } +} + +/* The following uses of specialized templates must be after the specialization... 
*/ +void Deparser::output(json::map &map) { + SWITCH_FOREACH_TARGET(options.target, auto *regs = new TARGET::deparser_regs; + declare_registers(regs); write_config(*regs); + gen_learn_quanta(*regs, map["learn_quanta"]); return;) + error(__LINE__, "Unsupported target %d", options.target); +} + +/* this is a bit complicated since the output from compiler digest is as follows: + context_json: + 0: [ [ ipv4.ihl, 0, 4, 0], [ ipv4.protocol, 0, 8, 1], [ ipv4.srcAddr, 0, 32, 2], [ + ethernet.srcAddr, 0, 48, 6], [ ethernet.dstAddr, 0, 48, 12], [ ipv4.fragOffset, 0, 13, 18 ], [ + ipv4.identification, 0, 16, 20], [ routing_metadata.learn_meta_1, 0, 20, 22], [ + routing_metadata.learn_meta_4, 0, 10, 26] ] 1: [ [ ipv4.ihl, 0, 4, 0], [ ipv4.identification, 0, + 16, 1], [ ipv4.protocol, 0, 8, 3], [ ipv4.srcAddr, 0, 32, 4], [ ethernet.srcAddr, 0, 48, 8], [ + ethernet.dstAddr, 0, 48, 14], [ ipv4.fragOffset, 0, 13, 20], [ routing_metadata.learn_meta_2, + 0, 24, 22], [ routing_metadata.learn_meta_3, 0, 25, 26] ] name: [ learn_1, learn_2 ] +*/ +template +void Deparser::gen_learn_quanta(REGS ®s, json::vector &learn_quanta) { + for (auto &digest : digests) { + if (digest.type->name != "learning") continue; + BUG_CHECK(digest.context_json); + auto namevec = (*(digest.context_json))["name"]; + auto &names = *(namevec->as_vector()); + auto digentry = digest.context_json->begin(); + // Iterate on names. 
for each name, get the corresponding digest entry and fill in + for (auto &tname : names) { + BUG_CHECK(digentry != digest.context_json->end()); + json::map quanta; + quanta["name"] = (*tname).c_str(); + quanta["lq_cfg_type"] = digentry->first->as_number()->val; + quanta["handle"] = next_handle(); + auto *digfields = digentry->second->as_vector(); + if (digfields) { + auto &digfields_vec = *digfields; + json::vector &fields = quanta["fields"]; + for (auto &tup : digfields_vec) { + auto &one = *(tup->as_vector()); + BUG_CHECK(one.size() == 5); + json::map anon; + anon["field_name"] = (*(one[0])).clone(); + anon["start_byte"] = (*(one[1])).clone(); + anon["field_width"] = (*(one[2])).clone(); + anon["start_bit"] = (*(one[3])).clone(); + anon["phv_offset"] = (*(one[4])).clone(); + fields.push_back(std::move(anon)); + } + } + digentry++; + learn_quanta.push_back(std::move(quanta)); + } + } +} + +unsigned Deparser::FDEntry::Checksum::encode() { + SWITCH_FOREACH_TARGET(options.target, return encode();); + return -1; +} + +unsigned Deparser::FDEntry::Constant::encode() { + SWITCH_FOREACH_TARGET(options.target, return encode();); + return -1; +} + +void Deparser::gtest_clear() { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < MAX_DEPARSER_CHECKSUM_UNITS; j++) checksum_entries[i][j].clear(); + dictionary[i].clear(); + pov_order[i].clear(); + pov[i].clear(); + phv_use[i].clear(); + constants[i].clear(); + } + intrinsics.clear(); + digests.clear(); +} diff --git a/backends/tofino/bf-asm/deparser.h b/backends/tofino/bf-asm/deparser.h new file mode 100644 index 00000000000..c958f3dd428 --- /dev/null +++ b/backends/tofino/bf-asm/deparser.h @@ -0,0 +1,286 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef DEPARSER_H_ +#define DEPARSER_H_ + +#include + +#include + +#include "constants.h" +#include "lib/bitops.h" +#include "lib/ordered_set.h" +#include "phv.h" +#include "sections.h" + +enum { + // limits over all targets + MAX_DEPARSER_CHECKSUM_UNITS = 8, + DEPARSER_STAGE = INT_MAX, // greater than the number of stages +}; + +/** + * \ingroup parde + */ +class Deparser : public Section { + static Deparser singleton_object; + + public: + struct Val { + /* a phv or clot reference with optional associated POV phv reference */ + Phv::Ref val; + int tag = -1; + ordered_set pov; + std::reference_wrapper lineno = val.lineno; + Val() = default; + Val(const Val &) = default; + Val(Val &&) = default; + Val &operator=(Val &&) = default; + virtual ~Val() {} + Val(gress_t gr, const value_t &v) : val(gr, DEPARSER_STAGE, v) {} + Val(gress_t gr, const value_t &v, const value_t &p) : val(gr, DEPARSER_STAGE, v) { + pov.emplace(gr, DEPARSER_STAGE, p); + } + Val(gress_t gr, int tag, const value_t &p) : tag(tag) { + pov.emplace(gr, DEPARSER_STAGE, p); + } + Val &operator=(const Val &a) { + val = a.val; + tag = a.tag; + pov = a.pov; + return *this; + } + explicit operator bool() const { return is_phv() || is_clot(); } + Phv::Slice operator*() const { return *val; } + Phv::Slice operator->() const { return *val; } + bool is_phv() const { return bool(val); } + bool is_clot() const { return tag >= 0; } + virtual bool check() const { + if (is_phv() && is_clot()) { + error(lineno, "Reference cannot be phv and clot at the 
same time"); + return false; + } + if (is_phv()) { + return val.check(); + } else if (is_clot()) { + if (pov.empty()) { + error(lineno, "Clot requires a pov bit"); + return false; + } + } else { + error(lineno, "Unknown val"); + return false; + } + return true; + } + }; + + struct ChecksumVal : public Val { + int mask = 0; + int swap = 0; + ChecksumVal(gress_t gr, const value_t &v, const value_t &m) : Val(gr, v) { + if ((val->lo % 8 != 0) || (val->hi % 8 != 7)) + error(lineno, "Can only do checksums on byte-aligned container slices"); + mask = ((1 << (val->hi + 1) / 8) - 1) ^ ((1 << val->lo / 8) - 1); + + if (CHECKTYPE(m, tMAP)) { + for (auto &kv : m.map) { + if (kv.key == "pov") { + if (!pov.empty()) error(kv.value.lineno, "Duplicate POV"); + pov.emplace_back(gr, DEPARSER_STAGE, kv.value); + } else if (kv.key == "swap" && CHECKTYPE(kv.value, tINT)) { + swap = kv.value.i; + } else { + error(m.lineno, "Unknown key for checksum: %s", value_desc(kv.key)); + } + } + } + } + ChecksumVal(gress_t gr, int tag, const value_t &p) : Val(gr, tag, p) {} + ChecksumVal &operator=(const ChecksumVal &a) { + Val::operator=(a); + mask = a.mask; + swap = a.swap; + return *this; + } + ChecksumVal(const ChecksumVal &a) : Val(a) { + mask = a.mask; + swap = a.swap; + }; + ChecksumVal() : Val() {} + ChecksumVal(ChecksumVal &&) = default; + ChecksumVal &operator=(ChecksumVal &&) = default; + bool check() const override { + if (is_phv()) { + if (mask == 0) error(lineno, "mask is 0 for phv checkum value?"); + if (swap < 0 || swap > 3) error(lineno, "Invalid swap for phv checksum"); + } + return Val::check(); + } + }; + + struct FullChecksumUnit { + std::map> entries; + std::map pov; + std::set checksum_unit_invert; + std::set clot_tag_invert; + std::vector clot_entries; + bool zeros_as_ones_en = false; + }; + + struct FDEntry; + std::vector checksum_entries[2][MAX_DEPARSER_CHECKSUM_UNITS]; + FullChecksumUnit full_checksum_unit[2][MAX_DEPARSER_CHECKSUM_UNITS]; + int lineno[2]; + std::vector 
dictionary[2]; + std::vector pov_order[2]; + ordered_map pov[2]; + bitvec phv_use[2]; + std::set constants[2]; + + struct Intrinsic { + struct Type; + Type *type; + int lineno; + std::vector vals; + Intrinsic(Type *t, int l) : type(t), lineno(l) {} + }; + std::vector intrinsics; + struct Digest { + struct Type { + target_t target; + gress_t gress; + std::string name; + int count; + bool can_shift = false; + static std::map all[TARGET_INDEX_LIMIT][2]; + + protected: + Type(target_t t, gress_t gr, const char *n, int cnt) + : target(t), gress(gr), name(n), count(cnt) { + BUG_CHECK(!all[target][gress].count(name)); + all[target][gress][name] = this; + } + ~Type() { all[target][gress].erase(name); } + + public: +#define VIRTUAL_TARGET_METHODS(TARGET) \ + virtual void setregs(Target::TARGET::deparser_regs ®s, Deparser &deparser, \ + Deparser::Digest &data) { \ + BUG_CHECK(!"target mismatch"); \ + } + FOR_ALL_REGISTER_SETS(VIRTUAL_TARGET_METHODS) +#undef VIRTUAL_TARGET_METHODS + }; + + Type *type; + int lineno; + Val select; + int shift = 0; + std::map> layout; + std::unique_ptr context_json; + Digest(Type *t, int lineno, VECTOR(pair_t) & data); + }; + std::vector digests; + Deparser(); + ~Deparser(); + void start(int lineno, VECTOR(value_t) args); + void input(VECTOR(value_t) args, value_t data); + void process(); + std::vector merge_csum_entries(const std::vector &, int); + template + void process(TARGET *); + void output(json::map &); + template + void gen_learn_quanta(REGS &, json::vector &); + template + void write_config(REGS &); + + static const bitvec &PhvUse(gress_t gr) { return singleton_object.phv_use[gr]; } + + static bool add_constant(gress_t gr, int c) { + if (!singleton_object.constants[gr].count(c)) { + singleton_object.constants[gr].insert(c); + if (int(singleton_object.constants[gr].size()) > Target::DEPARSER_CONSTANTS()) + return false; + } + return true; + } + + static int constant_idx(gress_t gr, int c) { + if (singleton_object.constants[gr].count(c)) 
+ return std::distance(singleton_object.constants[gr].begin(), + singleton_object.constants[gr].find(c)); + return -1; + } + + // @return constant value that will be deparsed + static int get_constant(gress_t gr, int phv_idx) { + int i = 0; + for (auto constant : singleton_object.constants[gr]) { + if ((phv_idx - 224) == i) { + return constant; + } else { + i++; + } + } + return -1; + } + + // Writes POV information in json used for field dictionary logging + // and deparser resources + static void write_pov_in_json(json::map &fd, json::map &fd_entry, const Phv::Register *phv, + int bit, int offset) { + auto povName = Phv::get_pov_name(phv->uid, offset); + // Field dictionary logging + fd["POV PHV"] = phv->uid; + fd["POV Field bit"] = bit; + fd["POV Field Name"] = povName; + // Deparser resources + fd_entry["pov_bit"] = bit; + fd_entry["pov_name"] = povName; + return; + } + + // Digest Handle Setup + // ------------------------------------------------------ + // | Pipe ID | Field Type | Field List Handle | + // 31 ... 
28 24 0 + // Field List Handle = 24 bits + // Field List Type = 4 bits (Field list is 0x9) + // Pipe ID = 4 bits + static unsigned unique_field_list_handle; + static unsigned next_handle() { + return unique_table_offset << PIPE_ID_SHIFT | FIELD_HANDLE_START | + unique_field_list_handle++; + } + + // gtest methods + + /// @brief Get the singleton object for use in gtest + static Deparser *gtest_get_deparser() { return &singleton_object; } + + /// @brief Clear/reset the deparser object + void gtest_clear(); + + private: + // Report deparser resources to JSON file + void report_resources_deparser_json(json::vector &fde_entries_i, json::vector &fde_entries_e); +}; + +#endif /* DEPARSER_H_ */ diff --git a/backends/tofino/bf-asm/depositfield.cpp b/backends/tofino/bf-asm/depositfield.cpp new file mode 100644 index 00000000000..15f2b991bb4 --- /dev/null +++ b/backends/tofino/bf-asm/depositfield.cpp @@ -0,0 +1,39 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "depositfield.h"
+
+namespace DepositField {
+
+RotateConstant discoverRotation(int32_t val, int containerSize, int32_t tooLarge,
+                                int32_t tooSmall) {
+    int32_t containerMask = ~(UINT64_MAX << containerSize);
+    int32_t signBit = 1U << (containerSize - 1);
+    unsigned rotate = 0;
+    for (/*rotate*/; rotate < unsigned(containerSize); ++rotate) {  // unsigned compare: fixes -Wsign-compare
+        if (val > tooSmall && val < tooLarge) break;  // val fits the immediate range: encoding found
+        // Reverse the rotate-right to discover encoding; shift in unsigned domain (left-shifting
+        int32_t rotBit = (val >> (containerSize - 1)) & 1;  // a negative signed value is UB pre-C++20).
+        val = int32_t(((uint32_t(val) << 1) | uint32_t(rotBit)) & uint32_t(containerMask));
+        val |= (val & signBit) ? ~containerMask : 0;
+    }
+    // If a solution has not been found, val is back to where it started.
+    rotate %= containerSize;
+    return RotateConstant{rotate, val};
+}
+
+} // namespace DepositField
diff --git a/backends/tofino/bf-asm/depositfield.h b/backends/tofino/bf-asm/depositfield.h
new file mode 100644
index 00000000000..b7519eb2fcb
--- /dev/null
+++ b/backends/tofino/bf-asm/depositfield.h
@@ -0,0 +1,34 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_DEPOSITFIELD_H_
+#define BACKENDS_TOFINO_BF_ASM_DEPOSITFIELD_H_
+
+#include <cstdint>  // int32_t; guard renamed: _DEPOSITFIELD_H_ is a reserved identifier
+
+namespace DepositField {
+
+struct RotateConstant {
+    unsigned rotate;
+    int32_t value;
+};
+
+RotateConstant discoverRotation(int32_t val, int containerSize, int32_t tooLarge, int32_t tooSmall);
+
+}  // namespace DepositField
+
+#endif /* BACKENDS_TOFINO_BF_ASM_DEPOSITFIELD_H_ */
diff --git a/backends/tofino/bf-asm/disasm.cpp b/backends/tofino/bf-asm/disasm.cpp
new file mode 100644
index 00000000000..0a58816fc47
--- /dev/null
+++ b/backends/tofino/bf-asm/disasm.cpp
@@ -0,0 +1,29 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "disasm.h"
+
+#include <iostream>  // std::cerr
+
+Disasm *Disasm::create(std::string target) {
+#define CREATE_TARGET(TARGET, ...) \
+    if (target == Target::TARGET::name) return new Disasm::TARGET;
+    FOR_ALL_TARGETS(CREATE_TARGET);
+#undef CREATE_TARGET
+    std::cerr << "Unsupported target " << target << std::endl;
+    return nullptr;
+}
diff --git a/backends/tofino/bf-asm/disasm.h b/backends/tofino/bf-asm/disasm.h
new file mode 100644
index 00000000000..812ed4af0b5
--- /dev/null
+++ b/backends/tofino/bf-asm/disasm.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef DISASM_H_
+#define DISASM_H_
+
+#include "backends/tofino/bf-asm/target.h"
+
+class Disasm {  // disassembler base; one concrete subclass per chip target is generated below
+  public:
+    FOR_ALL_TARGETS(DECLARE_TARGET_CLASS)
+    virtual ~Disasm() {}
+    virtual void input_binary(uint64_t addr, char tag, uint32_t *data, size_t len) = 0;  // consume one tagged chunk of binary config data at addr
+    static Disasm *create(std::string target);  // factory keyed by target name; returns nullptr if the name matches no target
+};
+
+#define DECLARE_DISASM_TARGET(TARGET, ...)                                                        \
+    class Disasm::TARGET : public Disasm {                                                        \
+      public:                                                                                     \
+        typedef ::Target::TARGET Target;                                                          \
+        Target::top_level_regs regs;                                                              \
+        TARGET() { declare_registers(&regs); }                                                    \
+        ~TARGET() { undeclare_registers(&regs); }                                                 \
+        TARGET(const TARGET &) = delete;                                                          \
+        __VA_ARGS__                                                                               \
+    };
+
+FOR_ALL_TARGETS(
+    DECLARE_DISASM_TARGET, void input_binary(uint64_t addr, char tag, uint32_t *data, size_t len) {
+        if (tag == 'D') {  // 'D'-tagged chunks go to the memory space; all others to registers
+            regs.mem_top.input_binary(addr, tag, data, len);
+        } else {
+            regs.reg_top.input_binary(addr, tag, data, len);
+        }
+    })
+
+#endif /* DISASM_H_ */
diff --git a/backends/tofino/bf-asm/dynhash.cpp b/backends/tofino/bf-asm/dynhash.cpp
new file mode 100644
index 00000000000..c3457c4bb58
--- /dev/null
+++ b/backends/tofino/bf-asm/dynhash.cpp
@@ -0,0 +1,64 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <fstream>
+#include <memory>
+#include <string>
+
+#include "backends/tofino/bf-asm/json.h"
+#include "bfas.h"
+#include "sections.h"
+
+class DynHash : public Section {  // handles the "dynhash" .bfa section: a JSON file merged into context.json
+    int lineno = -1;  // line of the section value, for diagnostics
+    std::unique_ptr _dynhash = nullptr;  // NOTE(review): template argument lost in extraction (likely a json type) -- restore from upstream
+    std::string _dynhashFileName;
+
+    DynHash() : Section("dynhash") {}  // private ctor: only the singleton below exists
+
+    void input(VECTOR(value_t) args, value_t data) {  // section value must be a string: the JSON file name
+        lineno = data.lineno;
+        if (!CHECKTYPE(data, tSTR)) return;
+        _dynhashFileName = data.s;
+    }
+
+    void process() {  // load the JSON; best-effort -- failures warn rather than error
+        if (_dynhashFileName.empty()) return;
+        std::ifstream inputFile(_dynhashFileName);
+        if (!inputFile && _dynhashFileName[0] != '/')
+            inputFile.open(asmfile_dir + "/" + _dynhashFileName);  // relative paths retried against asmfile_dir
+        if (!inputFile) {
+            warning(lineno, "%s: can't read file", _dynhashFileName.c_str());
+        } else {
+            inputFile >> _dynhash;  // parses the stream as JSON into _dynhash (operator>> from json.h)
+            if (!inputFile) {
+                warning(lineno, "%s: not valid dynhash json representation",
+                        _dynhashFileName.c_str());
+                _dynhash.reset(new json::map());  // fall back to an empty map so output() stays well-formed
+            }
+        }
+    }
+
+    void output(json::map &ctxtJson) {
+        ctxtJson["dynamic_hash_calculations"] = json::vector();  // this key required by schema
+        if (_dynhash) {
+            ctxtJson.merge(_dynhash->to());  // NOTE(review): template argument of to<>() lost in extraction -- restore from upstream
+        }
+    }
+
+    static DynHash singleton_dynhash;
+} DynHash::singleton_dynhash;  // defines the static singleton in the same declaration
diff --git a/backends/tofino/bf-asm/error_mode.cpp b/backends/tofino/bf-asm/error_mode.cpp
new file mode 100644
index 00000000000..ca710fd98a2
--- /dev/null
+++ b/backends/tofino/bf-asm/error_mode.cpp
@@ -0,0 +1,202 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "error_mode.h" + +#include "backends/tofino/bf-asm/stage.h" + +DefaultErrorMode DefaultErrorMode::singleton; + +ErrorMode::mode_t ErrorMode::str2mode(const value_t &v) { + if (CHECKTYPE(v, tSTR)) { + if (v == "propagate") return PROPAGATE; + if (v == "map_to_immediate") return MAP_TO_IMMEDIATE; + if (v == "disable") return DISABLE_ALL_TABLES; + if (v == "propagate_and_map") return PROPAGATE_AND_MAP; + if (v == "propagate_and_disable") return PROPAGATE_AND_DISABLE; + if (v != "no_config") error(v.lineno, "Not a valid error mode: %s", v.s); + } + return NO_CONFIG; +} + +const char *ErrorMode::mode2str(ErrorMode::mode_t m) { + switch (m) { + case NO_CONFIG: + return "no_config"; + case PROPAGATE: + return "propagate"; + case MAP_TO_IMMEDIATE: + return "map_to_immediate"; + case DISABLE_ALL_TABLES: + return "disable"; + case PROPAGATE_AND_MAP: + return "propagate_and_map"; + case PROPAGATE_AND_DISABLE: + return "propagate_and_disable"; + default: + return ""; + } +} + +ErrorMode::type_t ErrorMode::str2type(const value_t &v) { + if (CHECKTYPE(v, tSTR)) { + if (v == "tcam_match") return TCAM_MATCH; + if (v == "tind_ecc") return TIND_ECC; + if (v == "gfm_parity") return GFM_PARITY; + if (v == "emm_ecc") return EMM_ECC; + if (v == "prev_err") return PREV_ERROR; + if (v == "actiondata") return ACTIONDATA_ERROR; + if (v == "imem_parity") return IMEM_PARITY_ERROR; + error(v.lineno, "Not a valid error type: %s", v.s); + } + return TCAM_MATCH; // 
avoid invalid type here, error message has been output already +} + +const char *ErrorMode::type2str(ErrorMode::type_t t) { + switch (t) { + case TCAM_MATCH: + return "tcam_match"; + case TIND_ECC: + return "tind_ecc"; + case GFM_PARITY: + return "gfm_parity"; + case EMM_ECC: + return "emm_ecc"; + case PREV_ERROR: + return "prev_err"; + case ACTIONDATA_ERROR: + return "actiondata"; + case IMEM_PARITY_ERROR: + return "imem_parity"; + default: + return ""; + } +} + +void ErrorMode::input(value_t data) { + if (!CHECKTYPE2(data, tSTR, tMAP)) return; + if (data.type == tSTR) { + mode_t m = str2mode(data); + for (int i = 0; i < NUM_TYPE_T; ++i) { + if (i == LATE_ERROR && m != NO_CONFIG) m = PROPAGATE; + mode[i] = m; + } + } else { + for (auto &kv : MapIterChecked(data.map)) { + type_t t = str2type(kv.key); + mode_t m = str2mode(kv.value); + if (t >= LATE_ERROR && m > PROPAGATE) + error(kv.value.lineno, "%s error mode can only propagate, not %s", type2str(t), + mode2str(m)); + mode[t] = m; + } + } +} + +template +void ErrorMode::write_regs(REGS ®s, const Stage *stage, gress_t gress) { + auto &merge = regs.rams.match.merge; + int tcam_err_delay = stage->tcam_delay(gress) ? 
1 : 0; + int fifo_err_delay = + stage->pipelength(gress) - stage->pred_cycle(gress) - Target::MAU_ERROR_DELAY_ADJUST(); + bool map_to_immed = false; + bool propagate = false; +#define YES(X) X +#define NO(X) +#define HANDLE_ERROR_CASES(REG, HAVE_O_ERR_EN) \ + case NO_CONFIG: \ + break; \ + case PROPAGATE: \ + HAVE_O_ERR_EN(merge.REG[gress].REG##_o_err_en = 1;) \ + propagate = true; \ + break; \ + case PROPAGATE_AND_MAP: \ + HAVE_O_ERR_EN(merge.REG[gress].REG##_o_err_en = 1;) \ + propagate = true; \ + /* fall through */ \ + case MAP_TO_IMMEDIATE: \ + merge.REG[gress].REG##_idata_ovr = 1; \ + map_to_immed = true; \ + break; \ + case PROPAGATE_AND_DISABLE: \ + HAVE_O_ERR_EN(merge.REG[gress].REG##_o_err_en = 1;) \ + propagate = true; \ + /* fall through */ \ + case DISABLE_ALL_TABLES: \ + merge.REG[gress].REG##_dis_pred = 1; \ + break; \ + default: \ + BUG(); + + switch (mode[PREV_ERROR]) { HANDLE_ERROR_CASES(prev_error_ctl, NO) } + merge.prev_error_ctl[gress].prev_error_ctl_delay = tcam_err_delay; + if (propagate) { + switch (stage->stage_dep[gress]) { + case Stage::CONCURRENT: + merge.prev_error_ctl[gress].prev_error_ctl_conc = 1; + break; + case Stage::ACTION_DEP: + merge.prev_error_ctl[gress].prev_error_ctl_action = 1; + break; + case Stage::NONE: + if (stage->stageno == 0) { + // stage 0 does not have stage_dep set, but counts as if it was match + // dependent (on the parser). FIXME -- should just always set stage_dep to + // MATCH_DEP for stage 0? 
fall through + case Stage::MATCH_DEP: + merge.prev_error_ctl[gress].prev_error_ctl_match = 1; + break; + } + [[fallthrough]]; + default: + BUG("unexpected stage_dep: %d", stage->stage_dep[gress]); + } + } + + switch (mode[TCAM_MATCH]) { HANDLE_ERROR_CASES(tcam_match_error_ctl, YES) } + switch (mode[TIND_ECC]) { HANDLE_ERROR_CASES(tind_ecc_error_ctl, YES) } + switch (mode[GFM_PARITY]) { HANDLE_ERROR_CASES(gfm_parity_error_ctl, YES) } + merge.gfm_parity_error_ctl[gress].gfm_parity_error_ctl_delay = tcam_err_delay; + switch (mode[EMM_ECC]) { HANDLE_ERROR_CASES(emm_ecc_error_ctl, YES) } + merge.emm_ecc_error_ctl[gress].emm_ecc_error_ctl_delay = tcam_err_delay; + + if (map_to_immed) { + merge.err_idata_ovr_fifo_ctl[gress].err_idata_ovr_fifo_ctl_en = 1; + merge.err_idata_ovr_fifo_ctl[gress].err_idata_ovr_fifo_ctl_delay = fifo_err_delay - 2; + } + if (propagate) { + merge.o_error_fifo_ctl[gress].o_error_fifo_ctl_en = 1; + merge.o_error_fifo_ctl[gress].o_error_fifo_ctl_delay = fifo_err_delay; + } + + // action error sources can only propagate (too late for disable or map_to_immed + if (mode[ACTIONDATA_ERROR]) merge.actiondata_error_ctl |= 1 << gress; + if (mode[IMEM_PARITY_ERROR]) merge.imem_parity_error_ctl |= 1 << gress; + + /* TODO -- additional error cfg regs: + * rams.match.merge.err_idata_ovr_ctl[gress] + * rams.match.merge.s2p_meter_error_ctl[gress] + * rams.match.merge.s2p_stats_error_ctl[gress] + * rams.map_alu.stats_wrap[alu].stats.statistics.ctl.stats_alu_error_enable; + * rams.map_alu.meter_alu_group_error_ctl[alu] + * rams.array.row[r].actiondata_error_uram_ctl[gress] + * rams.array.row[r].emm_ecc_error_uram_ctl[gress] + * rams.array.row[r].tind_ecc_error_uram_ctl[gress] + */ +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void ErrorMode::write_regs, mau_regs &, + const Stage *, gress_t); diff --git a/backends/tofino/bf-asm/error_mode.h b/backends/tofino/bf-asm/error_mode.h new file mode 100644 index 00000000000..b0d5cb75e0b --- /dev/null +++ 
b/backends/tofino/bf-asm/error_mode.h @@ -0,0 +1,73 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_ERROR_MODE_H_ +#define BACKENDS_TOFINO_BF_ASM_ERROR_MODE_H_ + +#include "sections.h" + +class Stage; + +class ErrorMode { + public: + typedef enum { + NO_CONFIG = 0, + PROPAGATE, + MAP_TO_IMMEDIATE, + DISABLE_ALL_TABLES, + PROPAGATE_AND_MAP, + PROPAGATE_AND_DISABLE + } mode_t; + typedef enum { + TCAM_MATCH, + TIND_ECC, + GFM_PARITY, + EMM_ECC, + PREV_ERROR, + ACTIONDATA_ERROR, + IMEM_PARITY_ERROR, + NUM_TYPE_T, + LATE_ERROR = ACTIONDATA_ERROR, // this (and after) is limited + } type_t; + + mode_t mode[NUM_TYPE_T] = {NO_CONFIG}; + mode_t &operator[](type_t t) { return mode[t]; } + static mode_t str2mode(const value_t &); + static const char *mode2str(mode_t m); + static type_t str2type(const value_t &); + static const char *type2str(type_t t); + + void input(value_t data); + template + void write_regs(REGS &, const Stage *, gress_t); +}; + +class DefaultErrorMode : public Section, public ErrorMode { + DefaultErrorMode() : Section("error_mode") { + // This code sets the default error mode when the assembler is used with an older + // compiler. 
Current compiler should always set or override this in the .bfa file + for (auto &m : mode) m = PROPAGATE_AND_DISABLE; + } + static DefaultErrorMode singleton; + + public: + void input(VECTOR(value_t) args, value_t data) override { ErrorMode::input(data); } + void output(json::map &) override {} + static ErrorMode get() { return singleton; } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_ERROR_MODE_H_ */ diff --git a/backends/tofino/bf-asm/escape.h b/backends/tofino/bf-asm/escape.h new file mode 100644 index 00000000000..22cd45f6ac1 --- /dev/null +++ b/backends/tofino/bf-asm/escape.h @@ -0,0 +1,56 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_ESCAPE_H_ +#define BACKENDS_TOFINO_BF_ASM_ESCAPE_H_ + +#include +#include + +#include "lib/hex.h" + +class escape { + std::string str; + + public: + explicit escape(const std::string &s) : str(s) {} + friend std::ostream &operator<<(std::ostream &os, escape e); +}; + +inline std::ostream &operator<<(std::ostream &os, escape e) { + for (char ch : e.str) { + switch (ch) { + case '\n': + os << "\\n"; + break; + case '\t': + os << "\\t"; + break; + case '\\': + os << "\\\\"; + break; + default: + if (ch < 32 || ch >= 127) + os << "\\x" << hex(ch & 0xff, 2, '0'); + else + os << ch; + } + } + return os; +} + +#endif /* BACKENDS_TOFINO_BF_ASM_ESCAPE_H_ */ diff --git a/backends/tofino/bf-asm/exact_match.cpp b/backends/tofino/bf-asm/exact_match.cpp new file mode 100644 index 00000000000..d715a740b2f --- /dev/null +++ b/backends/tofino/bf-asm/exact_match.cpp @@ -0,0 +1,528 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "tofino/exact_match.h" + +#include "action_bus.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "hashexpr.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" +#include "lib/hex.h" +#include "misc.h" + +void ExactMatchTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::MatchEntry); + for (auto &kv : MapIterChecked(data, {"meter", "stats", "stateful"})) { + if (common_setup(kv, data, P4Table::MatchEntry)) { + // Dynamic key masks are only on exact match tables + } else if (kv.key == "dynamic_key_masks") { + if (CHECKTYPE(kv.value, tSTR)) + dynamic_key_masks = (strncmp(kv.value.s, "true", 4) == 0); + } else if (kv.key == "stash") { + CHECKTYPE(kv.value, tMAP); + for (auto &m : kv.value.map) { + if (m.key == "row") { + if (CHECKTYPE(m.value, tVEC)) { + auto rows = m.value.vec; + for (value_t &r : rows) { + if (CHECKTYPE(r, tINT)) stash_rows.push_back(r.i); + } + } + } + if (m.key == "col") { + if (CHECKTYPE(m.value, tVEC)) { + auto cols = m.value.vec; + for (value_t &c : cols) { + if (CHECKTYPE(c, tINT)) stash_cols.push_back(c.i); + } + } + } + if (m.key == "unit") { + if (CHECKTYPE(m.value, tVEC)) { + auto units = m.value.vec; + for (value_t &u : units) { + if (CHECKTYPE(u, tINT)) stash_units.push_back(u.i); + } + } + } + } + if (stash_rows.size() == 0) { + error(kv.value.lineno, "No 'row' attribute for stash info in exact match table %s", + name()); + return; + } + if (stash_cols.size() == 0) { + error(kv.value.lineno, "No 'col' attribute for stash info in exact match table %s", + name()); + return; + } + if (stash_units.size() == 0) { + error(kv.value.lineno, "No 'unit' attribute for stash info in exact match table %s", + name()); + return; + } + if (stash_units.size() != stash_rows.size()) { + error(kv.value.lineno, + "Stash units not specified correctly for each row entry " + "in exact match 
table %s", + name()); + return; + } + } else if (kv.key == "search_bus" || kv.key == "result_bus") { + // already dealt with in Table::setup_layout via common_init_setup + } else { + common_sram_setup(kv, data); + } + } + common_sram_checks(); +} + +void ExactMatchTable::pass1() { + LOG1("### Exact match table " << name() << " pass1 " << loc()); + SRamMatchTable::pass1(); + // Check if stashes are allocated (only for exact match tables). Note + // stashes are disabled on JBAY + if (stash_rows.size() == 0 && options.target == TOFINO && layout_size() > 0) + error(lineno, "No stashes allocated for exact match table %s in stage %d", name(), + stage->stageno); +} + +/** + * Any bits that are not matched directly against, but appear in the key of the p4 table, + * are ghost bits. The rules for ghost bits on exact match tables are: + * + * 1. Any field that does not appear in the match key must appear in the hash function. This + * is considered a ghost bit + * 2. A hash column can have at most one ghost bit, in order to maintain the linear + * independence of the impact of each ghost bit. + * + * The following function verifies these two properties, and saves them in a map to be output + * in the gen_ghost_bits function call + */ +void ExactMatchTable::determine_ghost_bits() { + std::set> ghost_bits; + // Determine ghost bits by determine what is not in the match + for (auto &p4_param : p4_params_list) { + for (int bit = p4_param.start_bit; bit < p4_param.start_bit + p4_param.bit_width; bit++) { + if (!p4_param.mask.empty() && !p4_param.mask[bit]) continue; // Skip non-masked bits. 
+ bool found = false; + for (auto ms : match) { + std::string field_name = ms->name(); + int field_bit_lo = remove_name_tail_range(field_name) + ms->fieldlobit(); + int field_bit_hi = field_bit_lo + ms->size() - 1; + if (field_name == p4_param.name && field_bit_lo <= bit && field_bit_hi >= bit) { + found = true; + break; + } + } + if (found) continue; + ghost_bits.emplace(p4_param.name, bit); + } + } + + BUG_CHECK(!input_xbar.empty(), "%s does not have an input xbar", name()); + for (const auto &ixb : input_xbar) { + int way_index = 0; + for (auto way : ways) { + bitvec hash_tables; + if (auto *hash_group = ixb->get_hash_group(way.group_xme)) { + hash_tables = bitvec(hash_group->tables); + } else { + for (auto &ht : ixb->get_hash_tables()) { + BUG_CHECK(ht.first.type == InputXbar::HashTable::EXACT); + hash_tables[ht.first.index] = 1; + } + } + + // key is the field name/field bit that is the ghost bit + // value is the bits that the ghost bit appears in within this way + std::map, bitvec> ghost_bit_impact; + + // Check a phv ref against the ghost bits for sanity + auto check_ref = [this, way_index, &ghost_bits, &ghost_bit_impact, &ixb](Phv::Ref &ref, + int hash_bit) { + std::string field_name = ref.name(); + int field_bit = remove_name_tail_range(field_name) + ref.fieldlobit(); + for (int i = 0; i < ref.size(); ++i) { + auto key = std::make_pair(field_name, field_bit + i); + auto ghost_bit_it = ghost_bits.find(key); + if (ghost_bit_it == ghost_bits.end()) continue; + + // This is a check to make sure that the ghost bit appears only once + // in the hash column, as an even number of appearances would + // xor each other out, and cancel the hash out. 
This check + // should be done on all hash bits + if (ghost_bit_impact[key].getbit(hash_bit)) { + error(ixb->lineno, + "Ghost bit %s:%d appears multiple times " + "in the same hash col %d", + key.first.c_str(), key.second, way_index); + return; + } + ghost_bit_impact[key].setbit(hash_bit); + } + }; + + // Calculate the ghost bit per hash way + for (unsigned hash_table_id : hash_tables) { + auto &hash_table = ixb->get_hash_table(hash_table_id); + for (auto hash_bit : way.select_bits()) { + if (hash_table.count(hash_bit) == 0) continue; + const HashCol &hash_col = hash_table.at(hash_bit); + if (hash_col.fn) { + for (auto &ref : hash_col.fn->get_sources(hash_col.bit)) + check_ref(ref, hash_bit); + } else { + for (const auto &input_bit : hash_col.data) + if (auto ref = ixb->get_hashtable_bit(hash_table_id, input_bit)) + check_ref(ref, hash_bit); + } + } + } + + // Verify that each ghost bit appears in the hash function + for (auto gb : ghost_bits) { + if (ghost_bit_impact.find(gb) == ghost_bit_impact.end()) { + error(ixb->lineno, + "Ghost bit %s:%d does not appear on the hash function " + "for way %d", + gb.first.c_str(), gb.second, way_index); + return; + } + } + + // Verify that the ghost bits are linearly independent, that only one ghost bit + // exists per column + bitvec total_use; + for (auto gbi : ghost_bit_impact) { + if (!(total_use & gbi.second).empty()) + error(ixb->lineno, "The ghost bits are not linear independent on way %d", + way_index); + total_use |= gbi.second; + } + + auto &ghost_bit_position = ghost_bit_positions[way.group_xme]; + for (auto gbi : ghost_bit_impact) { + ghost_bit_position[gbi.first] |= gbi.second; + } + way_index++; + } + } +} + +void ExactMatchTable::pass2() { + LOG1("### Exact match table " << name() << " pass2 " << loc()); + // FIXME -- does some of this common stuff belong in SRamMatch::pass2 + if (logical_id < 0) choose_logical_id(); + for (auto &ixb : input_xbar) ixb->pass2(); + setup_word_ixbar_group(); + if (actions) 
actions->pass2(this); + if (action_bus) action_bus->pass2(this); + if (gateway) gateway->pass2(); + if (idletime) idletime->pass2(); + if (format) format->pass2(this); + unsigned usable = -1; + for (auto &ixb : input_xbar) usable &= ixb->exact_physical_ids(); + allocate_physical_ids(usable); + determine_ghost_bits(); + // Derive a stash format from current table format with a single entry (we + // use group 0 entry) and all fields except 'version' and 'action' (match + // overhead). The version bits are set by the driver. + if (format) { + stash_format.reset(new Format(this)); + stash_format->size = MEM_WORD_WIDTH; + stash_format->log2size = ceil_log2(MEM_WORD_WIDTH); + auto group = 0; + for (auto f = format->begin(group); f != format->end(group); f++) { + if (f->first == "action" || f->first == "version") continue; + stash_format->add_field(f->second, f->first, group); + } + } + for (auto &hd : hash_dist) hd.pass2(this); + if (format) verify_format_pass2(); +} + +void ExactMatchTable::pass3() { + LOG1("### Exact match table " << name() << " pass3 " << loc()); + SRamMatchTable::pass3(); + if (action_bus) action_bus->pass3(this); +} + +// Check way_map for each stash row/col pair to determine which word the ram is +// assigned to and verify if it is the match overhead word. Allocate stash +// overhead row for each stash row/col pair. +void ExactMatchTable::generate_stash_overhead_rows() { + auto mem_units_per_word = format ? 
format->get_mem_units_per_table_word() : 1; + for (int i = 0; i < stash_rows.size(); i++) { + auto idx = (i + mem_units_per_word) / mem_units_per_word; + if (stash_overhead_rows.size() >= idx) continue; + auto stash_row = stash_rows[i]; + auto stash_col = stash_cols[i]; + for (auto &row : layout) { + if (row.row == stash_row) { + Ram stash_ram(stash_row, stash_col); + if (way_map.count(stash_ram) > 0) { + auto way_word = way_map[stash_ram].word; + BUG_CHECK(format); + if (way_word == format->overhead_word) { + stash_overhead_rows.push_back(stash_row); + break; + } + } + } + } + } +} + +/* FIXME -- should have ExactMatchTable::write_merge_regs write some of the merge stuff + * from write_regs? */ +template +void ExactMatchTable::write_regs_vt(REGS ®s) { + LOG1("### Exact match table " << name() << " write_regs " << loc()); + SRamMatchTable::write_regs(regs); + + for (auto &row : layout) { + auto &rams_row = regs.rams.array.row[row.row]; + for (auto &ram : row.memunits) { + auto &way = way_map[ram]; + BUG_CHECK(ram.stage == INT_MIN && ram.row == row.row, "bogus %s in row %d", ram.desc(), + row.row); + auto &ram_cfg = rams_row.ram[ram.col]; + ram_cfg.match_nibble_s0q1_enable = version_nibble_mask.getrange(way.word * 32U, 32); + ram_cfg.match_nibble_s1q0_enable = UINT64_C(0xffffffff); + } + } + + // Write stash regs if stashes are allocated + if (stash_rows.size() == 0) return; + auto &merge = regs.rams.match.merge; + auto &stash_hitmap_output_map = merge.stash_hitmap_output_map; + generate_stash_overhead_rows(); + auto mem_units_per_word = format ? format->get_mem_units_per_table_word() : 1; + for (int i = 0; i < stash_rows.size(); i++) { + auto stash_row = stash_rows[i]; + auto stash_col = stash_cols[i]; + auto stash_unit_id = stash_units[i]; + MemUnit stash_memunit(stash_row, stash_col); + auto idx = i / mem_units_per_word; + auto physical_row_with_overhead = + stash_overhead_rows.size() > idx ? 
stash_overhead_rows[idx] : ways[0].rams[0].row; + LOG5("Setting cfg for stash Row: " << stash_row << ", stash Unit: " << stash_unit_id + << " with overhead word row: " + << physical_row_with_overhead); + auto &stash_map_entry = stash_hitmap_output_map[stash_unit_id][stash_row]; + stash_map_entry.enabled_3bit_muxctl_select = physical_row_with_overhead; + stash_map_entry.enabled_3bit_muxctl_enable = 1; + auto &stash_reg = regs.rams.array.row[stash_row].stash; + auto &input_data_ctl = stash_reg.stash_match_input_data_ctl[stash_unit_id]; + input_data_ctl.stash_hash_adr_select = ways[0].index / EXACT_HASH_ADR_BITS; + input_data_ctl.stash_enable = 1; + input_data_ctl.stash_logical_table = logical_id; + input_data_ctl.stash_thread = (gress == EGRESS); + auto &stash_row_nxtable_bus_drive = + merge.stash_row_nxtable_bus_drive[stash_unit_id][stash_row]; + for (auto &row : layout) { + if (row.row != stash_row) continue; + if (contains(row.memunits, stash_memunit)) { + // Assumption is that the search or match and result buses are + // always generated on the same index + auto &stash_match_mask = stash_reg.stash_match_mask[stash_unit_id]; + if (stash_row == physical_row_with_overhead) { + // FIXME -- the overhead row should always have a result bus allocated, but + // sometimes it does not. This hack has been here for awhile and is needed + // for p4_16/compile_only/meters_0.p4 at least, but seems wrong and unsafe + int result_bus = row.bus.count(Layout::RESULT_BUS) + ? 
row.bus.at(Layout::RESULT_BUS) + : row.bus.at(Layout::SEARCH_BUS); + stash_row_nxtable_bus_drive = 1 << result_bus; + stash_reg.stash_match_result_bus_select[stash_unit_id] = 1 << result_bus; + + // Set default next table only when there is a single next table + auto &nxt_table_lut = merge.stash_next_table_lut[stash_unit_id][stash_row]; + std::set nxt_tables; + for (auto &n : hit_next) { + for (auto &n1 : n) { + nxt_tables.emplace(n1); + } + } + if (nxt_tables.size() == 0) { + nxt_table_lut = Stage::end_of_pipe(); + } else if (nxt_tables.size() == 1) { + nxt_table_lut = miss_next.next_table_id(); + } else { + nxt_table_lut = 0; + } + + // 2 entries per stash unit + nxt_table_lut |= (nxt_table_lut << 8); + + bitvec match_mask; + match_mask.setrange(0, 128); + // Since stash format can only have one entry (and no version bits) we + // generate the stash mask on exact match format with group 0 + if (Format::Field *match = format->field("match", 0)) { + for (auto &piece : match->bits) + match_mask.clrrange(piece.lo, piece.hi + 1 - piece.lo); + } + for (int word = 0; word < 4; word++) { + stash_match_mask[word] = match_mask.getrange(word * 32, 32); + } + } else { + stash_row_nxtable_bus_drive = 0; + stash_reg.stash_match_result_bus_select[stash_unit_id] = 0; + for (int word = 0; word < 4; word++) { + stash_match_mask[word] = 0; + } + } + input_data_ctl.stash_match_data_select = row.bus.at(Layout::SEARCH_BUS); + input_data_ctl.stash_hashbank_select = row.bus.at(Layout::SEARCH_BUS); + break; + } + } + } +} + +void ExactMatchTable::gen_tbl_cfg(json::vector &out) const { + LOG3("### Exact match table " << name() << " gen_tbl_cfg " << loc()); + unsigned size = get_number_entries(); + json::map &tbl = *base_tbl_cfg(out, "match", size); + add_all_reference_tables(tbl); + json::map &stage_tbl = *add_common_sram_tbl_cfgs(tbl, "exact", "hash_match"); + add_pack_format(stage_tbl, format.get(), true, false); + stage_tbl["memory_resource_allocation"] = nullptr; + if 
(stash_rows.size() > 0) { + json::map &stash_allocation = stage_tbl["stash_allocation"] = json::map(); + // Add 'action' field if present + if (format && stash_format) { + int group = 0; + for (auto f = format->begin(group); f != format->end(group); f++) { + if (f->first == "action") stash_format->add_field(f->second, f->first, group); + } + } + add_pack_format(stash_allocation, stash_format.get(), false, true); + auto mem_units_per_word = format ? format->get_mem_units_per_table_word() : 1; + auto &stash_pack_formats = stash_allocation["pack_format"]->to(); + for (auto &stash_pack_format : stash_pack_formats) { + json::map &pack = stash_pack_format->to(); + pack["number_memory_units_per_table_word"] = mem_units_per_word; + pack["table_word_width"] = MEM_WORD_WIDTH * mem_units_per_word; + } + auto num_stash_entries = stash_rows.size() / mem_units_per_word * 2; + stash_allocation["num_stash_entries"] = num_stash_entries; + json::vector &stash_entries = stash_allocation["stash_entries"] = json::vector(); + for (int k = 0; k < stash_rows.size() / mem_units_per_word; k++) { + for (int i = 0; i < 2; i++) { + json::vector stash_entry; + for (int j = 0; j < mem_units_per_word; j++) { + auto stash_row = stash_rows[k * mem_units_per_word + j]; + auto stash_col = stash_cols[k * mem_units_per_word + j]; + auto stash_unit = stash_units[k * mem_units_per_word + j]; + MemUnit stash_memunit(stash_row, stash_col); + json::map stash_entry_per_unit; + stash_entry_per_unit["stash_entry_id"] = (4 * stash_row) + (2 * stash_unit) + i; + for (auto &row : layout) { + if (row.row != stash_row) continue; + if (contains(row.memunits, stash_memunit)) { + int bus = row.bus.at(Layout::SEARCH_BUS); + stash_entry_per_unit["stash_match_data_select"] = bus; + stash_entry_per_unit["stash_hashbank_select"] = bus; + stash_entry_per_unit["hash_function_id"] = k; + break; + } + } + stash_entry.push_back(std::move(stash_entry_per_unit)); + } + stash_entries.push_back(std::move(stash_entry)); + } + } + } 
else { + stage_tbl["stash_allocation"] = nullptr; + } + json::map &match_attributes = tbl["match_attributes"]; + match_attributes["uses_dynamic_key_masks"] = dynamic_key_masks; + if (ways.size() > 0) { + json::vector &way_stage_tables = stage_tbl["ways"] = json::vector(); + unsigned way_number = 0; + for (auto &way : ways) { + json::map way_tbl; + way_tbl["stage_number"] = stage->stageno; + way_tbl["way_number"] = way_number++; + way_tbl["stage_table_type"] = "hash_way"; + auto fmt_width = get_format_width(); + BUG_CHECK(fmt_width); + unsigned ram_depth = way.rams.at(0).isLamb() ? LAMB_DEPTH : SRAM_DEPTH; + way_tbl["size"] = way.rams.size() / fmt_width * format->groups() * ram_depth; + add_pack_format(way_tbl, format.get(), false); + way_tbl["memory_resource_allocation"] = gen_memory_resource_allocation_tbl_cfg(way); + way_stage_tables.push_back(std::move(way_tbl)); + } + } + if (size == 0) { + if (!match_attributes.count("match_type")) + match_attributes["match_type"] = "match_with_no_key"; + if (!stage_tbl["stage_table_type"]) stage_tbl["stage_table_type"] = "match_with_no_key"; + stage_tbl["size"] = 1; + } + if (stage_tbl["stage_table_type"] == "hash_match") { + // hash_match table schema requires 'hash_functions' and 'ways' so add (empty) if + // they are not present + if (!stage_tbl["hash_functions"]) stage_tbl["hash_functions"] = json::vector(); + if (!stage_tbl["ways"]) stage_tbl["ways"] = json::vector(); + } +} + +/** + * The ghost_bits information is required by the driver to correctly run an entry read from + * hardware. Ghost bits are bits that do not appear in the key, and must be calculated + * from the hash matrix. + * + * The ghost_bits information is broken into two vectors: + * + * - ghost_bit_info: a vector of information on ghost bits, maps of 2 fields + * 1. field_name - name of the field being ghosted + * 2. 
bit_in_match_spec - awfully named for the field bit (not the bit in the entire key) + * + * - ghost_bit_to_hash_bit: a vector per each entry in the ghost_bit_info describing which + * hash bits coordinate to which ghost bits + */ +void ExactMatchTable::gen_ghost_bits(int hash_function_number, + json::vector &ghost_bits_to_hash_bits, + json::vector &ghost_bits_info) const { + if (ghost_bit_positions.count(hash_function_number) == 0) return; + auto ghost_bit_pos = ghost_bit_positions.at(hash_function_number); + + for (auto kv : ghost_bit_pos) { + json::map ghost_bit_info; + auto field_name = kv.first.first; + auto global_name = field_name; + auto p4_param = find_p4_param(field_name); + if (p4_param && !p4_param->key_name.empty()) field_name = p4_param->key_name; + ghost_bit_info["field_name"] = field_name; + ghost_bit_info["global_name"] = global_name; + ghost_bit_info["bit_in_match_spec"] = kv.first.second; + ghost_bits_info.push_back(std::move(ghost_bit_info)); + + json::vector ghost_bit_to_hash_bits; + for (auto hash_bit : kv.second) ghost_bit_to_hash_bits.push_back(hash_bit); + ghost_bits_to_hash_bits.push_back(std::move(ghost_bit_to_hash_bits)); + } +} + +DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(ExactMatchTable, TARGET_CLASS) diff --git a/backends/tofino/bf-asm/exename.cpp b/backends/tofino/bf-asm/exename.cpp new file mode 100644 index 00000000000..5fe2e6a162c --- /dev/null +++ b/backends/tofino/bf-asm/exename.cpp @@ -0,0 +1,69 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "exename.h" + +#include +#include +#include +#include + +#include +#include + +#include "bfas.h" + +template +static void convertToAbsPath(const char *const relPath, char (&output)[N]) { + output[0] = '\0'; // Default to the empty string, indicating failure. + + char cwd[PATH_MAX]; + if (!getcwd(cwd, sizeof(cwd))) return; + const size_t cwdLen = strlen(cwd); + if (cwdLen == 0) return; + const char *separator = cwd[cwdLen - 1] == '/' ? "" : "/"; + + // Construct an absolute path. We're assuming that @relPath is relative to + // the current working directory. + int n = snprintf(output, N, "%s%s%s", cwd, separator, relPath); + BUG_CHECK(n >= 0, "Pathname too long"); +} + +const char *exename(const char *argv0) { + static char buffer[PATH_MAX]; + if (buffer[0]) return buffer; // done already + int len; + /* find the path of the executable. We use a number of techniques that may fail + * or work on different systems, and take the first working one we find. 
Fall + * back to not overriding the compiled-in installation path */ + if ((len = readlink("/proc/self/exe", buffer, sizeof(buffer) - 1)) > 0 || + (len = readlink("/proc/curproc/exe", buffer, sizeof(buffer) - 1)) > 0 || + (len = readlink("/proc/curproc/file", buffer, sizeof(buffer) - 1)) > 0 || + (len = readlink("/proc/self/path/a.out", buffer, sizeof(buffer) - 1)) > 0) { + buffer[len] = 0; + } else if (argv0 && argv0[0] == '/') { + snprintf(buffer, sizeof(buffer), "%s", argv0); + } else if (argv0 && strchr(argv0, '/')) { + convertToAbsPath(argv0, buffer); + } else if (getenv("_")) { + strncpy(buffer, getenv("_"), sizeof(buffer)); + buffer[sizeof(buffer) - 1] = 0; + } else { + buffer[0] = 0; + } + return buffer; +} diff --git a/backends/tofino/bf-asm/exename.h b/backends/tofino/bf-asm/exename.h new file mode 100644 index 00000000000..4e2523b5daf --- /dev/null +++ b/backends/tofino/bf-asm/exename.h @@ -0,0 +1,25 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_EXENAME_H_ +#define BACKENDS_TOFINO_BF_ASM_EXENAME_H_ + +/** Attempt to determine the executable name and return a static path to it. 
Will use + * argv0 if provided and nothing better can be found */ +const char *exename(const char *argv0 = nullptr); + +#endif /* BACKENDS_TOFINO_BF_ASM_EXENAME_H_ */ diff --git a/backends/tofino/bf-asm/fdstream.cpp b/backends/tofino/bf-asm/fdstream.cpp new file mode 100644 index 00000000000..0e49aee298b --- /dev/null +++ b/backends/tofino/bf-asm/fdstream.cpp @@ -0,0 +1,85 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fdstream.h" + +#include + +#include + +#define BUFSIZE 1024 + +fdstream::buffer_t::int_type fdstream::buffer_t::underflow() { + if (!gptr()) { + char_type *n = new char_type[BUFSIZE]; + setg(n, n, n); + } else if (gptr() != egptr()) { + size_t len = egptr() - gptr(); + if (len > 0) std::memmove(eback(), gptr(), len * sizeof(char_type)); + setg(eback(), eback(), eback() + len); + } else { + setg(eback(), eback(), eback()); + } + int rv = ::read(fd, egptr(), eback() + BUFSIZE - egptr()); + if (rv > 0) + setg(eback(), eback(), egptr() + rv); + else if (gptr() == egptr()) + return traits_type::eof(); + return traits_type::to_int_type(*gptr()); +} + +fdstream::buffer_t::int_type fdstream::buffer_t::overflow(fdstream::buffer_t::int_type c) { + if (!pptr()) { + char_type *n = new char_type[BUFSIZE]; + setp(n, n + BUFSIZE); + } + if (pptr() != pbase()) { + int rv = ::write(fd, pbase(), pptr() - pbase()); + if (rv <= 0) return 
traits_type::eof(); + if (pbase() + rv == pptr()) + setp(pbase(), epptr()); + else { + size_t len = pptr() - pbase() + rv; + std::memmove(pbase(), pbase() + rv, len); + setp(pbase(), epptr()); + pbump(len); + } + } + if (!traits_type::eq_int_type(c, traits_type::eof())) { + *pptr() = c; + pbump(1); + return c; + } else { + return traits_type::not_eof(c); + } +} + +int fdstream::buffer_t::sync() { + char *p = pbase(), *e = pptr(); + while (p != e) { + int rv = ::write(fd, p, e - p); + if (rv <= 0) { + if (p != pbase()) std::memmove(pbase(), p, e - p); + setp(pbase(), epptr()); + pbump(e - p); + return -1; + } + p += rv; + } + setp(pbase(), epptr()); + return 0; +} diff --git a/backends/tofino/bf-asm/fdstream.h b/backends/tofino/bf-asm/fdstream.h new file mode 100644 index 00000000000..8cd4fb96775 --- /dev/null +++ b/backends/tofino/bf-asm/fdstream.h @@ -0,0 +1,61 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_FDSTREAM_H_ +#define BACKENDS_TOFINO_BF_ASM_FDSTREAM_H_ + +#include +#include + +#include +#include +#include + +class fdstream : public std::iostream { + struct buffer_t : public std::basic_streambuf { + int fd; + + public: + explicit buffer_t(int _fd) : fd(_fd) {} + ~buffer_t() { + delete[] eback(); + delete[] pbase(); + } + int sync(); + int_type underflow(); + int_type overflow(int_type c = traits_type::eof()); + void reset() { + setg(eback(), eback(), eback()); + setp(pbase(), epptr()); + } + } buffer; + std::function closefn; + + public: + explicit fdstream(int fd = -1) : std::iostream(&buffer), buffer(fd) { init(&buffer); } + ~fdstream() { + if (closefn) closefn(); + } + void connect(int fd) { + flush(); + buffer.reset(); + buffer.fd = fd; + } + void setclose(std::function fn) { closefn = fn; } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_FDSTREAM_H_ */ diff --git a/backends/tofino/bf-asm/flexible_headers.cpp b/backends/tofino/bf-asm/flexible_headers.cpp new file mode 100644 index 00000000000..df3489a6a51 --- /dev/null +++ b/backends/tofino/bf-asm/flexible_headers.cpp @@ -0,0 +1,48 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "backends/tofino/bf-asm/sections.h" + +namespace BFASM { + +// Singleton class representing the assembler flexible_headers +class FlexibleHeaders : public Section { + private: + std::unique_ptr flexHeaders; + + FlexibleHeaders() : Section("flexible_headers") {} + + void input(VECTOR(value_t) args, value_t data) { + if (!CHECKTYPE(data, tVEC)) return; + flexHeaders = std::move(toJson(data.vec)); + } + + void output(json::map &ctxtJson) { + if (flexHeaders != nullptr) ctxtJson["flexible_headers"] = std::move(flexHeaders); + } + + public: + // disable any other constructors + FlexibleHeaders(FlexibleHeaders const &) = delete; + void operator=(FlexibleHeaders const &) = delete; + + static FlexibleHeaders singleton_flexHeaders; +} FlexibleHeaders::singleton_flexHeaders; + +}; // namespace BFASM diff --git a/backends/tofino/bf-asm/gateway.cpp b/backends/tofino/bf-asm/gateway.cpp new file mode 100644 index 00000000000..48d8b74b20f --- /dev/null +++ b/backends/tofino/bf-asm/gateway.cpp @@ -0,0 +1,918 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/hashexpr.h" +#include "backends/tofino/bf-asm/input_xbar.h" +#include "backends/tofino/bf-asm/instruction.h" +#include "backends/tofino/bf-asm/misc.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "lib/algorithm.h" +#include "lib/hex.h" + +// template specialization declarations +#include "backends/tofino/bf-asm/jbay/gateway.h" +#include "backends/tofino/bf-asm/tofino/gateway.h" + +static struct { + unsigned units, bits, half_shift, mask, half_mask; +} range_match_info[] = {{0, 0, 0, 0, 0}, {6, 4, 2, 0xf, 0x3}, {3, 8, 8, 0xffff, 0xff}}; + +// Dummy value used to start gateway handles. For future use by driver, +// Incremented from inside the gateway table +static uint gateway_handle = 0x70000000; + +GatewayTable::Match::Match(value_t *v, value_t &data, range_match_t range_match) { + if (range_match) { + for (unsigned i = 0; i < range_match_info[range_match].units; i++) + range[i] = range_match_info[range_match].mask; + } + if (v) { + lineno = v->lineno; + if (v->type == tVEC) { + int last = v->vec.size - 1; + if (last > static_cast(range_match_info[range_match].units)) + error(lineno, "Too many set values for range match"); + for (int i = 0; i < last; i++) + if (CHECKTYPE((*v)[last - i - 1], tINT)) { + if ((unsigned)(*v)[last - i - 1].i > range_match_info[range_match].mask) + error(lineno, "range match set too large"); + range[i] = (*v)[last - i - 1].i; + } + v = &(*v)[last]; + } + if (v->type == tINT) { + val.word1 = bitvec(v->i); + val.word0.setrange(0, 64); + val.word0 -= val.word1; + } else if (v->type == tBIGINT) { + val.word1.setraw(v->bigi.data, v->bigi.size); + val.word0.setrange(0, v->bigi.size * 64); + val.word0 -= val.word1; + } else if (v->type == tMATCH) { + val = v->m; + } else if (v->type == tBIGMATCH) { + val = v->bigm; + } + } + if (data == "run_table") { + run_table = true; + } else if (data.type == tSTR || 
data.type == tVEC) { + next = data; + } else if (data.type == tMAP) { + for (auto &kv : MapIterChecked(data.map)) { + if (kv.key == "next") { + next = kv.value; + } else if (kv.key == "run_table") { + if (kv.value == "true") + run_table = true; + else if (kv.value == "false") + run_table = false; + else + error(kv.value.lineno, "Syntax error, expecting boolean"); + } else if (kv.key == "action") { + if (CHECKTYPE(kv.value, tSTR)) action = kv.value.s; + } else { + error(kv.key.lineno, "Syntax error, expecting gateway action description"); + } + } + if (run_table && next.set()) + error(data.lineno, "Can't run table and override next in the same gateway row"); + } else { + error(data.lineno, "Syntax error, expecting gateway action description"); + } +} + +void GatewayTable::setup(VECTOR(pair_t) & data) { + setup_logical_id(); + if (auto *v = get(data, "range")) { + if (CHECKTYPE(*v, tINT)) { + if (v->i == 2) range_match = DC_2BIT; + if (v->i == 4) + range_match = DC_4BIT; + else + error(v->lineno, "Unknown range match size %" PRId64 " bits", v->i); + } + } + for (auto &kv : MapIterChecked(data, true)) { + if (kv.key == "name") { + if (CHECKTYPE(kv.value, tSTR)) gateway_name = kv.value.s; + } else if (kv.key == "row") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (kv.value.i < 0 || kv.value.i > Target::GATEWAY_ROWS()) + error(kv.value.lineno, "row %" PRId64 " out of range", kv.value.i); + if (layout.empty()) layout.resize(1); + layout[0].row = kv.value.i; + layout[0].lineno = kv.value.lineno; + } else if (kv.key == "bus") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (kv.value.i < 0 || kv.value.i > 1) + error(kv.value.lineno, "bus %" PRId64 " out of range", kv.value.i); + if (layout.empty()) layout.resize(1); + layout[0].bus[Layout::SEARCH_BUS] = kv.value.i; + if (layout[0].lineno < 0) layout[0].lineno = kv.value.lineno; + } else if (kv.key == "payload_row") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (kv.value.i < 0 || kv.value.i > 7) + 
error(kv.value.lineno, "row %" PRId64 " out of range", kv.value.i); + if (layout.size() < 2) layout.resize(2); + layout[1].row = kv.value.i; + layout[1].lineno = kv.value.lineno; + } else if (kv.key == "payload_bus") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (kv.value.i < 0 || kv.value.i > 3) + error(kv.value.lineno, "bus %" PRId64 " out of range", kv.value.i); + if (layout.size() < 2) layout.resize(2); + layout[1].bus[Layout::RESULT_BUS] = kv.value.i; + if (layout[1].lineno < 0) layout[1].lineno = kv.value.lineno; + } else if (kv.key == "payload_unit") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (kv.value.i < 0 || kv.value.i > 1) + error(kv.value.lineno, "payload unit %" PRId64 " out of range", kv.value.i); + payload_unit = kv.value.i; + } else if (kv.key == "gateway_unit" || kv.key == "unit") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (kv.value.i < 0 || kv.value.i > 1) + error(kv.value.lineno, "gateway unit %" PRId64 " out of range", kv.value.i); + gw_unit = kv.value.i; + } else if (kv.key == "input_xbar") { + if (CHECKTYPE(kv.value, tMAP)) + input_xbar.emplace_back(InputXbar::create(this, false, kv.key, kv.value.map)); + } else if (kv.key == "format") { + if (CHECKTYPEPM(kv.value, tMAP, kv.value.map.size > 0, "non-empty map")) + format.reset(new Format(this, kv.value.map)); + } else if (kv.key == "always_run") { + if ((always_run = get_bool(kv.value)) && !Target::SUPPORT_ALWAYS_RUN()) + error(kv.key.lineno, "always_run not supported on %s", Target::name()); + } else if (kv.key == "miss") { + miss = Match(0, kv.value, range_match); + } else if (kv.key == "condition") { + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &v : kv.value.map) { + if (v.key == "expression" && CHECKTYPE(v.value, tSTR)) + gateway_cond = v.value.s; + else if (v.key == "true") + cond_true = Match(0, v.value, range_match); + else if (v.key == "false") + cond_false = Match(0, v.value, range_match); + } + } + } else if (kv.key == "payload") { + if (CHECKTYPE2(kv.value, 
tINT, tBIGINT)) payload = get_int64(kv.value); + /* FIXME -- should also be able to specify payload as () */ + have_payload = kv.key.lineno; + } else if (kv.key == "payload_map") { + if (kv.value.type == tVEC) { + if (kv.value.vec.size > Target::GATEWAY_PAYLOAD_GROUPS()) + error(kv.value.lineno, "payload_map too large (limit %d)", + Target::GATEWAY_PAYLOAD_GROUPS()); + for (auto &v : kv.value.vec) { + if (v == "_") + payload_map.push_back(-1); + else if (CHECKTYPE(v, tINT)) + payload_map.push_back(v.i); + } + } + } else if (kv.key == "match_address") { + if (CHECKTYPE(kv.value, tINT)) match_address = kv.value.i; + } else if (kv.key == "match") { + if (kv.value.type == tVEC) { + for (auto &v : kv.value.vec) match.emplace_back(gress, stage->stageno, v); + } else if (kv.value.type == tMAP) { + for (auto &v : kv.value.map) { + if (CHECKTYPE(v.key, tINT)) { + if (v.value.type == tCMD && v.value.vec.size == 2 && + v.value.vec[0] == "$valid") { + match.emplace_back(v.key.i, gress, stage->stageno, v.value.vec[1], + true); + } else { + match.emplace_back(v.key.i, gress, stage->stageno, v.value); + } + } + } + } else { + match.emplace_back(gress, stage->stageno, kv.value); + } + } else if (kv.key == "range") { + /* done above, to be before match parsing */ + } else if (kv.key == "xor") { + if (kv.value.type == tVEC) { + for (auto &v : kv.value.vec) xor_match.emplace_back(gress, stage->stageno, v); + } else if (kv.value.type == tMAP) { + for (auto &v : kv.value.map) + if (CHECKTYPE(v.key, tINT)) + xor_match.emplace_back(v.key.i, gress, stage->stageno, v.value); + } else { + xor_match.emplace_back(gress, stage->stageno, kv.value); + } + } else if (kv.key == "long_branch" && Target::LONG_BRANCH_TAGS() > 0) { + if (options.disable_long_branch) error(kv.key.lineno, "long branches disabled"); + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &lb : kv.value.map) { + if (lb.key.type != tINT || lb.key.i < 0 || + lb.key.i >= Target::LONG_BRANCH_TAGS()) + error(lb.key.lineno, "Invalid 
long branch tag %s", value_desc(lb.key)); + else if (long_branch.count(lb.key.i)) + error(lb.key.lineno, "Duplicate long branch tag %" PRIi64, lb.key.i); + else + long_branch.emplace(lb.key.i, lb.value); + } + } + } else if (kv.key == "context_json") { + setup_context_json(kv.value); + } else if (kv.key.type == tINT || kv.key.type == tBIGINT || kv.key.type == tMATCH || + (kv.key.type == tVEC && range_match != NONE)) { + table.emplace_back(&kv.key, kv.value, range_match); + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } +} + +bool GatewayTable::check_match_key(MatchKey &key, const std::vector &vec, bool is_xor) { + if (!key.val.check()) return false; + if (key.val->reg.mau_id() < 0) + error(key.val.lineno, "%s not accessable in mau", key.val->reg.name); + if (key.offset >= 0) { + for (auto &okey : vec) { + if (&okey == &key) break; + if (key.offset < okey.offset + static_cast(okey.val->size()) && + okey.offset < key.offset + static_cast(key.val->size())) + error(key.val.lineno, + "Gateway %s key at offset %d overlaps previous " + "value at offset %d", + is_xor ? 
"xor" : "match", key.offset, okey.offset); + } + } else if (&key == &vec[0]) { + key.offset = 0; + } else { + auto *prev = &key - 1; + key.offset = prev->offset + prev->val->size(); + } + return true; +} + +void GatewayTable::verify_format() { + if (format->log2size > 6) + error(format->lineno, "Gateway payload format too large (max 64 bits)"); + format->log2size = 6; + format->pass1(this); + if (format->groups() > Target::GATEWAY_PAYLOAD_GROUPS()) + error(format->lineno, "Too many groups for gateway payload"); + if (payload_map.empty()) { + if (format->groups() == 1) { + payload_map.push_back(0); + } else { + payload_map = std::vector(Target::GATEWAY_PAYLOAD_GROUPS(), -1); + int i = Target::GATEWAY_PAYLOAD_GROUPS() - 2; + int grp = 0; + for (auto &row : table) { + if (!row.run_table && i >= 0) { + if (grp >= format->groups() && format->groups() > 1) { + error(format->lineno, "Not enough groups in format for payload"); + grp = 0; + } + payload_map[i--] = grp++; + } + } + if (!miss.run_table) payload_map.back() = format->groups() - 1; + } + } + for (auto pme : payload_map) { + if (pme < -1 || pme >= int(format->groups())) + error(format->lineno, "Invalid format group %d in payload_map", pme); + } + if (match_table) { + if (match_table->table_type() == TERNARY) { + if (format->groups() > 1) + error(format->lineno, + "Can't have mulitple payload format groups when attached " + "to a ternary table"); + } else if (!match_table->format) { + // ok + } else if (auto *srm = match_table->to()) { + int groups = std::min(format->groups(), match_table->format->groups()); + bool err = false; + for (auto &field : *format) { + if (auto match_field = match_table->format->field(field.first)) { + int match_group = -1; + for (auto gw_group : payload_map) { + ++match_group; + if (gw_group < 0) continue; + int em_group = match_group; + if (!srm->word_info.empty()) { + if (match_group < srm->word_info[0].size()) + em_group = srm->word_info[0][match_group]; + else + em_group = -1; + } + 
if (em_group < 0) continue; + if (field.second.by_group[gw_group]->bits != + match_field->by_group[em_group]->bits) { + if (!err) { + error(format->lineno, + "Gateway format inconsistent with table " + "%s it is attached to", + match_table->name()); + error(match_table->format->lineno, "field %s inconsistent", + field.first.c_str()); + err = true; + break; + } + } + } + } else { + if (!err) + error(format->lineno, + "Gateway format inconsistent with table %s it is " + "attached to", + match_table->name()); + error(match_table->format->lineno, "No field %s in match table format", + field.first.c_str()); + err = true; + } + } + } + } else if (layout.size() > 1) { + if (!layout[1].bus.count(Layout::RESULT_BUS)) { + error(layout[1].lineno, "No result bus for gateway payload"); + } else { + int result_bus = layout[1].bus.at(Layout::RESULT_BUS); + if (result_bus > 3) + error(layout[1].lineno, "Invalid bus %d for gateway payload", result_bus); + if ((result_bus & 2) && format->groups() > 1) + error(format->lineno, + "Can't have mulitple payload format groups when using " + "ternary indirect bus"); + } + } +} + +void GatewayTable::pass1() { + LOG1("### Gateway table " << name() << " pass1 " << loc()); + if (!match_table) { + // needs to happen before Actions::pass1, but will have been called from the + // match table if this gateway is attached to one. + setup_map_indexing(this); + } + Table::pass1(); +#if 0 + // redundant with (and supercedes) choose_logical_id in pass2. 
That function is much + // better, taking dependencies into account, so logical_id should not be allocated here + alloc_id("logical", logical_id, stage->pass1_logical_id, + LOGICAL_TABLES_PER_STAGE, true, stage->logical_id_use); +#endif + if (always_run && match_table) + error(lineno, "always_run set on non-standalone gateway for %s", match_table->name()); + if (gw_unit >= 0) { + if (auto *old = stage->gw_unit_use[layout[0].row][gw_unit]) + error(layout[0].lineno, "gateway %d.%d already in use by table %s", layout[0].row, + gw_unit, old->name()); + else + stage->gw_unit_use[layout[0].row][gw_unit] = this; + } + for (auto &ixb : input_xbar) { + ixb->pass1(); + if (Target::GATEWAY_SINGLE_XBAR_GROUP() && ixb->match_group() < 0) + error(ixb->lineno, "Gateway match keys must be in a single ixbar group"); + } + for (auto &k : match) + if (!check_match_key(k, match, false)) break; + for (auto &k : xor_match) + if (!check_match_key(k, xor_match, true)) break; + std::sort(match.begin(), match.end()); + std::sort(xor_match.begin(), xor_match.end()); + if (table.size() > 4) error(lineno, "Gateway can only have 4 match entries max"); + for (auto &line : table) check_next(line.next); + check_next(miss.next); + check_next(cond_false.next); + check_next(cond_true.next); + if (format) verify_format(); + + if (error_count > 0) return; + /* FIXME -- the rest of this function is a hack -- sometimes the compiler wants to + * generate matches just covering the bits it names in the match and other times it wants + * to create the whole tcam value. Need to fix the asm syntax to be sensible and fix the + * compiler's output. + * Part of the issue is that in tofino1/2 we copy the word0/word1 bits directly to + * the tcam, so we need to treat unspecified bits as don't care. Another part is that + * integer constants used as matches get padded with 0 out to a mulitple of 64 bits, + * and those should also be don't care where they don't get matched. 
+ */ + bitvec ignore(0, Target::GATEWAY_MATCH_BITS()); + int shift = -1; + int maxbit = 0; + for (auto &r : match) { + if (range_match && r.offset >= 32) { + continue; + } + ignore.clrrange(r.offset, r.val->size()); + if (shift < 0 || shift > r.offset) shift = r.offset; + if (maxbit < r.offset + r.val->size()) maxbit = r.offset + r.val->size(); + } + if (shift < 0) shift = 0; + LOG3("shift=" << shift << " ignore=0x" << ignore); + for (auto &line : table) { + bitvec matching = (line.val.word0 ^ line.val.word1) << shift; + matching -= (line.val.word0 << shift) - bitvec(0, maxbit); // ignore leading 0s + if (matching & ignore) + warning(line.lineno, "Trying to match on bits not in match of gateway"); + line.val.word0 = (line.val.word0 << shift) | ignore; + line.val.word1 = (line.val.word1 << shift) | ignore; + } +} + +int GatewayTable::find_next_lut_entry(Table *tbl, const Match &match) { + int rv = 0; + for (auto &e : tbl->hit_next) { + if (e == match.next) return rv; + ++rv; + } + for (auto &e : tbl->extra_next_lut) { + if (e == match.next) return rv; + ++rv; + } + tbl->extra_next_lut.push_back(match.next); + if (rv == Target::NEXT_TABLE_SUCCESSOR_TABLE_DEPTH()) + error(tbl->lineno, "Too many next table map entries in table %s", tbl->name()); + return rv; +} + +void GatewayTable::pass2() { + LOG1("### Gateway table " << name() << " pass2 " << loc()); + if (logical_id < 0) { + if (match_table) + logical_id = match_table->logical_id; + else + choose_logical_id(); + } + for (auto &ixb : input_xbar) ixb->pass2(); + need_next_map_lut = miss.next.need_next_map_lut(); + for (auto &e : table) need_next_map_lut |= e.next.need_next_map_lut(); + if (need_next_map_lut) { + Table *tbl = match_table; + if (!tbl) tbl = this; + for (auto &e : table) + if (!e.run_table && e.next_map_lut < 0) e.next_map_lut = find_next_lut_entry(tbl, e); + if (!miss.run_table && miss.next_map_lut < 0) + miss.next_map_lut = find_next_lut_entry(tbl, miss); + } +} + +void GatewayTable::pass3() { + 
LOG1("### Gateway table " << name() << " pass3 " << loc()); + if (match_table) + physical_ids = match_table->physical_ids; + else + allocate_physical_ids(); +} + +static unsigned match_input_use(const std::vector &match) { + unsigned rv = 0; + for (auto &r : match) { + unsigned lo = r.offset; + unsigned hi = lo + r.val->size() - 1; + if (lo < 32) { + rv |= (((UINT32_C(1) << (hi / 8 - lo / 8 + 1)) - 1) << lo / 8) & 0xf; + lo = 32; + } + if (lo <= hi) rv |= ((UINT32_C(1) << (hi - lo + 1)) - 1) << (lo - 24); + } + return rv; +} + +/* caluclate match_bus byte use (8 bytes/bits) + hash output use (12 bits) */ +unsigned GatewayTable::input_use() const { + unsigned rv = match_input_use(match) | match_input_use(xor_match); + if (!xor_match.empty()) rv |= (rv & 0xf) << 4; + return rv; +} + +bool GatewayTable::is_branch() const { + for (auto &line : table) + if (line.next.next_table() != nullptr) return true; + if (!miss.run_table && miss.next.next_table() != nullptr) return true; + return false; +} + +/* FIXME -- how to deal with (or even specify) matches in the upper 24 bits coming from + * the hash bus? Currently we assume that the input_xbar is declared to set up the + * hash signals correctly so that we can just match them. Should at least check it + * somewhere, somehow. We do some checking in check_match_key above, but is that enough? 
+ */ +template +static bool setup_vh_xbar(REGS ®s, Table *table, Table::Layout &row, int base, + std::vector &match, int group) { + auto &rams_row = regs.rams.array.row[row.row]; + auto &byteswizzle_ctl = + rams_row.exactmatch_row_vh_xbar_byteswizzle_ctl[row.bus.at(Table::Layout::SEARCH_BUS)]; + for (auto &r : match) { + if (r.offset >= 32) break; /* skip hash matches */ + for (int bit = 0; bit < r.val->size(); ++bit) { + int ibyte = table->find_on_ixbar(*Phv::Ref(r.val, bit, bit), group); + if (ibyte < 0) { + error(r.val.lineno, "Can't find %s(%d) on ixbar", r.val.desc().c_str(), bit); + return false; + } + unsigned byte = base + (r.offset + bit) / 8; + byteswizzle_ctl[byte][(r.val->lo + bit) & 7] = 0x10 + ibyte; + } + } + return true; +} + +template +void enable_gateway_payload_exact_shift_ovr(REGS ®s, int bus) { + regs.rams.match.merge.gateway_payload_exact_shift_ovr[bus / 8] |= 1U << bus % 8; +} + +template +void GatewayTable::payload_write_regs(REGS ®s, int row, int type, int bus) { + auto &merge = regs.rams.match.merge; + auto &xbar_ctl = merge.gateway_to_pbus_xbar_ctl[row * 2 + bus]; + if (type) { + xbar_ctl.tind_logical_select = logical_id; + xbar_ctl.tind_inhibit_enable = 1; + } else { + xbar_ctl.exact_logical_select = logical_id; + xbar_ctl.exact_inhibit_enable = 1; + } + if (have_payload >= 0 || match_address >= 0) { + BUG_CHECK(payload_unit == bus); + if (type) + merge.gateway_payload_tind_pbus[row] |= 1 << bus; + else + merge.gateway_payload_exact_pbus[row] |= 1 << bus; + } + if (have_payload >= 0) { + merge.gateway_payload_data[row][bus][0][type] = payload & 0xffffffff; + merge.gateway_payload_data[row][bus][1][type] = payload >> 32; + merge.gateway_payload_data[row][bus][0][type ^ 1] = payload & 0xffffffff; + merge.gateway_payload_data[row][bus][1][type ^ 1] = payload >> 32; + } + if (match_address >= 0) { + merge.gateway_payload_match_adr[row][bus][type] = match_address; + merge.gateway_payload_match_adr[row][bus][type ^ 1] = match_address; + } else 
if (options.target == TOFINO) { + // For Tofino A0, there is a bug in snapshot that cannot distinguish if a + // gateway is inhibiting a table To work around this, configure the + // gateway_payload_match_adr to an invalid value. Add a command line flag + // if this is only a tofino A0 issue?. + merge.gateway_payload_match_adr[row][bus][type] = 0x7ffff; + merge.gateway_payload_match_adr[row][bus][type ^ 1] = 0x7ffff; + } + + int groups = format ? format->groups() : 1; + if (groups > 1 || payload_map.size() > 1) { + BUG_CHECK(type == 0); // only supported on exact result busses + enable_gateway_payload_exact_shift_ovr(regs, row * 2 + bus); + } + + int tcam_shift = 0; + if (type != 0 && format) { + auto match_table = get_match_table(); + if (match_table) { + auto ternary_table = match_table->to(); + if (ternary_table && ternary_table->has_indirect()) { + tcam_shift = format->log2size - 2; + } + } + } + + if (format) { + if (auto *attached = get_attached()) { + for (auto &st : attached->stats) { + if (type == 0) { + for (unsigned i = 0; i < payload_map.size(); ++i) { + auto grp = payload_map.at(i); + if (grp < 0) continue; + merge.mau_stats_adr_exact_shiftcount[row * 2 + bus][i] = + st->determine_shiftcount(st, grp, 0, 0); + } + } else { + merge.mau_stats_adr_tcam_shiftcount[row * 2 + bus] = + st->determine_shiftcount(st, 0, 0, tcam_shift); + } + break; + } + + for (auto &m : attached->meters) { + if (type == 0) { + for (unsigned i = 0; i < payload_map.size(); ++i) { + auto grp = payload_map.at(i); + if (grp < 0) continue; + m->to()->setup_exact_shift(regs, row * 2 + bus, grp, 0, i, m, + attached->meter_color); + } + } else { + m->to()->setup_tcam_shift(regs, row * 2 + bus, tcam_shift, m, + attached->meter_color); + } + break; + } + for (auto &s : attached->statefuls) { + if (type == 0) { + for (unsigned i = 0; i < payload_map.size(); ++i) { + auto grp = payload_map.at(i); + if (grp < 0) continue; + merge.mau_meter_adr_exact_shiftcount[row * 2 + bus][i] = + 
s->determine_shiftcount(s, grp, 0, 0); + } + } else { + merge.mau_meter_adr_tcam_shiftcount[row * 2 + bus] = + s->determine_shiftcount(s, 0, 0, tcam_shift); + } + break; + } + } + } + + if (match_table && match_table->instruction) { + if (auto field = match_table->instruction.args[0].field()) { + if (type == 0) { + for (unsigned i = 0; i < payload_map.size(); ++i) { + auto grp = payload_map.at(i); + if (grp < 0) continue; + merge.mau_action_instruction_adr_exact_shiftcount[row * 2 + bus][i] = + field->by_group[grp]->bit(0); + } + } else { + merge.mau_action_instruction_adr_tcam_shiftcount[row * 2 + bus] = + field->bit(0) + tcam_shift; + } + } + } else if (auto *action = format ? format->field("action") : nullptr) { + if (type == 0) { + for (unsigned i = 0; i < payload_map.size(); ++i) { + auto grp = payload_map.at(i); + if (grp < 0) continue; + merge.mau_action_instruction_adr_exact_shiftcount[row * 2 + bus][i] = + action->by_group[grp]->bit(0); + } + } else { + merge.mau_action_instruction_adr_tcam_shiftcount[row * 2 + bus] = + action->bit(0) + tcam_shift; + } + } + + if (format && format->immed) { + if (type == 0) { + for (unsigned i = 0; i < payload_map.size(); ++i) { + auto grp = payload_map.at(i); + if (grp < 0) continue; + merge.mau_immediate_data_exact_shiftcount[row * 2 + bus][i] = + format->immed->by_group[grp]->bit(0); + } + } else { + merge.mau_immediate_data_tcam_shiftcount[row * 2 + bus] = + format->immed->bit(0) + tcam_shift; + } + // FIXME -- may be redundant witehr writing this for the match table, + // but should always be consistent + merge.mau_immediate_data_mask[type][row * 2 + bus] = bitMask(format->immed_size); + merge.mau_payload_shifter_enable[type][row * 2 + bus].immediate_data_payload_shifter_en = 1; + } + + if (type) { + merge.tind_bus_prop[row * 2 + bus].tcam_piped = 1; + merge.tind_bus_prop[row * 2 + bus].thread = gress; + merge.tind_bus_prop[row * 2 + bus].enabled = 1; + } else { + merge.exact_match_phys_result_en[row / 4U] |= 1U << 
(row % 4U * 2 + bus); + merge.exact_match_phys_result_thread[row / 4U] |= gress << (row % 4U * 2 + bus); + if (stage->tcam_delay(gress)) + merge.exact_match_phys_result_delay[row / 4U] |= 1U << (row % 4U * 2 + bus); + } +} + +template +void GatewayTable::standalone_write_regs(REGS ®s) {} + +template +void GatewayTable::write_regs_vt(REGS ®s) { + LOG1("### Gateway table " << name() << " write_regs " << loc()); + auto &row = layout[0]; + for (auto &ixb : input_xbar) { + // FIXME -- if there's no ixbar in the gateway, we should look for a group with + // all the match/xor values across all the exact match groups in the stage and use + // that. + ixb->write_regs(regs); + if (!setup_vh_xbar(regs, this, row, 0, match, ixb->match_group()) || + !setup_vh_xbar(regs, this, row, 4, xor_match, ixb->match_group())) + return; + } + auto &row_reg = regs.rams.array.row[row.row]; + auto &gw_reg = row_reg.gateway_table[gw_unit]; + auto &merge = regs.rams.match.merge; + int search_bus = row.bus.at(Layout::SEARCH_BUS); + if (search_bus == 0) { + gw_reg.gateway_table_ctl.gateway_table_input_data0_select = 1; + gw_reg.gateway_table_ctl.gateway_table_input_hash0_select = 1; + } else { + BUG_CHECK(search_bus == 1); + gw_reg.gateway_table_ctl.gateway_table_input_data1_select = 1; + gw_reg.gateway_table_ctl.gateway_table_input_hash1_select = 1; + } + for (auto &ixb : input_xbar) { + if (ixb->hash_group() >= 0) + setup_muxctl(row_reg.vh_adr_xbar.exactmatch_row_hashadr_xbar_ctl[search_bus], + ixb->hash_group()); + if (ixb->match_group() >= 0 && gateway_needs_ixbar_group()) { + auto &vh_xbar_ctl = row_reg.vh_xbar[search_bus].exactmatch_row_vh_xbar_ctl; + setup_muxctl(vh_xbar_ctl, ixb->match_group()); + /* vh_xbar_ctl.exactmatch_row_vh_xbar_thread = gress; */ } + } + gw_reg.gateway_table_ctl.gateway_table_logical_table = logical_id; + gw_reg.gateway_table_ctl.gateway_table_thread = timing_thread(gress); + for (auto &r : xor_match) + gw_reg.gateway_table_matchdata_xor_en |= 
bitMask(r.val->size()) << r.offset; + int idx = 3; + gw_reg.gateway_table_ctl.gateway_table_mode = range_match; + for (auto &line : table) { + BUG_CHECK(idx >= 0); + /* FIXME -- hardcoding version/valid to always */ + gw_reg.gateway_table_vv_entry[idx].gateway_table_entry_versionvalid0 = 0x3; + gw_reg.gateway_table_vv_entry[idx].gateway_table_entry_versionvalid1 = 0x3; + gw_reg.gateway_table_entry_matchdata[idx][0] = line.val.word0.getrange(0, 32); + gw_reg.gateway_table_entry_matchdata[idx][1] = line.val.word1.getrange(0, 32); + if (range_match) { + auto &info = range_match_info[range_match]; + for (unsigned i = 0; i < range_match_info[range_match].units; i++) { + gw_reg.gateway_table_data_entry[idx][0] |= (line.range[i] & info.half_mask) + << (i * info.bits); + gw_reg.gateway_table_data_entry[idx][1] |= + ((line.range[i] >> info.half_shift) & info.half_mask) << (i * info.bits); + } + } else { + gw_reg.gateway_table_data_entry[idx][0] = line.val.word0.getrange(32, 24); + gw_reg.gateway_table_data_entry[idx][1] = line.val.word1.getrange(32, 24); + } + if (!line.run_table) { + merge.gateway_inhibit_lut[logical_id] |= 1 << idx; + } + idx--; + } + if (!miss.run_table) { + merge.gateway_inhibit_lut[logical_id] |= 1 << 4; + } + write_next_table_regs(regs); + merge.gateway_en |= 1 << logical_id; + setup_muxctl(merge.gateway_to_logicaltable_xbar_ctl[logical_id], row.row * 2 + gw_unit); + if (layout.size() > 1) { + int result_bus = layout[1].bus.at(Layout::RESULT_BUS); + payload_write_regs(regs, layout[1].row, result_bus >> 1, result_bus & 1); + } + if (Table *tbl = match_table) { + bool tind_bus = false; + auto bus_type = Layout::RESULT_BUS; + auto *tmatch = dynamic_cast(tbl); + if (tmatch) { + tind_bus = true; + bus_type = Layout::TIND_BUS; + tbl = tmatch->indirect; + } else if (auto *hashaction = dynamic_cast(tbl)) { + tind_bus = hashaction->layout[0].bus.at(bus_type) >= 2; + } + if (tbl) { + for (auto &row : tbl->layout) { + if (row.bus.count(bus_type)) { + int bus = 
row.bus.at(bus_type); + auto &xbar_ctl = merge.gateway_to_pbus_xbar_ctl[row.row * 2 + (bus & 1)]; + if (tind_bus) { + xbar_ctl.tind_logical_select = logical_id; + xbar_ctl.tind_inhibit_enable = 1; + } else { + xbar_ctl.exact_logical_select = logical_id; + xbar_ctl.exact_inhibit_enable = 1; + } + } + } + } else { + BUG_CHECK(tmatch); + auto &xbar_ctl = merge.gateway_to_pbus_xbar_ctl[tmatch->indirect_bus]; + xbar_ctl.tind_logical_select = logical_id; + xbar_ctl.tind_inhibit_enable = 1; + } + } else { + if (gress != GHOST) merge.predication_ctl[gress].table_thread |= 1 << logical_id; + if (gress == INGRESS || gress == GHOST) { + merge.logical_table_thread[0].logical_table_thread_ingress |= 1 << logical_id; + merge.logical_table_thread[1].logical_table_thread_ingress |= 1 << logical_id; + merge.logical_table_thread[2].logical_table_thread_ingress |= 1 << logical_id; + } else if (gress == EGRESS) { + regs.dp.imem_table_addr_egress |= 1 << logical_id; + merge.logical_table_thread[0].logical_table_thread_egress |= 1 << logical_id; + merge.logical_table_thread[1].logical_table_thread_egress |= 1 << logical_id; + merge.logical_table_thread[2].logical_table_thread_egress |= 1 << logical_id; + } + auto &adrdist = regs.rams.match.adrdist; + adrdist.adr_dist_table_thread[timing_thread(gress)][0] |= 1 << logical_id; + adrdist.adr_dist_table_thread[timing_thread(gress)][1] |= 1 << logical_id; + // FIXME -- allow table_counter on standalone gateay? What can it count? + if (options.match_compiler) + merge.mau_table_counter_ctl[logical_id / 8U].set_subfield(4, 3 * (logical_id % 8U), 3); + standalone_write_regs(regs); + } + if (stage->tcam_delay(gress) > 0) merge.exact_match_logical_result_delay |= 1 << logical_id; +} + +std::set gateways_in_json; +void GatewayTable::gen_tbl_cfg(json::vector &out) const { + // Avoid adding gateway table multiple times to the json. 
The gateway table + // gets called multiple times in some cases based on how it is attached or + // associated with a match table, we should only output it to json once. + auto gwName = gateway_name.empty() ? name() : gateway_name; + if (gateways_in_json.count(gwName)) return; + LOG3("### Gateway table " << gwName << " gen_tbl_cfg " << loc()); + json::map gTable; + gTable["direction"] = P4Table::direction_name(gress); + gTable["attached_to"] = match_table ? match_table->p4_name() : "-"; + gTable["handle"] = gateway_handle++; + gTable["name"] = gwName; + gTable["table_type"] = "condition"; + + json::vector gStageTables; + json::map gStageTable; + + json::map &next_table_ids = gStageTable["next_tables"]; + json::map &next_table_names = gStageTable["next_table_names"]; + + auto &condTNext = cond_true.next; + auto &condFNext = cond_false.next; + if (Target::LONG_BRANCH_TAGS() > 0) { + json::vector &next_table_names_true = next_table_names["true"]; + json::vector &next_table_names_false = next_table_names["false"]; + json::vector &next_table_ids_true = next_table_ids["true"]; + json::vector &next_table_ids_false = next_table_ids["false"]; + if (condTNext.size() == 0) { + next_table_names_true.push_back(condTNext.next_table_name()); + next_table_ids_true.push_back(condTNext.next_table_id()); + } else { + for (auto t : condTNext) { + next_table_names_true.push_back(t.name); + next_table_ids_true.push_back(t->table_id()); + } + } + if (condFNext.size() == 0) { + next_table_names_false.push_back(condFNext.next_table_name()); + next_table_ids_false.push_back(condFNext.next_table_id()); + } else { + for (auto t : condFNext) { + next_table_names_false.push_back(t.name); + next_table_ids_false.push_back(t->table_id()); + } + } + } else { + next_table_ids["false"] = json::string(condFNext.next_table_id()); + next_table_ids["true"] = json::string(condTNext.next_table_id()); + next_table_names["false"] = json::string(condFNext.next_table_name()); + next_table_names["true"] = 
json::string(condTNext.next_table_name()); + } + + json::map mra; + mra["memory_unit"] = gw_memory_unit(); + mra["memory_type"] = "gateway"; + mra["payload_buses"] = json::vector(); + gStageTable["memory_resource_allocation"] = std::move(mra); + json::vector pack_format; // For future use + gStageTable["pack_format"] = std::move(pack_format); + + gStageTable["logical_table_id"] = logical_id; + gStageTable["stage_number"] = stage->stageno; + gStageTable["stage_table_type"] = "gateway"; + gStageTable["size"] = 0; + gStageTables.push_back(std::move(gStageTable)); + + json::vector condition_fields; + for (auto m : match) { + json::map condition_field; + condition_field["name"] = m.val.name(); + condition_field["start_bit"] = m.offset; + condition_field["bit_width"] = m.val.size(); + condition_fields.push_back(std::move(condition_field)); + } + + gTable["stage_tables"] = std::move(gStageTables); + gTable["condition_fields"] = std::move(condition_fields); + gTable["condition"] = gateway_cond; + gTable["size"] = 0; + out.push_back(std::move(gTable)); + gateways_in_json.insert(gwName); +} + +DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(GatewayTable, TARGET_CLASS) diff --git a/backends/tofino/bf-asm/gtest/asm-types.cpp b/backends/tofino/bf-asm/gtest/asm-types.cpp new file mode 100644 index 00000000000..f03908e26bd --- /dev/null +++ b/backends/tofino/bf-asm/gtest/asm-types.cpp @@ -0,0 +1,270 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/asm-types.h" + +#include + +namespace { + +auto CaptureStderr = ::testing::internal::CaptureStderr; +auto Stderr = ::testing::internal::GetCapturedStderr; +auto terminate = ::testing::KilledBySignal(SIGABRT); + +TEST(asm_types, get_int64_0) { + uint32_t i = 0; + value_t v{tINT, 0, 0}; + v.i = i; + CaptureStderr(); + EXPECT_EQ(get_int64(v), i); + EXPECT_EQ(get_int64(v, 0), i); + EXPECT_EQ(get_int64(v, 0, "no error check"), i); + EXPECT_EQ(get_int64(v, 1), i); + EXPECT_EQ(get_int64(v, 1, "no error"), i); + EXPECT_EQ(get_int64(v, 64), i); + EXPECT_EQ(get_int64(v, 64, "no error"), i); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + // Slow tests... + EXPECT_EXIT(get_int64(v, 128), terminate, "Assembler BUG"); + EXPECT_EXIT(get_int64(v, 128, "terminates"), terminate, "Assembler BUG"); +} + +TEST(asm_types, get_int64_32bit) { + uint32_t i = 0xAAAAAAAA; + value_t v{tINT, 0, 0}; + v.i = i; + CaptureStderr(); + EXPECT_EQ(get_int64(v), i); + EXPECT_EQ(get_int64(v, 0), i); + EXPECT_EQ(get_int64(v, 0, "no error check"), i); + EXPECT_EQ(get_int64(v, 32), i); + EXPECT_EQ(get_int64(v, 32, "no error"), i); + EXPECT_EQ(get_int64(v, 16), 0xAAAA); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_int64(v, 16, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_int64_64bit) { + uint64_t i = 0xAAAAAAAAAAAAAAAA; + value_t v{tINT, 0, 0}; + v.i = i; + CaptureStderr(); + EXPECT_EQ(get_int64(v), i); + EXPECT_EQ(get_int64(v, 0), i); + EXPECT_EQ(get_int64(v, 0, "no error check"), i); + EXPECT_EQ(get_int64(v, 64), i); + EXPECT_EQ(get_int64(v, 64, "no error"), i); + EXPECT_EQ(get_int64(v, 48), 0xAAAAAAAAAAAA); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_int64(v, 
48, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_bigi_empty) { + value_t v{tBIGINT, 0, 0}; + v.bigi = EMPTY_VECTOR_INIT; + EXPECT_EQ(get_int64(v), 0); + EXPECT_EQ(get_bitvec(v), bitvec()); +} + +TEST(asm_types, get_int64_bigi_0) { + uint32_t i = 0; + value_t v{tBIGINT, 0, 0}; + VECTOR_init1(v.bigi, i); + CaptureStderr(); + EXPECT_EQ(get_int64(v), i); + EXPECT_EQ(get_int64(v, 0), i); + EXPECT_EQ(get_int64(v, 0, "no error check"), i); + EXPECT_EQ(get_int64(v, 1), i); + EXPECT_EQ(get_int64(v, 1, "no error"), i); + EXPECT_EQ(get_int64(v, 64), i); + EXPECT_EQ(get_int64(v, 64, "no error"), i); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + // Slow tests... + EXPECT_EXIT(get_int64(v, 128), terminate, "Assembler BUG"); + EXPECT_EXIT(get_int64(v, 128, "terminates"), terminate, "Assembler BUG"); +} + +TEST(asm_types, get_int64_bigi_32bit) { + uint32_t i = 0xAAAAAAAA; + value_t v{tBIGINT, 0, 0}; + VECTOR_init1(v.bigi, i); + CaptureStderr(); + EXPECT_EQ(get_int64(v), i); + EXPECT_EQ(get_int64(v, 0), i); + EXPECT_EQ(get_int64(v, 0, "no error check"), i); + EXPECT_EQ(get_int64(v, 32), i); + EXPECT_EQ(get_int64(v, 32, "no error"), i); + EXPECT_EQ(get_int64(v, 16), 0xAAAA); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_int64(v, 16, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_int64_bigi_64bit) { + uint64_t i = 0xAAAAAAAAAAAAAAAA; + value_t v{tBIGINT, 0, 0}; + if (sizeof(uintptr_t) == sizeof(uint32_t)) + VECTOR_init2(v.bigi, 0xAAAAAAAA, 0xAAAAAAAA); + else + VECTOR_init1(v.bigi, i); + CaptureStderr(); + EXPECT_EQ(get_int64(v), i); + EXPECT_EQ(get_int64(v, 0), i); + EXPECT_EQ(get_int64(v, 0, "no error check"), i); + EXPECT_EQ(get_int64(v, 64), i); + EXPECT_EQ(get_int64(v, 64, "no error"), i); + EXPECT_EQ(get_int64(v, 48), 0xAAAAAAAAAAAA); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + 
CaptureStderr(); + get_int64(v, 48, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_bitvec_0) { + value_t v{tINT, 0, 0}; + v.i = 0; + auto i = bitvec(0); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 1), i); + EXPECT_EQ(get_bitvec(v, 1, "no error"), i); + EXPECT_EQ(get_bitvec(v, 64), i); + EXPECT_EQ(get_bitvec(v, 64, "no error"), i); + EXPECT_EQ(get_bitvec(v, 128), i); + EXPECT_EQ(get_bitvec(v, 128, "no error"), i); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); +} + +TEST(asm_types, get_bitvec_32bit) { + value_t v{tINT, 0, 0}; + v.i = 0xAAAAAAAA; + auto i = bitvec(0xAAAAAAAA); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 32), i); + EXPECT_EQ(get_bitvec(v, 32, "no error"), i); + EXPECT_EQ(get_bitvec(v, 16), bitvec(0xAAAA)); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_bitvec(v, 16, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_bitvec_64bit) { + value_t v{tINT, 0, 0}; + v.i = 0xAAAAAAAAAAAAAAAA; + auto i = bitvec(0xAAAAAAAAAAAAAAAA); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 64), i); + EXPECT_EQ(get_bitvec(v, 64, "no error"), i); + EXPECT_EQ(get_bitvec(v, 48), bitvec(0xAAAAAAAAAAAA)); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_bitvec(v, 48, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_bitvec_bigi_0) { + value_t v{tBIGINT, 0, 0}; + VECTOR_init1(v.bigi, 0); + auto i = bitvec(0); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + 
EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 1), i); + EXPECT_EQ(get_bitvec(v, 1, "no error"), i); + EXPECT_EQ(get_bitvec(v, 64), i); + EXPECT_EQ(get_bitvec(v, 64, "no error"), i); + EXPECT_EQ(get_bitvec(v, 128), i); + EXPECT_EQ(get_bitvec(v, 128, "no error"), i); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); +} + +TEST(asm_types, get_bitvec_bigi_32bit) { + value_t v{tBIGINT, 0, 0}; + VECTOR_init1(v.bigi, 0xAAAAAAAA); + auto i = bitvec(0xAAAAAAAA); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 32), i); + EXPECT_EQ(get_bitvec(v, 32, "no error"), i); + EXPECT_EQ(get_bitvec(v, 16), bitvec(0xAAAA)); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_bitvec(v, 16, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_bitvec_bigi_64bit) { + value_t v{tBIGINT, 0, 0}; + if (sizeof(uintptr_t) == sizeof(uint32_t)) + VECTOR_init2(v.bigi, 0xAAAAAAAA, 0xAAAAAAAA); + else + VECTOR_init1(v.bigi, 0xAAAAAAAAAAAAAAAA); + auto i = bitvec(0xAAAAAAAAAAAAAAAA); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 64), i); + EXPECT_EQ(get_bitvec(v, 64, "no error"), i); + EXPECT_EQ(get_bitvec(v, 48), bitvec(0xAAAAAAAAAAAA)); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_bitvec(v, 48, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +TEST(asm_types, get_bitvec_bigi_128bit) { + value_t v{tBIGINT, 0, 0}; + if (sizeof(uintptr_t) == sizeof(uint32_t)) + VECTOR_init4(v.bigi, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA); + else + VECTOR_init2(v.bigi, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA); + bitvec i; + for (int j = 0; j < 4; ++j) 
i.putrange(j * 32, 32, 0xAAAAAAAA); + CaptureStderr(); + EXPECT_EQ(get_bitvec(v), i); + EXPECT_EQ(get_bitvec(v, 0), i); + EXPECT_EQ(get_bitvec(v, 0, "no error check"), i); + EXPECT_EQ(get_bitvec(v, 128), i); + EXPECT_EQ(get_bitvec(v, 128, "no error"), i); + EXPECT_EQ(get_bitvec(v, 192), i); + EXPECT_EQ(get_bitvec(v, 192, "no error"), i); + EXPECT_EQ(get_bitvec(v, 48), bitvec(0xAAAAAAAAAAAA)); + EXPECT_TRUE(Stderr().find("error") == std::string::npos); + CaptureStderr(); + get_bitvec(v, 48, "my error"); + EXPECT_TRUE(Stderr().find("error: my error") != std::string::npos); +} + +} // namespace diff --git a/backends/tofino/bf-asm/gtest/depositfield.cpp b/backends/tofino/bf-asm/gtest/depositfield.cpp new file mode 100644 index 00000000000..b307b445f46 --- /dev/null +++ b/backends/tofino/bf-asm/gtest/depositfield.cpp @@ -0,0 +1,153 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/depositfield.h" + +#include + +#if __cplusplus < 201402L && __cpp_binary_literals < 201304 +#error "Binary literals are required" +// We could fall back on boost/utility/binary.hpp +#endif + +namespace { + +constexpr int conSize8 = 8; +constexpr int conSize32 = 32; +constexpr int tooLarge = 8; +constexpr int tooSmall = -9; +constexpr int tooSmall2 = -5; + +TEST(depositfield, 0) { + int32_t zero = 0; + auto res = DepositField::discoverRotation(zero, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, zero); + res = DepositField::discoverRotation(zero, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, zero); + res = DepositField::discoverRotation(zero, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, zero); +} + +TEST(depositfield, large) { + int32_t value = tooLarge - 1; + auto res = DepositField::discoverRotation(value, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, value); + res = DepositField::discoverRotation(value, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, value); + res = DepositField::discoverRotation(value, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, value); +} + +TEST(depositfield, small) { + int32_t value = tooSmall + 1; + int32_t value2 = tooSmall2 + 1; + auto res = DepositField::discoverRotation(value, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, value); + res = DepositField::discoverRotation(value, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, value); + ASSERT_TRUE(value < tooSmall2); + res = DepositField::discoverRotation(value, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 0U); // Not possible '0b11111000' + EXPECT_EQ(res.value, value); + res = 
DepositField::discoverRotation(value2, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, value2); +} + +TEST(depositfield, numTooLarge) { // 0b00001000 + // N.B. other solutions are valid, these are the ones we expect. + auto res = DepositField::discoverRotation(8, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 5U); + EXPECT_EQ(res.value, 1); + res = DepositField::discoverRotation(8, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 29U); + EXPECT_EQ(res.value, 1); + res = DepositField::discoverRotation(8, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 5U); + EXPECT_EQ(res.value, 1); +} + +TEST(depositfield, numTooSmall) { // 0b11110111 + auto res = DepositField::discoverRotation(-9, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 5U); + EXPECT_EQ(res.value, -2); + res = DepositField::discoverRotation(-9, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 29U); + EXPECT_EQ(res.value, -2); + res = DepositField::discoverRotation(-9, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 5U); + EXPECT_EQ(res.value, -2); +} + +TEST(depositfield, 0b00110000) { + auto res = DepositField::discoverRotation(0b00110000, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 4U); + EXPECT_EQ(res.value, 0b00000011); + res = DepositField::discoverRotation(0b00110000, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 28U); + EXPECT_EQ(res.value, 0b00000011); + res = DepositField::discoverRotation(0b00110000, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 4U); + EXPECT_EQ(res.value, 0b00000011); +} + +TEST(depositfield, 0b00100001) { + // Failures are sent back with zero rotation and the value unchanged. 
+ auto res = DepositField::discoverRotation(0b00100001, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, 0b00100001); + res = DepositField::discoverRotation(0b00100001, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, 0b00100001); + res = DepositField::discoverRotation(0b00100001, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, 0b00100001); +} + +TEST(depositfield, 0b01111111) { // 127 + auto res = DepositField::discoverRotation(0b01111111, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 1U); + EXPECT_EQ(res.value, -2); + res = DepositField::discoverRotation(0b01111111, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 0U); + EXPECT_EQ(res.value, 0b01111111); // Can't do. + res = DepositField::discoverRotation(0b01111111, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 1U); + EXPECT_EQ(res.value, -2); +} + +TEST(depositfield, 0b10011111) { // -97 + auto res = DepositField::discoverRotation(-97, conSize8, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 3U); + EXPECT_EQ(res.value, -4); + res = DepositField::discoverRotation(-97, conSize32, tooLarge, tooSmall); + EXPECT_EQ(res.rotate, 27U); + EXPECT_EQ(res.value, -4); + res = DepositField::discoverRotation(-97, conSize8, tooLarge, tooSmall2); + EXPECT_EQ(res.rotate, 3U); + EXPECT_EQ(res.value, -4); +} + +} // namespace diff --git a/backends/tofino/bf-asm/gtest/gateway.cpp b/backends/tofino/bf-asm/gtest/gateway.cpp new file mode 100644 index 00000000000..437afdfc6a2 --- /dev/null +++ b/backends/tofino/bf-asm/gtest/gateway.cpp @@ -0,0 +1,123 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "backends/tofino/bf-asm/bfas.h" +#include "backends/tofino/bf-asm/stage.h" + +namespace { + +// Verify that the next table registers are correctly configured for a standalone gateway with a +// miss next table and no hit next table +TEST(gateway, standalone_miss_next_table) { + const char *gateway_str = R"GATEWAY_CFG( +version: + target: Tofino2 +phv ingress: + ig_intr_md_for_dprsr.mirror_type.$valid: B1(0) + ig_intr_md.ingress_port: { stage 0: W0(16..24) } + hdr.data.h1: MH4 + hdr.data.b1: MB1 + ig_intr_md_for_tm.ucast_egress_port: { stage 1..20: W0(0..8) } + ig_intr_md_for_tm.ucast_egress_port.$valid: { stage 1..20: B1(1) } + ig_intr_md_for_dprsr.mirror_type: { stage 20: MB0(0..3) } + hdr.data.$valid: B1(2) +stage 0 ingress: + gateway cond-1 0: + name: cond-1 + input_xbar: + exact group 0: { 16: hdr.data.b1 } + row: 7 + bus: 0 + unit: 0 + match: { 0: hdr.data.b1 } + 0x12: + next: END + miss: + next: test_0 + condition: + expression: "(hdr.data.b1 != 18)" + true: test_0 + false: END +stage 2 ingress: + dependency: match + mpr_stage_id: 1 + mpr_bus_dep_glob_exec: 0x0 + mpr_bus_dep_long_brch: 0x0 + mpr_always_run: 0x0 + mpr_next_table_lut: + 0: 0xff + ternary_match test_0 0: + always_run: true + p4: { name: ingress.test, size: 512 } + p4_param_order: + hdr.data.h1: { type: ternary, size: 16, full_size: 16 } + row: 0 + bus: 0 + column: 0 + input_xbar: + ternary group 0: { 0: hdr.data.h1 } + match: + - { group: 0, byte_config: 3, dirtcam: 0x5 } + hit: [ END ] + miss: END + indirect: 
test_0$tind + ternary_indirect test_0$tind: + row: 0 + bus: 0 + column: 2 + input_xbar: + ternary group 0: { 0: hdr.data.h1 } + format: { action: 0..1, immediate: 2..9 } + action_bus: { 0 : immediate(0..7) } + instruction: test_0$tind(action, $DEFAULT) + actions: + ingress.setb1(1, 1): + - p4_param_order: { val: 8 } + - hit_allowed: { allowed: true } + - default_action: { allowed: true } + - handle: 0x20000002 + - next_table: 0 + - { val_1: immediate(0..7), val: val_1 } + - set MB1, val + ingress.noop(2, 0): + - hit_allowed: { allowed: true } + - default_action: { allowed: true } + - handle: 0x20000003 + - next_table: 0 + - { } + default_action: ingress.setb1 + default_action_parameters: + val: "0xAA" +)GATEWAY_CFG"; + + asm_parse_string(gateway_str); + + Section::process_all(); + + Target::JBay::mau_regs regs; + auto &stages = AsmStage::stages(INGRESS); + stages[0].write_regs(regs, false); + for (auto table : stages[0].tables) { + table->write_regs(regs); + } + + EXPECT_EQ(regs.rams.match.merge.pred_is_a_brch, 0x01); +} + +} // namespace diff --git a/backends/tofino/bf-asm/gtest/gtestasm.cpp b/backends/tofino/bf-asm/gtest/gtestasm.cpp new file mode 100644 index 00000000000..85950ee1b9b --- /dev/null +++ b/backends/tofino/bf-asm/gtest/gtestasm.cpp @@ -0,0 +1,84 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include "lib/compile_context.h" +#include "lib/log.h" +#include "lib/options.h" + +using namespace P4; + +template +class CompileContext : public virtual BaseCompileContext { + public: + /// @return the current compilation context, which must be of type + /// CompileContext. + static CompileContext &get() { return CompileContextStack::top(); } + + CompileContext() {} + + template + CompileContext(CompileContext &context) + : optionsInstance(context.options()) {} + + /// @return the compiler options for this compilation context. + OptionsType &options() { return optionsInstance; } + + private: + /// The compiler options for this compilation context. + OptionsType optionsInstance; +}; + +class GTestOptions : public Util::Options { + static const char *defaultMessage; + + public: + GTestOptions() : Util::Options(defaultMessage) { + registerOption( + "-T", "loglevel", + [](const char *arg) { + Log::addDebugSpec(arg); + return true; + }, + "[Compiler debugging] Adjust logging level per file (see below)"); + } + std::vector *process(int argc, char *const argv[]) { + auto remainingOptions = Util::Options::process(argc, argv); + return remainingOptions; + } + const char *getIncludePath() const override { return ""; } +}; + +const char *GTestOptions::defaultMessage = "bf-asm gtest"; + +using GTestContext = CompileContext; + +GTEST_API_ int main(int argc, char **argv) { + printf("running gtestasm\n"); + + // process gtest flags + ::testing::InitGoogleTest(&argc, argv); + + // process debug flags + AutoCompileContext autoGTestContext(new GTestContext); + GTestContext::get().options().process(argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/backends/tofino/bf-asm/gtest/hashexpr.cpp b/backends/tofino/bf-asm/gtest/hashexpr.cpp new file mode 100644 index 00000000000..f89c199472a --- /dev/null +++ b/backends/tofino/bf-asm/gtest/hashexpr.cpp @@ -0,0 +1,118 @@ +/** + * Copyright (C) 2024 Intel 
Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/hashexpr.h" + +#include + +#include "backends/tofino/bf-asm/bfas.h" +#include "backends/tofino/bf-asm/stage.h" + +namespace { + +// TEST(hashexpr, slice_with_rand_alg) +// +// Verify that a slice with a random algorithm doesn't loop forever +// +// Warning: If it does loop forever, then the test will hang :( Running through ctest should +// result in an eventual timeout, but running from the command line will hang until Ctrl-C. 
+TEST(hashexpr, slice_with_rand_alg) { + const char *hash_str = R"HASH_CFG( +version: + target: Tofino2 +phv ingress: + Field1: MW0 + Field2: MW1 + Field3: MH8(0..8) + Field4: MB9 + Hdr.$valid: B3(4) +stage 0 ingress: + hash_action _HashTable 0: + always_run: true + p4: { name: HashTable, size: 1, disable_atomic_modify : true } + row: 0 + result_bus: 1 + hash_dist: + 1: { hash: 1, mask: 0xffff, shift: 0 } + input_xbar: + exact group 2: { 0: Field1, 32: Field2, 64: Field3, 80: Field4 } + hash 4: + 16..31: slice(stripe(crc_rev(0xc002, 0x0, 0x0, 81, { 9: Field2, 41: Field1 }, { })), 0..15) + hash 5: + 16..31: slice(stripe(crc_rev(0xc002, 0x0, 0x0, 81, { 0: Field3, 73: Field4 }, { })), 0..15) + hash group 1: + table: [4, 5] + seed: 0x0 + gateway: + name: cond-81 + input_xbar: + exact group 1: { 36: Hdr.$valid } + row: 1 + bus: 0 + unit: 0 + payload_row: 0 + payload_unit: 1 + payload: 0x1 + format: { action(0): 0..0 } + match: { 4: Hdr.$valid } + 0b***1: END + miss: run_table + condition: + expression: "(Hdr.$valid == 1)" + true: END + false: END + next: END + action_bus: { 108..111 : hash_dist(1) } + instruction: _HashTable(action, $DEFAULT) + actions: + MyAction(1, 7): + - hit_allowed: { allowed: true } + - default_action: { allowed: true } + - handle: 0x20000063 + - next_table: 0 + - set W15(0..15), hash_dist(1, 0..15) + default_action: MyAction +)HASH_CFG"; + + asm_parse_string(hash_str); + + Stage *stage = Stage::stage(INGRESS, 0); + Table *table = stage->tables[0]; + InputXbar &ixbar = *table->input_xbar[0]; + for (auto &kv1 : ixbar.get_hash_tables()) { + // Grab the hash table map + auto &htmap = kv1.second; + for (auto &kv2 : htmap) { + // Get the hash column/hash expression and change the hash algorithm + auto &hc = kv2.second; + auto *he = hc.fn; + he->hash_algorithm.hash_alg = RANDOM_DYN; + } + } + + std::cerr << std::endl + << "If this test hangs then there is a problem with handling of RANDOM_DYN at the " + "hash slice level. 
Terminate the hang with Ctrl-C." + << std::endl + << std::endl; + Section::process_all(); + + // Reset the target type for future tests + options.target = NO_TARGET; +} + +} // namespace diff --git a/backends/tofino/bf-asm/gtest/mirror.cpp b/backends/tofino/bf-asm/gtest/mirror.cpp new file mode 100644 index 00000000000..bfa377d83e6 --- /dev/null +++ b/backends/tofino/bf-asm/gtest/mirror.cpp @@ -0,0 +1,241 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "backends/tofino/bf-asm/deparser.h" +#include "backends/tofino/bf-asm/sections.h" + +namespace { + +/* Tests for mirror + * + * Currently we cannot run tests for multiple targets (e.g., Tofino and JBay) + * in a single run. As a result, all tests except Tofino are disabled. 
+ */ + +#define TOF_MIRR_CFG regs.header.hir.main_i.mirror_cfg +#define TOF_MIRR_TBL regs.header.hir.main_i.mirror_tbl + +#define JBAY_MIRR_BASE regs.dprsrreg.ho_i +#define JBAY_MIRR_ENTRY him.mirr_hdr_tbl.entry +#define JBAY_MIRR_SEL regs.dprsrreg.inp.ipp.ingr.m_mirr_sel + +#define FTR_MDP_MIRR_BASE regs.mdp_mem.tmm_ext_ram.tmm_ext[0] +#define FTR_DPRSR_MIRR_BASE regs.dprsr.dprsr_phvxb_rspec.ehm_xb + +/// Mirror configuration for Tofino +struct TofinoMirrorCfg { + std::string sel_phv_; + int sel_phv_lo_; + + std::map entry_id_phv; + std::map> entry_phvs; + + TofinoMirrorCfg(std::string sel_phv, int sel_phv_lo) + : sel_phv_(sel_phv), sel_phv_lo_(sel_phv_lo) {} +}; + +/// Mirror configuration for JBay +struct JBayMirrorCfg { + std::string sel_phv_; + int sel_phv_lo_; + + std::string sel_pov_; + int sel_pov_lo_; + + std::map entry_id_phv; + std::map> entry_phvs; + + JBayMirrorCfg(std::string sel_phv, int sel_phv_lo, std::string sel_pov, int sel_pov_lo) + : sel_phv_(sel_phv), sel_phv_lo_(sel_phv_lo), sel_pov_(sel_pov), sel_pov_lo_(sel_pov_lo) {} +}; + +/// Map from register name to Phv::Register* +std::map phvRegs; + +/// Populate register name -> register map +void populateRegIds() { + if (!phvRegs.size()) { + // Initialize the PHVs. + // Triggered by requesting a slice for a field. The field does not need to exist. + Phv::get(INGRESS, 0, "jbay_dummy$"); + + // Walk through the registers and record them + for (int i = 0; i < Phv::num_regs(); ++i) { + if (const auto *reg = Phv::reg(i)) phvRegs[reg->name] = reg; + } + } +} + +/// Get the MAU ID of a given register name +int mau_id(std::string name) { return phvRegs.count(name) ? phvRegs.at(name)->mau_id() : -1; } + +/// Get the deparser ID of a given register name +int deparser_id(std::string name) { + return phvRegs.count(name) ? 
phvRegs.at(name)->deparser_id() : -1; +} + +/// Find a Digest for a given target +Deparser::Digest *findDigest(Deparser *dprsr, target_t target) { + for (auto &digest : dprsr->digests) { + if (digest.type->target == target) return &digest; + } + + BUG("Could not find the Digest for %s", toString(target).c_str()); + return nullptr; +} + +/** Reset all target information + * + * This function should be called when switching from one target to another + * (e.g., Tofino to JBay) in tests to reset state. + */ +void resetTarget() { + options.target = NO_TARGET; + Phv::test_clear(); + phvRegs.clear(); + Deparser *dprsr = dynamic_cast(Section::test_get("deparser")); + dprsr->gtest_clear(); +} + +/// Verify that registers match a mirror configuration (Tofino) +void tofinoCheckMirrorRegs(Target::Tofino::deparser_regs ®s, TofinoMirrorCfg &cfg) { + populateRegIds(); + + Deparser *dprsr = dynamic_cast(Section::test_get("deparser")); + auto *digest = findDigest(dprsr, TOFINO); + + // Tell the digest code to set the registers + digest->type->setregs(regs, *dprsr, *digest); + + // Verify the registers: + // 1. Verify common registers + EXPECT_EQ(TOF_MIRR_CFG.phv, deparser_id(cfg.sel_phv_)); + EXPECT_EQ(TOF_MIRR_CFG.shft, cfg.sel_phv_lo_); + EXPECT_EQ(TOF_MIRR_CFG.valid, 1); + + // 2. Verify the entries + for (auto &kv : cfg.entry_id_phv) { + int id = kv.first; + EXPECT_EQ(TOF_MIRR_TBL[id].id_phv, deparser_id(cfg.entry_id_phv[id])); + int idx = 0; + for (auto &phv : cfg.entry_phvs[id]) { + EXPECT_EQ(TOF_MIRR_TBL[id].phvs[idx], deparser_id(phv)); + idx++; + } + EXPECT_EQ(TOF_MIRR_TBL[id].len, cfg.entry_phvs[id].size()); + } +} + +/// Verify that registers match a mirror configuration (JBay) +void jbayCheckMirrorRegs(Target::JBay::deparser_regs ®s, JBayMirrorCfg &cfg) { + // Base index for POV PHV. Want this to be non-zero. 
+ const int povBase = 64; + + populateRegIds(); + + Deparser *dprsr = dynamic_cast(Section::test_get("deparser")); + auto *digest = findDigest(dprsr, JBAY); + + // Ensure the POV register in the config is actually recorded as a POV in + // the deparser object + int povReg = mau_id(cfg.sel_pov_); + dprsr->pov[INGRESS][Phv::reg(povReg)] = povBase; + + // Tell the digest code to set the registers + digest->type->setregs(regs, *dprsr, *digest); + + // Verify the registers: + // 1. Verify common registers + EXPECT_EQ(JBAY_MIRR_SEL.phv, deparser_id(cfg.sel_phv_)); + EXPECT_EQ(JBAY_MIRR_SEL.pov, povBase + cfg.sel_pov_lo_); + EXPECT_EQ(JBAY_MIRR_SEL.shft, cfg.sel_phv_lo_); + EXPECT_EQ(JBAY_MIRR_SEL.disable_, 0); + + // 2. Verify the entries + for (auto &base : JBAY_MIRR_BASE) { + for (auto &kv : cfg.entry_id_phv) { + int id = kv.first; + EXPECT_EQ(base.JBAY_MIRR_ENTRY[id].id_phv, deparser_id(cfg.entry_id_phv[id])); + int idx = 0; + for (auto &phv : cfg.entry_phvs[id]) { + EXPECT_EQ(base.JBAY_MIRR_ENTRY[id].phvs[idx], deparser_id(phv)); + idx++; + } + EXPECT_EQ(base.JBAY_MIRR_ENTRY[id].len, cfg.entry_phvs[id].size()); + } + } +} + +TEST(mirror, digest_tofino) { + const char *mirror_str = R"MIRR_CFG( +version: + target: Tofino +deparser ingress: + mirror: + select: B9(0..3) # bit[3..0]: ingress::ig_intr_md_for_dprsr.mirror_type + 1: + - H19(0..7) # bit[7..0]: ingress::Thurmond.Circle.LaUnion[7:0].0-7 + - B9 # ingress::Thurmond.Longwood.Matheson + - B9 # ingress::Thurmond.Longwood.Matheson + - H56(0..8) # bit[8..0]: ingress::Thurmond.Armagh.Moorcroft +)MIRR_CFG"; + + resetTarget(); + + auto *digest = ::get(Deparser::Digest::Type::all[TOFINO][INGRESS], "mirror"); + ASSERT_NE(digest, nullptr) << "Unable to find the mirror digest"; + + Target::Tofino::deparser_regs regs; + asm_parse_string(mirror_str); + + TofinoMirrorCfg mirrorCfg("B9", 0); + mirrorCfg.entry_id_phv[1] = "H19"; + mirrorCfg.entry_phvs[1] = {"B9", "B9", "H56", "H56"}; + tofinoCheckMirrorRegs(regs, mirrorCfg); +} + 
+TEST(mirror, digest_jbay) { + const char *mirror_str = R"MIRR_CFG( +version: + target: Tofino2 +deparser ingress: + mirror: + select: { B9(0..3): B8(1) } # bit[3..0]: ingress::ig_intr_md_for_dprsr.mirror_type + 1: + - H19(0..7) # bit[7..0]: ingress::Thurmond.Circle.LaUnion[7:0].0-7 + - B9 # ingress::Thurmond.Longwood.Matheson + - B9 # ingress::Thurmond.Longwood.Matheson + - H56(0..8) # bit[8..0]: ingress::Thurmond.Armagh.Moorcroft +)MIRR_CFG"; + + resetTarget(); + + auto *digest = ::get(Deparser::Digest::Type::all[JBAY][INGRESS], "mirror"); + ASSERT_NE(digest, nullptr) << "Unable to find the mirror digest"; + + Target::JBay::deparser_regs regs; + asm_parse_string(mirror_str); + + JBayMirrorCfg mirrorCfg("B9", 0, "B8", 1); + mirrorCfg.entry_id_phv[1] = "H19"; + mirrorCfg.entry_phvs[1] = {"B9", "B9", "H56", "H56"}; + jbayCheckMirrorRegs(regs, mirrorCfg); +} + +} // namespace diff --git a/backends/tofino/bf-asm/gtest/parser-test.cpp b/backends/tofino/bf-asm/gtest/parser-test.cpp new file mode 100644 index 00000000000..f2bed038d8d --- /dev/null +++ b/backends/tofino/bf-asm/gtest/parser-test.cpp @@ -0,0 +1,1055 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "backends/tofino/bf-asm/bfas.h" +#include "backends/tofino/bf-asm/parser-tofino-jbay.h" + +namespace { + +// TEST(parser_test, get_parser_deepest_depth) +// +// +// While calculating the maximum depth, the assembler goes through the parser tree +// and visits every state recursively. The parser depth for a state is taken into account +// and included in the calculation at the time it is visited. +// +// Every state used to be visited at most one time, which was the source of the problem: +// +// In cases where parsing trees contained states that were called from more than one +// parent state, the depth calculation would be wrong unless the depth was at its maximum +// value the first time that state was visited. +// +// Made a change in the parse depth calculation to keep track of the largest parser depth +// "seen" for each state. When a state has already been visited, the recursion continues +// when the current parser depth is larger than the largest parser depth seen up to that +// point for that state. + +// The parser code provided in parser_str below contains that behavior as parse_udp and +// parse_tcp are called from both parse_ipv4 and parse_ipv6, two states with different depths, +// the longest one being parse_ipv6 that is visited after parse_ipv4. Without the fix, +// parser->get_prsr_max_dph() returns 6 instead of 7. 
+// +TEST(parser_test, get_parser_deepest_depth) { + const char *parser_str = R"PARSER_CFG( +version: + target: Tofino +parser egress: + start: $entry_point + init_zero: [ B19, B18, B16 ] + bitwise_or: [ B16, B18 ] + hdr_len_adj: 27 + meta_opt: 8191 + states: + $entry_point: + *: + load: { byte1 : 27 } + buf_req: 28 + next: start + start: + match: [ byte1 ] + 0x0a: + counter: + imm: 38 + 0..1: H16 # bit[7..15] -> H16 bit[8..0]: egress::eg_intr_md.egress_port + intr_md: 9 + shift: 27 + buf_req: 27 + next: parse_mirror_tagging_state + 0x**: + counter: + imm: 38 + 0..1: H16 # bit[7..15] -> H16 bit[8..0]: egress::eg_intr_md.egress_port + intr_md: 9 + shift: 27 + buf_req: 27 + next: parse_normal_tagging_state + parse_mirror_tagging_state: + *: + counter: dec 1 + B19: 10 # value 10 -> B19 bit[7..0]: egress::eg_md.packet_state + load: { half : 13..14 } + shift: 1 + buf_req: 15 + next: parse_ethernet + parse_ethernet: + match: [ half ] + 0x0800: + counter: dec 14 + 0..1: TH32 # egress::hdr.ethernet.dst_addr[47:32].32-47 + 2..5: TW19 # egress::hdr.ethernet.dst_addr[31:0].0-31 + 6..7: TH31 # egress::hdr.ethernet.src_addr[47:32].32-47 + 8..11: TW18 # egress::hdr.ethernet.src_addr[31:0].0-31 + 12..13: TH30 # egress::hdr.ethernet.ether_type + B18: 1 # value 1 -> B18 bit[0]: egress::hdr.ethernet.$valid + load: { byte1 : 23 } + shift: 14 + buf_req: 24 + next: parse_ipv4 + 0x86dd: + counter: dec 14 + 0..1: TH32 # egress::hdr.ethernet.dst_addr[47:32].32-47 + 2..5: TW19 # egress::hdr.ethernet.dst_addr[31:0].0-31 + 6..7: TH31 # egress::hdr.ethernet.src_addr[47:32].32-47 + 8..11: TW18 # egress::hdr.ethernet.src_addr[31:0].0-31 + 12..13: TH30 # egress::hdr.ethernet.ether_type + B18: 1 # value 1 -> B18 bit[0]: egress::hdr.ethernet.$valid + shift: 14 + buf_req: 14 + next: parse_ipv6 + 0x****: + counter: dec 14 + 0..1: TH32 # egress::hdr.ethernet.dst_addr[47:32].32-47 + 2..5: TW19 # egress::hdr.ethernet.dst_addr[31:0].0-31 + 6..7: TH31 # egress::hdr.ethernet.src_addr[47:32].32-47 + 8..11: 
TW18 # egress::hdr.ethernet.src_addr[31:0].0-31 + 12..13: TH30 # egress::hdr.ethernet.ether_type + B18: 1 # value 1 -> B18 bit[0]: egress::hdr.ethernet.$valid + shift: 14 + buf_req: 14 + next: min_parse_depth_accept_initial + parse_ipv4: + match: [ byte1 ] + 0x06: + counter: dec 20 + 0..3: TW4 + # - bit[0..3] -> TW4 bit[31..28]: egress::hdr.ipv4.version + # - bit[4..7] -> TW4 bit[27..24]: egress::hdr.ipv4.ihl + # - bit[8..15] -> TW4 bit[23..16]: egress::hdr.ipv4.diffserv + # - bit[16..31] -> TW4 bit[15..0]: egress::hdr.ipv4.total_len + 4..7: TW6 + # - bit[32..47] -> TW6 bit[31..16]: egress::hdr.ipv4.identification + # - bit[48..50] -> TW6 bit[15..13]: egress::hdr.ipv4.flags + # - bit[51..63] -> TW6 bit[12..0]: egress::hdr.ipv4.frag_offset + 8..11: TW5 + # - bit[64..71] -> TW5 bit[31..24]: egress::hdr.ipv4.ttl + # - bit[72..79] -> TW5 bit[23..16]: egress::hdr.ipv4.protocol + # - bit[80..95] -> TW5 bit[15..0]: egress::hdr.ipv4.hdr_checksum + 12..13: TH27 # egress::hdr.ipv4.src_addr[31:16].16-31 + 14..15: TH26 # egress::hdr.ipv4.src_addr[15:0].0-15 + 16..17: TH25 # egress::hdr.ipv4.dst_addr[31:16].16-31 + 18..19: TH24 # egress::hdr.ipv4.dst_addr[15:0].0-15 + B18: 2 # value 1 -> B18 bit[1]: egress::hdr.ipv4.$valid + load: { half : 22..23 } + shift: 20 + buf_req: 24 + next: parse_tcp + 0x11: + counter: dec 20 + 0..3: TW4 + # - bit[0..3] -> TW4 bit[31..28]: egress::hdr.ipv4.version + # - bit[4..7] -> TW4 bit[27..24]: egress::hdr.ipv4.ihl + # - bit[8..15] -> TW4 bit[23..16]: egress::hdr.ipv4.diffserv + # - bit[16..31] -> TW4 bit[15..0]: egress::hdr.ipv4.total_len + 4..7: TW6 + # - bit[32..47] -> TW6 bit[31..16]: egress::hdr.ipv4.identification + # - bit[48..50] -> TW6 bit[15..13]: egress::hdr.ipv4.flags + # - bit[51..63] -> TW6 bit[12..0]: egress::hdr.ipv4.frag_offset + 8..11: TW5 + # - bit[64..71] -> TW5 bit[31..24]: egress::hdr.ipv4.ttl + # - bit[72..79] -> TW5 bit[23..16]: egress::hdr.ipv4.protocol + # - bit[80..95] -> TW5 bit[15..0]: egress::hdr.ipv4.hdr_checksum + 
12..13: TH27 # egress::hdr.ipv4.src_addr[31:16].16-31 + 14..15: TH26 # egress::hdr.ipv4.src_addr[15:0].0-15 + 16..17: TH25 # egress::hdr.ipv4.dst_addr[31:16].16-31 + 18..19: TH24 # egress::hdr.ipv4.dst_addr[15:0].0-15 + B18: 2 # value 1 -> B18 bit[1]: egress::hdr.ipv4.$valid + load: { half : 20..21 } + shift: 20 + buf_req: 22 + next: parse_udp + 0x**: + counter: dec 20 + 0..3: TW4 + # - bit[0..3] -> TW4 bit[31..28]: egress::hdr.ipv4.version + # - bit[4..7] -> TW4 bit[27..24]: egress::hdr.ipv4.ihl + # - bit[8..15] -> TW4 bit[23..16]: egress::hdr.ipv4.diffserv + # - bit[16..31] -> TW4 bit[15..0]: egress::hdr.ipv4.total_len + 4..7: TW6 + # - bit[32..47] -> TW6 bit[31..16]: egress::hdr.ipv4.identification + # - bit[48..50] -> TW6 bit[15..13]: egress::hdr.ipv4.flags + # - bit[51..63] -> TW6 bit[12..0]: egress::hdr.ipv4.frag_offset + 8..11: TW5 + # - bit[64..71] -> TW5 bit[31..24]: egress::hdr.ipv4.ttl + # - bit[72..79] -> TW5 bit[23..16]: egress::hdr.ipv4.protocol + # - bit[80..95] -> TW5 bit[15..0]: egress::hdr.ipv4.hdr_checksum + 12..13: TH27 # egress::hdr.ipv4.src_addr[31:16].16-31 + 14..15: TH26 # egress::hdr.ipv4.src_addr[15:0].0-15 + 16..17: TH25 # egress::hdr.ipv4.dst_addr[31:16].16-31 + 18..19: TH24 # egress::hdr.ipv4.dst_addr[15:0].0-15 + B18: 2 # value 1 -> B18 bit[1]: egress::hdr.ipv4.$valid + shift: 20 + buf_req: 20 + next: min_parse_depth_accept_initial + parse_tcp: + match: [ half ] + 0x0050: + counter: dec 20 + 0..1: TH8 # egress::hdr.tcp.src_port + 2..3: TH7 # egress::hdr.tcp.dst_port + 4..7: TW17 # egress::hdr.tcp.seq_no + 8..11: TW16 # egress::hdr.tcp.ack_no + 12: TB5 + # - bit[96..99] -> TB5 bit[7..4]: egress::hdr.tcp.data_offset + # - bit[100..103] -> TB5 bit[3..0]: egress::hdr.tcp.res + 13: TB6 # egress::hdr.tcp.flags + 14..15: TH6 # egress::hdr.tcp.window + 16..19: TW7 + # - bit[128..143] -> TW7 bit[31..16]: egress::hdr.tcp.checksum + # - bit[144..159] -> TW7 bit[15..0]: egress::hdr.tcp.urgent_ptr + B18: 4 # value 1 -> B18 bit[2]: 
egress::hdr.tcp.$valid + shift: 20 + buf_req: 20 + next: parse_app + 0x01bb: + counter: dec 20 + 0..1: TH8 # egress::hdr.tcp.src_port + 2..3: TH7 # egress::hdr.tcp.dst_port + 4..7: TW17 # egress::hdr.tcp.seq_no + 8..11: TW16 # egress::hdr.tcp.ack_no + 12: TB5 + # - bit[96..99] -> TB5 bit[7..4]: egress::hdr.tcp.data_offset + # - bit[100..103] -> TB5 bit[3..0]: egress::hdr.tcp.res + 13: TB6 # egress::hdr.tcp.flags + 14..15: TH6 # egress::hdr.tcp.window + 16..19: TW7 + # - bit[128..143] -> TW7 bit[31..16]: egress::hdr.tcp.checksum + # - bit[144..159] -> TW7 bit[15..0]: egress::hdr.tcp.urgent_ptr + B18: 4 # value 1 -> B18 bit[2]: egress::hdr.tcp.$valid + shift: 20 + buf_req: 20 + next: parse_app + 0x15b3: + counter: dec 20 + 0..1: TH8 # egress::hdr.tcp.src_port + 2..3: TH7 # egress::hdr.tcp.dst_port + 4..7: TW17 # egress::hdr.tcp.seq_no + 8..11: TW16 # egress::hdr.tcp.ack_no + 12: TB5 + # - bit[96..99] -> TB5 bit[7..4]: egress::hdr.tcp.data_offset + # - bit[100..103] -> TB5 bit[3..0]: egress::hdr.tcp.res + 13: TB6 # egress::hdr.tcp.flags + 14..15: TH6 # egress::hdr.tcp.window + 16..19: TW7 + # - bit[128..143] -> TW7 bit[31..16]: egress::hdr.tcp.checksum + # - bit[144..159] -> TW7 bit[15..0]: egress::hdr.tcp.urgent_ptr + B18: 4 # value 1 -> B18 bit[2]: egress::hdr.tcp.$valid + shift: 20 + buf_req: 20 + next: parse_recirculation + 0x****: + counter: dec 20 + 0..1: TH8 # egress::hdr.tcp.src_port + 2..3: TH7 # egress::hdr.tcp.dst_port + 4..7: TW17 # egress::hdr.tcp.seq_no + 8..11: TW16 # egress::hdr.tcp.ack_no + 12: TB5 + # - bit[96..99] -> TB5 bit[7..4]: egress::hdr.tcp.data_offset + # - bit[100..103] -> TB5 bit[3..0]: egress::hdr.tcp.res + 13: TB6 # egress::hdr.tcp.flags + 14..15: TH6 # egress::hdr.tcp.window + 16..19: TW7 + # - bit[128..143] -> TW7 bit[31..16]: egress::hdr.tcp.checksum + # - bit[144..159] -> TW7 bit[15..0]: egress::hdr.tcp.urgent_ptr + B18: 4 # value 1 -> B18 bit[2]: egress::hdr.tcp.$valid + shift: 20 + buf_req: 20 + next: end + parse_app: + *: + 
counter: dec 1 + 0: TB4 # egress::hdr.app.byte + B18: 8 # value 1 -> B18 bit[3]: egress::hdr.app.$valid + shift: 1 + buf_req: 1 + next: end + parse_recirculation: + *: + counter: dec 3 + 0: B17 # egress::hdr.recir.packet_state + 1..2: TH33 # egress::hdr.recir.pattern_state_machine_state + B18: 16 # value 1 -> B18 bit[4]: egress::hdr.recir.$valid + shift: 3 + buf_req: 3 + next: parse_app + parse_udp: + match: [ half ] + 0x0035: + counter: dec 8 + 0..1: TH7 # egress::hdr.udp.src_port + 2..3: TH6 # egress::hdr.udp.dst_port + 4..7: TW7 + # - bit[32..47] -> TW7 bit[31..16]: egress::hdr.udp.hdr_length + # - bit[48..63] -> TW7 bit[15..0]: egress::hdr.udp.checksum + B18: 32 # value 1 -> B18 bit[5]: egress::hdr.udp.$valid + shift: 8 + buf_req: 8 + next: parse_app + 0x15b3: + counter: dec 8 + 0..1: TH7 # egress::hdr.udp.src_port + 2..3: TH6 # egress::hdr.udp.dst_port + 4..7: TW7 + # - bit[32..47] -> TW7 bit[31..16]: egress::hdr.udp.hdr_length + # - bit[48..63] -> TW7 bit[15..0]: egress::hdr.udp.checksum + B18: 32 # value 1 -> B18 bit[5]: egress::hdr.udp.$valid + shift: 8 + buf_req: 8 + next: parse_recirculation + 0x****: + counter: dec 8 + 0..1: TH7 # egress::hdr.udp.src_port + 2..3: TH6 # egress::hdr.udp.dst_port + 4..7: TW7 + # - bit[32..47] -> TW7 bit[31..16]: egress::hdr.udp.hdr_length + # - bit[48..63] -> TW7 bit[15..0]: egress::hdr.udp.checksum + B18: 32 # value 1 -> B18 bit[5]: egress::hdr.udp.$valid + shift: 8 + buf_req: 8 + next: end + min_parse_depth_accept_initial: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB4 # egress::hdr.prsr_pad_0[0].blob[87:80].80-87 + 1..2: TH28 # egress::hdr.prsr_pad_0[0].blob[79:64].64-79 + 3..4: TH7 # egress::hdr.prsr_pad_0[0].blob[63:48].48-63 + 5..6: TH6 # egress::hdr.prsr_pad_0[0].blob[47:32].32-47 + 7..10: TW7 # egress::hdr.prsr_pad_0[0].blob[31:0].0-31 + B16: 4 # value 4 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$split_0 + 0b**: + buf_req: 0 + 
next: end + min_parse_depth_accept_loop.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB5 # egress::hdr.prsr_pad_0[1].blob[87:80].80-87 + 1..2: TH29 # egress::hdr.prsr_pad_0[1].blob[79:64].64-79 + 3..4: TH11 # egress::hdr.prsr_pad_0[1].blob[63:48].48-63 + 5..6: TH10 # egress::hdr.prsr_pad_0[1].blob[47:32].32-47 + 7..8: TH9 # egress::hdr.prsr_pad_0[1].blob[31:16].16-31 + B16: 2 # value 2 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 9 + buf_req: 9 + next: min_parse_depth_accept_loop.$it1.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$it1.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + 0..1: TH8 # egress::hdr.prsr_pad_0[1].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: min_parse_depth_accept_loop.$it2 + 0b**: + 0..1: TH8 # egress::hdr.prsr_pad_0[1].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: end + min_parse_depth_accept_loop.$it2: + *: + counter: dec 11 + 0: TB6 # egress::hdr.prsr_pad_0[2].blob[87:80].80-87 + 1: TB16 # egress::hdr.prsr_pad_0[2].blob[79:72].72-79 + 2: TB7 # egress::hdr.prsr_pad_0[2].blob[71:64].64-71 + 3..6: TW17 # egress::hdr.prsr_pad_0[2].blob[63:32].32-63 + 7..10: TW16 # egress::hdr.prsr_pad_0[2].blob[31:0].0-31 + B16: 1 # value 1 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$it2.$split_0 + min_parse_depth_accept_loop.$it2.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + buf_req: 0 + next: end + 0b**: + buf_req: 0 + next: end + parse_ipv6: + *: + counter: dec 40 + 0..3: TW5 + # - bit[0..3] -> TW5 bit[31..28]: egress::hdr.ipv6.version + # - bit[4..11] -> TW5 bit[27..20]: egress::hdr.ipv6.traffic_class + # - bit[12..31] -> TW5 bit[19..0]: egress::hdr.ipv6.flow_label + 4..7: TW4 + # - bit[32..47] -> TW4 bit[31..16]: egress::hdr.ipv6.payload_len + # - bit[48..55] -> TW4 bit[15..8]: egress::hdr.ipv6.next_hdr + # - bit[56..63] -> TW4 bit[7..0]: egress::hdr.ipv6.hop_limit + 8..11: TW21 # 
egress::hdr.ipv6.src_addr[127:96].96-127 + 12..15: TW20 # egress::hdr.ipv6.src_addr[95:64].64-95 + 16: TB16 # egress::hdr.ipv6.src_addr[63:56].56-63 + 17: TB7 # egress::hdr.ipv6.src_addr[55:48].48-55 + 18..19: TH29 # egress::hdr.ipv6.src_addr[47:32].32-47 + 20..21: TH28 # egress::hdr.ipv6.src_addr[31:16].16-31 + 22..23: TH27 # egress::hdr.ipv6.src_addr[15:0].0-15 + 24..25: TH26 # egress::hdr.ipv6.dst_addr[127:112].112-127 + B18: 64 # value 1 -> B18 bit[6]: egress::hdr.ipv6.$valid + load: { byte1 : 6 } + shift: 26 + buf_req: 26 + next: parse_ipv6.$split_0 + parse_ipv6.$split_0: + *: + 0..1: TH25 # egress::hdr.ipv6.dst_addr[111:96].96-111 + 2..3: TH24 # egress::hdr.ipv6.dst_addr[95:80].80-95 + 4..5: TH11 # egress::hdr.ipv6.dst_addr[79:64].64-79 + 6..7: TH10 # egress::hdr.ipv6.dst_addr[63:48].48-63 + 10..13: TW6 # egress::hdr.ipv6.dst_addr[31:0].0-31 + shift: 8 + buf_req: 14 + next: parse_ipv6.$split_1 + parse_ipv6.$split_1: + match: [ byte1 ] + 0x06: + 0..1: TH9 # egress::hdr.ipv6.dst_addr[47:32].32-47 + load: { half : 8..9 } + shift: 6 + buf_req: 10 + next: parse_tcp + 0x11: + 0..1: TH9 # egress::hdr.ipv6.dst_addr[47:32].32-47 + load: { half : 6..7 } + shift: 6 + buf_req: 8 + next: parse_udp + 0x**: + 0..1: TH9 # egress::hdr.ipv6.dst_addr[47:32].32-47 + shift: 6 + buf_req: 6 + next: end + parse_normal_tagging_state: + *: + B19: 1 # value 1 -> B19 bit[7..0]: egress::eg_md.packet_state + load: { half : 12..13 } + buf_req: 14 + next: parse_ethernet +)PARSER_CFG"; + + options.target = NO_TARGET; + Phv::test_clear(); + + createSingleAsmParser(); + AsmParser *asm_parser = dynamic_cast(::asm_parser); + asm_parse_string(parser_str); + std::vector parser_vector = asm_parser->test_get_parser(EGRESS); + EXPECT_GT(parser_vector.size(), 0); + Parser *parser = parser_vector.back(); + parser->process(); + EXPECT_EQ(parser->get_prsr_max_dph(), 4); +} + +// TEST(parser_test, get_parser_deepest_depth_loop_no_stack) +// +// verify that parser with loops that do not store into +// 
header stacks are supported and that the parser max +// depth is set to the maximum supported by the target. +// +TEST(parser_test, get_parser_depth_loop_no_stack) { + const char *parser_str = R"PARSER_CFG( +version: + target: Tofino +parser egress: + start: $entry_point.start + init_zero: [ B17, B16 ] + bitwise_or: [ B16, B17 ] + hdr_len_adj: 27 + meta_opt: 8191 + states: + $entry_point.start: + *: + counter: + imm: 65 + 0..1: H16 # bit[7..15] -> H16 bit[8..0]: egress::eg_intr_md.egress_port + 27..28: TH14 # egress::hdr.ether.dstAddr[47:32].32-47 + B17: 1 # value 1 -> B17 bit[0]: egress::hdr.ether.$valid + intr_md: 9 + shift: 29 + buf_req: 29 + next: $entry_point.start.$split_0 + $entry_point.start.$split_0: + *: + counter: dec 27 + 0..3: TW5 # egress::hdr.ether.dstAddr[31:0].0-31 + 4..5: TH13 # egress::hdr.ether.srcAddr[47:32].32-47 + 6..9: TW4 # egress::hdr.ether.srcAddr[31:0].0-31 + 10..11: TH12 # egress::hdr.ether.etherType + load: { half : 10..11 } + shift: 12 + buf_req: 12 + next: $entry_point.start.$split_1 + $entry_point.start.$split_1: + *: + counter: dec 14 + buf_req: 0 + next: L3_start_0 + L3_start_0: + match: [ half ] + 0x0800: + counter: dec 1 + 0: TB4 # egress::hdr.h.a + B17: 2 # value 1 -> B17 bit[1]: egress::hdr.h.$valid + shift: 1 + buf_req: 1 + next: min_parse_depth_accept_initial + 0x8100: + counter: dec 2 + 0: TB9 # egress::hdr.i.etherType[15:8].8-15 + 1: TB8 # egress::hdr.i.etherType[7:0].0-7 + B17: 4 # value 1 -> B17 bit[2]: egress::hdr.i.$valid + shift: 2 + buf_req: 2 + next: L3_start_0 + 0x****: + buf_req: 0 + next: min_parse_depth_accept_initial + min_parse_depth_accept_initial: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB5 # egress::hdr.prsr_pad_0[0].blob[87:80].80-87 + 1..2: TH15 # egress::hdr.prsr_pad_0[0].blob[79:64].64-79 + 3..6: TW7 # egress::hdr.prsr_pad_0[0].blob[63:32].32-63 + 7..10: TW6 # egress::hdr.prsr_pad_0[0].blob[31:0].0-31 + B16: 4 # value 4 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 
+ buf_req: 11 + next: min_parse_depth_accept_loop.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB6 # egress::hdr.prsr_pad_0[1].blob[87:80].80-87 + 1..2: TH16 # egress::hdr.prsr_pad_0[1].blob[79:64].64-79 + 3..4: TH9 # egress::hdr.prsr_pad_0[1].blob[63:48].48-63 + 5..6: TH8 # egress::hdr.prsr_pad_0[1].blob[47:32].32-47 + 7..8: TH7 # egress::hdr.prsr_pad_0[1].blob[31:16].16-31 + B16: 2 # value 2 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 9 + buf_req: 9 + next: min_parse_depth_accept_loop.$it1.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$it1.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + 0..1: TH6 # egress::hdr.prsr_pad_0[1].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: min_parse_depth_accept_loop.$it2 + 0b**: + 0..1: TH6 # egress::hdr.prsr_pad_0[1].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: end + min_parse_depth_accept_loop.$it2: + *: + counter: dec 11 + 0: TB7 # egress::hdr.prsr_pad_0[2].blob[87:80].80-87 + 1..2: TH17 # egress::hdr.prsr_pad_0[2].blob[79:64].64-79 + 3..6: TW8 # egress::hdr.prsr_pad_0[2].blob[63:32].32-63 + 7..8: TH11 # egress::hdr.prsr_pad_0[2].blob[31:16].16-31 + 9..10: TH10 # egress::hdr.prsr_pad_0[2].blob[15:0].0-15 + B16: 1 # value 1 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$it2.$split_0 + min_parse_depth_accept_loop.$it2.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + buf_req: 0 + next: end + 0b**: + buf_req: 0 + next: end +)PARSER_CFG"; + + options.target = NO_TARGET; + Phv::test_clear(); + + createSingleAsmParser(); + AsmParser *asm_parser = dynamic_cast(::asm_parser); + asm_parse_string(parser_str); + std::vector parser_vector = asm_parser->test_get_parser(EGRESS); + EXPECT_GT(parser_vector.size(), 0); + Parser *parser = parser_vector.back(); + parser->process(); + EXPECT_EQ(parser->get_prsr_max_dph(), 0x3ff - 1); +} + 
+// TEST(parser_test, get_parser_depth_loop_with_stack) +// +// verify that when a parser has loops that store into header +// stacks, that the max parser depth is set according to the +// number of entries in the stack. +// +TEST(parser_test, get_parser_depth_loop_with_stack) { + const char *parser_str = R"PARSER_CFG( +version: + target: Tofino +phv egress: + eg_intr_md.egress_port: H17(0..8) + hdr.vlan$0.pcp: TW0(29..31) + hdr.vlan$0.dei: TW0(28) + hdr.vlan$0.vid: TW0(16..27) + hdr.vlan$0.ether_type: TW0(0..15) + hdr.vlan$1.pcp: TW1(29..31) + hdr.vlan$1.dei: TW1(28) + hdr.vlan$1.vid: TW1(16..27) + hdr.vlan$1.ether_type: TW1(0..15) + hdr.vlan$2.pcp: TW2(29..31) + hdr.vlan$2.dei: TW2(28) + hdr.vlan$2.vid: TW2(16..27) + hdr.vlan$2.ether_type: TW2(0..15) + hdr.vlan$3.pcp: TW3(29..31) + hdr.vlan$3.dei: TW3(28) + hdr.vlan$3.vid: TW3(16..27) + hdr.vlan$3.ether_type: TW3(0..15) + hdr.vlan$4.pcp: TH1(13..15) + hdr.vlan$4.dei: TH1(12) + hdr.vlan$4.vid: TH1(0..11) + hdr.vlan$4.ether_type: TH0 + hdr.vlan$5.pcp: TH3(13..15) + hdr.vlan$5.dei: TH3(12) + hdr.vlan$5.vid: TH3(0..11) + hdr.vlan$5.ether_type: TH2 + hdr.vlan$6.pcp: TH5(13..15) + hdr.vlan$6.dei: TH5(12) + hdr.vlan$6.vid: TH5(0..11) + hdr.vlan$6.ether_type: TH4 + hdr.vlan$7.pcp: TW12(29..31) + hdr.vlan$7.dei: TW12(28) + hdr.vlan$7.vid: TW12(16..27) + hdr.vlan$7.ether_type: TW12(0..15) + hdr.vlan$8.pcp: TW13(29..31) + hdr.vlan$8.dei: TW13(28) + hdr.vlan$8.vid: TW13(16..27) + hdr.vlan$8.ether_type: TW13(0..15) + hdr.vlan$9.pcp: TW14(29..31) + hdr.vlan$9.dei: TW14(28) + hdr.vlan$9.vid: TW14(16..27) + hdr.vlan$9.ether_type: TW14(0..15) + hdr.vlan$10.pcp: TW15(29..31) + hdr.vlan$10.dei: TW15(28) + hdr.vlan$10.vid: TW15(16..27) + hdr.vlan$10.ether_type: TW15(0..15) + hdr.vlan$11.pcp: TH19(13..15) + hdr.vlan$11.dei: TH19(12) + hdr.vlan$11.vid: TH19(0..11) + hdr.vlan$11.ether_type: TH18 + hdr.vlan$12.pcp: TH21(13..15) + hdr.vlan$12.dei: TH21(12) + hdr.vlan$12.vid: TH21(0..11) + hdr.vlan$12.ether_type: TH20 + hdr.vlan$13.pcp: 
TH23(13..15) + hdr.vlan$13.dei: TH23(12) + hdr.vlan$13.vid: TH23(0..11) + hdr.vlan$13.ether_type: TH22 + hdr.vlan$14.pcp: TB13(5..7) + hdr.vlan$14.dei: TB13(4) + hdr.vlan$14.vid.0-7: TB14 + hdr.vlan$14.vid.8-11: TB13(0..3) + hdr.vlan$14.ether_type.0-7: TB3 + hdr.vlan$14.ether_type.8-15: TB12 + hdr.prsr_pad_0$0.blob.0-31: TW20 + hdr.prsr_pad_0$0.blob.32-63: TW21 + hdr.prsr_pad_0$0.blob.64-79: TH36 + hdr.prsr_pad_0$0.blob.80-87: TB0 + hdr.prsr_pad_0$1.blob.0-31: TW22 + hdr.prsr_pad_0$1.blob.32-63: TW23 + hdr.prsr_pad_0$1.blob.64-79: TH37 + hdr.prsr_pad_0$1.blob.80-87: TB1 + hdr.prsr_pad_0$2.blob.0-15: TH30 + hdr.prsr_pad_0$2.blob.16-31: TH31 + hdr.prsr_pad_0$2.blob.32-47: TH32 + hdr.prsr_pad_0$2.blob.48-63: TH33 + hdr.prsr_pad_0$2.blob.64-79: TH38 + hdr.prsr_pad_0$2.blob.80-87: TB2 + hdr.eth.dst_addr.0-7: TB15 + hdr.eth.dst_addr.8-15: TB20 + hdr.eth.dst_addr.16-23: TB21 + hdr.eth.dst_addr.24-31: TB22 + hdr.eth.dst_addr.32-47: TH41 + hdr.eth.src_addr.0-15: TH34 + hdr.eth.src_addr.16-31: TH35 + hdr.eth.src_addr.32-47: TH40 + hdr.eth.ethertype: TH39 + hdr.eth.$valid: B17(0) + hdr.vlan.$stkvalid: H16(0..14) + hdr.vlan$0.$valid: H16(14) + hdr.vlan$1.$valid: H16(13) + hdr.vlan$2.$valid: H16(12) + hdr.vlan$3.$valid: H16(11) + hdr.vlan$4.$valid: H16(10) + hdr.vlan$5.$valid: H16(9) + hdr.vlan$6.$valid: H16(8) + hdr.vlan$7.$valid: H16(7) + hdr.vlan$8.$valid: H16(6) + hdr.vlan$9.$valid: H16(5) + hdr.vlan$10.$valid: H16(4) + hdr.vlan$11.$valid: H16(3) + hdr.vlan$12.$valid: H16(2) + hdr.vlan$13.$valid: H16(1) + hdr.vlan$14.$valid: H16(0) + hdr.prsr_pad_0.$stkvalid: B16(0..2) + hdr.prsr_pad_0$0.$valid: B16(2) + hdr.prsr_pad_0$1.$valid: B16(1) + hdr.prsr_pad_0$2.$valid: B16(0) + context_json: + B16: + - { name : hdr.prsr_pad_0$0.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.prsr_pad_0.$stkvalid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.prsr_pad_0$1.$valid, live_start : parser, 
live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.prsr_pad_0$2.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + B17: + - { name : hdr.eth.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + H16: + - { name : hdr.vlan$0.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan.$stkvalid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$1.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$2.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$3.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$4.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$5.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$6.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$7.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$8.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$9.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$10.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$11.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$12.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$13.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + - { name : hdr.vlan$14.$valid, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } + H17: + - { name : 
eg_intr_md.egress_port, live_start : parser, live_end : deparser, mutually_exclusive_with: [ ] } +parser egress: + start: $entry_point + init_zero: [ B17, H16, B16 ] + bitwise_or: [ TH39, B16, H16 ] + hdr_len_adj: 27 + meta_opt: 8191 + states: + $entry_point: + *: + counter: + imm: 24 + 0..1: H17 # bit[7..15] -> H17 bit[8..0]: egress::eg_intr_md.egress_port + 27..28: TH41 # egress::hdr.eth.dst_addr[47:32].32-47 + 29: TB22 # egress::hdr.eth.dst_addr[31:24].24-31 + 30: TB21 # egress::hdr.eth.dst_addr[23:16].16-23 + 31: TB20 # egress::hdr.eth.dst_addr[15:8].8-15 + B17: 1 # value 1 -> B17 bit[0]: egress::hdr.eth.$valid + intr_md: 9 + shift: 32 + buf_req: 32 + next: start.$oob_stall_0 + start.$oob_stall_0: + *: + load: { half : 7..8 } + buf_req: 9 + next: start.$split_0 + start.$split_0: + match: [ half ] + 0x8100: + 0: TB15 # egress::hdr.eth.dst_addr[7:0].0-7 + 1..2: TH40 # egress::hdr.eth.src_addr[47:32].32-47 + 3..4: TH35 # egress::hdr.eth.src_addr[31:16].16-31 + 5..6: TH34 # egress::hdr.eth.src_addr[15:0].0-15 + 7..8: TH39 # egress::hdr.eth.ethertype + load: { half : 11..12 } + shift: 9 + buf_req: 13 + next: CommonParser_parse_vlan_0 + 0x****: + 0: TB15 # egress::hdr.eth.dst_addr[7:0].0-7 + 1..2: TH40 # egress::hdr.eth.src_addr[47:32].32-47 + 3..4: TH35 # egress::hdr.eth.src_addr[31:16].16-31 + 5..6: TH34 # egress::hdr.eth.src_addr[15:0].0-15 + 7..8: TH39 # egress::hdr.eth.ethertype + shift: 9 + buf_req: 9 + next: min_parse_depth_accept_initial + CommonParser_parse_vlan_0: + match: [ half ] + 0x8100: + counter: dec 4 + 0..3: TW0 + # - bit[0..2] -> TW0 bit[31..29]: egress::hdr.vlan[0].pcp + # - bit[3] -> TW0 bit[28]: egress::hdr.vlan[0].dei + # - bit[4..15] -> TW0 bit[27..16]: egress::hdr.vlan[0].vid + # - bit[16..31] -> TW0 bit[15..0]: egress::hdr.vlan[0].ether_type + H16: 16384 # value 16384 -> H16 bit[14..0]: egress::hdr.vlan.$stkvalid + TH39: 2 # value 2 -> TH39 bit[15..0]: egress::hdr.eth.ethertype + load: { half : 2..3 } + shift: 4 + buf_req: 4 + offset_inc: 1 
+ next: CommonParser_parse_vlan_0 + 0x****: + counter: dec 4 + 0..3: TW0 + # - bit[0..2] -> TW0 bit[31..29]: egress::hdr.vlan[0].pcp + # - bit[3] -> TW0 bit[28]: egress::hdr.vlan[0].dei + # - bit[4..15] -> TW0 bit[27..16]: egress::hdr.vlan[0].vid + # - bit[16..31] -> TW0 bit[15..0]: egress::hdr.vlan[0].ether_type + H16: 16384 # value 16384 -> H16 bit[14..0]: egress::hdr.vlan.$stkvalid + TH39: 2 # value 2 -> TH39 bit[15..0]: egress::hdr.eth.ethertype + shift: 4 + buf_req: 4 + offset_inc: 1 + next: min_parse_depth_accept_initial + min_parse_depth_accept_initial: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB0 # egress::hdr.prsr_pad_0[0].blob[87:80].80-87 + 1..2: TH36 # egress::hdr.prsr_pad_0[0].blob[79:64].64-79 + 3..6: TW21 # egress::hdr.prsr_pad_0[0].blob[63:32].32-63 + 7..10: TW20 # egress::hdr.prsr_pad_0[0].blob[31:0].0-31 + B16: 4 # value 4 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB1 # egress::hdr.prsr_pad_0[1].blob[87:80].80-87 + 1..2: TH37 # egress::hdr.prsr_pad_0[1].blob[79:64].64-79 + 3..6: TW23 # egress::hdr.prsr_pad_0[1].blob[63:32].32-63 + 7..10: TW22 # egress::hdr.prsr_pad_0[1].blob[31:0].0-31 + B16: 2 # value 2 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$it1.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$it1.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB2 # egress::hdr.prsr_pad_0[2].blob[87:80].80-87 + 1..2: TH38 # egress::hdr.prsr_pad_0[2].blob[79:64].64-79 + 3..4: TH33 # egress::hdr.prsr_pad_0[2].blob[63:48].48-63 + 5..6: TH32 # egress::hdr.prsr_pad_0[2].blob[47:32].32-47 + 7..8: TH31 # egress::hdr.prsr_pad_0[2].blob[31:16].16-31 + B16: 1 # value 1 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + 
shift: 9 + buf_req: 9 + next: min_parse_depth_accept_loop.$it2.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$it2.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + 0..1: TH30 # egress::hdr.prsr_pad_0[2].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: end + 0b**: + 0..1: TH30 # egress::hdr.prsr_pad_0[2].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: end +)PARSER_CFG"; + + options.target = NO_TARGET; + Phv::test_clear(); + + createSingleAsmParser(); + AsmParser *asm_parser = dynamic_cast(::asm_parser); + asm_parse_string(parser_str); + std::vector parser_vector = asm_parser->test_get_parser(EGRESS); + EXPECT_GT(parser_vector.size(), 0); + Parser *parser = parser_vector.back(); + parser->process(); + EXPECT_EQ(parser->get_prsr_max_dph(), 6); +} + +// TEST(parser_test, get_parser_depth_untaken_path) +// +// verify that untaken paths are not considered +// in the parser depth calculation. +// +TEST(parser_test, get_parser_depth_untaken_path) { + const char *parser_str = R"PARSER_CFG( +version: + target: Tofino +parser egress: + start: $entry_point.start + init_zero: [ B17, B16 ] + bitwise_or: [ TH15, B16, B17 ] + hdr_len_adj: 27 + meta_opt: 8191 + states: + $entry_point.start: + *: + counter: + imm: 38 + 0..1: H16 # bit[7..15] -> H16 bit[8..0]: egress::eg_intr_md.egress_port + intr_md: 9 + shift: 27 + buf_req: 27 + next: $entry_point.start.$oob_stall_0 + $entry_point.start.$oob_stall_0: + *: + load: { half : 12..13 } + buf_req: 14 + next: CommonParser_start_0 + CommonParser_start_0: + match: [ half ] + 0x****: + counter: dec 14 + 0..1: TH17 # egress::hdr.eth.dst_addr[47:32].32-47 + 2..5: TW9 # egress::hdr.eth.dst_addr[31:0].0-31 + 6..7: TH16 # egress::hdr.eth.src_addr[47:32].32-47 + 8..11: TW8 # egress::hdr.eth.src_addr[31:0].0-31 + 12..13: TH15 # egress::hdr.eth.ethertype + B17: 1 # value 1 -> B17 bit[0]: egress::hdr.eth.$valid + shift: 14 + buf_req: 14 + next: min_parse_depth_accept_initial + 0x8100: + counter: dec 14 + 0..1: TH17 # 
egress::hdr.eth.dst_addr[47:32].32-47 + 2..5: TW9 # egress::hdr.eth.dst_addr[31:0].0-31 + 6..7: TH16 # egress::hdr.eth.src_addr[47:32].32-47 + 8..11: TW8 # egress::hdr.eth.src_addr[31:0].0-31 + 12..13: TH15 # egress::hdr.eth.ethertype + B17: 1 # value 1 -> B17 bit[0]: egress::hdr.eth.$valid + load: { half : 16..17 } + shift: 14 + buf_req: 18 + next: CommonParser_parse_vlan_0 + min_parse_depth_accept_initial: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB0 # egress::hdr.prsr_pad_0[0].blob[87:80].80-87 + 1..2: TH12 # egress::hdr.prsr_pad_0[0].blob[79:64].64-79 + 3..6: TW2 # egress::hdr.prsr_pad_0[0].blob[63:32].32-63 + 7..10: TW1 # egress::hdr.prsr_pad_0[0].blob[31:0].0-31 + B16: 4 # value 4 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB1 # egress::hdr.prsr_pad_0[1].blob[87:80].80-87 + 1..2: TH13 # egress::hdr.prsr_pad_0[1].blob[79:64].64-79 + 3..4: TH1 # egress::hdr.prsr_pad_0[1].blob[63:48].48-63 + 5..6: TH0 # egress::hdr.prsr_pad_0[1].blob[47:32].32-47 + 7..10: TW3 # egress::hdr.prsr_pad_0[1].blob[31:0].0-31 + B16: 2 # value 2 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 11 + buf_req: 11 + next: min_parse_depth_accept_loop.$it1.$split_0 + 0b**: + buf_req: 0 + next: end + min_parse_depth_accept_loop.$it1.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + counter: dec 11 + 0: TB2 # egress::hdr.prsr_pad_0[2].blob[87:80].80-87 + 1..2: TH14 # egress::hdr.prsr_pad_0[2].blob[79:64].64-79 + 3..4: TH5 # egress::hdr.prsr_pad_0[2].blob[63:48].48-63 + 5..6: TH4 # egress::hdr.prsr_pad_0[2].blob[47:32].32-47 + 7..8: TH3 # egress::hdr.prsr_pad_0[2].blob[31:16].16-31 + B16: 1 # value 1 -> B16 bit[2..0]: egress::hdr.prsr_pad_0.$stkvalid + shift: 9 + buf_req: 9 + next: min_parse_depth_accept_loop.$it2.$split_0 + 0b**: + buf_req: 0 + next: end 
+ min_parse_depth_accept_loop.$it2.$split_0: + match: [ ctr_neg, ctr_zero ] + 0x0: + 0..1: TH2 # egress::hdr.prsr_pad_0[2].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: end + 0b**: + 0..1: TH2 # egress::hdr.prsr_pad_0[2].blob[15:0].0-15 + shift: 2 + buf_req: 2 + next: end + CommonParser_parse_vlan_0: + match: [ half ] + 0x8100: + counter: dec 4 + 0..3: TW0 + # - bit[0..2] -> TW0 bit[31..29]: egress::hdr.vlan.pcp + # - bit[3] -> TW0 bit[28]: egress::hdr.vlan.dei + # - bit[4..15] -> TW0 bit[27..16]: egress::hdr.vlan.vid + # - bit[16..31] -> TW0 bit[15..0]: egress::hdr.vlan.ether_type + B17: 2 # value 1 -> B17 bit[1]: egress::hdr.vlan.$valid + TH15: 2 # value 2 -> TH15 bit[15..0]: egress::hdr.eth.ethertype + load: { half : 16..17 } + shift: 4 + buf_req: 18 + next: CommonParser_start_0 + 0x****: + counter: dec 4 + 0..3: TW0 + # - bit[0..2] -> TW0 bit[31..29]: egress::hdr.vlan.pcp + # - bit[3] -> TW0 bit[28]: egress::hdr.vlan.dei + # - bit[4..15] -> TW0 bit[27..16]: egress::hdr.vlan.vid + # - bit[16..31] -> TW0 bit[15..0]: egress::hdr.vlan.ether_type + B17: 2 # value 1 -> B17 bit[1]: egress::hdr.vlan.$valid + TH15: 2 # value 2 -> TH15 bit[15..0]: egress::hdr.eth.ethertype + shift: 4 + buf_req: 4 + next: min_parse_depth_accept_initial +)PARSER_CFG"; + + options.target = NO_TARGET; + Phv::test_clear(); + + createSingleAsmParser(); + AsmParser *asm_parser = dynamic_cast(::asm_parser); + asm_parse_string(parser_str); + std::vector parser_vector = asm_parser->test_get_parser(EGRESS); + EXPECT_GT(parser_vector.size(), 0); + Parser *parser = parser_vector.back(); + parser->process(); + EXPECT_EQ(parser->get_prsr_max_dph(), 4); +} + +} // namespace diff --git a/backends/tofino/bf-asm/gtest/register-matcher.cpp b/backends/tofino/bf-asm/gtest/register-matcher.cpp new file mode 100644 index 00000000000..77b5abf64ce --- /dev/null +++ b/backends/tofino/bf-asm/gtest/register-matcher.cpp @@ -0,0 +1,175 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/gtest/register-matcher.h" + +#include +#include + +namespace BfAsm { + +namespace Test { + +RegisterMatcher::RegisterMatcher(const char *spec) : bitsize(0) { + enum State { + INIT, + IDENT, + WIDTH, + BIN_VALUE, + OCT_VALUE, + HEX_VALUE, + } state(INIT); + uint32_t width(0); + bitvec value; + uint8_t digit(0); + bool negate(false); + + while (true) { + switch (state) { + case INIT: + if (std::isdigit(*spec)) { + width = *spec - '0'; + state = WIDTH; + } else if (*spec == '~') { + negate = !negate; + } else if (std::isalpha(*spec) || *spec == '_') { + /* -- ignore identifiers in the spec */ + state = IDENT; + } + break; + + case IDENT: + if (*spec == '~') { + state = INIT; + negate = true; + } else if (!std::isalpha(*spec) && !std::isdigit(*spec) && *spec != '_') { + state = INIT; + negate = false; + } + break; + + case WIDTH: + if (std::isdigit(*spec)) { + width = 10 * width + *spec - '0'; + } else if (*spec == 'b') { + state = BIN_VALUE; + value = bitvec(); + } else if (*spec == 'x') { + state = HEX_VALUE; + value = bitvec(); + } else if (*spec == 'o') { + state = OCT_VALUE; + value = bitvec(); + } + break; + + case BIN_VALUE: + if (*spec == '0' || *spec == '1') { + digit = *spec - '0'; + if (negate) digit = ~digit; + value <<= 1; + value |= bitvec(digit & 0x01); + } else if (*spec == '|' || *spec == 0) { + pushBits(value, width); + state = INIT; + negate 
= false; + } + break; + + case HEX_VALUE: + if (std::isxdigit(*spec)) { + if (*spec >= '0' && *spec <= '9') { + digit = *spec - '0'; + } else if (*spec >= 'a' && *spec <= 'f') { + digit = *spec - 'a' + 10; + } else if (*spec >= 'A' && *spec <= 'F') { + digit = *spec - 'A' + 10; + } + if (negate) digit = ~digit; + value <<= 4; + value |= bitvec(digit & 0x0f); + } else if (*spec == '|' || *spec == 0) { + pushBits(value, width); + state = INIT; + negate = false; + } + break; + + case OCT_VALUE: + if (*spec >= '0' && *spec <= '7') { + digit = *spec - '0'; + if (negate) digit = ~digit; + value <<= 3; + value |= bitvec(digit & 0x07); + } else if (*spec == '|' || *spec == 0) { + pushBits(value, width); + state = INIT; + negate = false; + } + break; + } + + if (*spec == 0) break; + ++spec; + } +} + +void RegisterMatcher::pushBits(const bitvec &bits, uint32_t width) { + expected <<= width; + bitvec mask; + mask.setrange(0, width); + expected |= bits & mask; + bitsize += width; +} + +bool RegisterMatcher::checkRegister(std::ostream &os, const uint8_t reg[], uint32_t rsize) const { + const uint32_t bytesize((bitsize + 7) / 8); + if (rsize < bytesize) { + os << "checked register is shorter than the expected value"; + return false; + } + + uint32_t bitindex(0); + bool fail(false); + for (int i(0); i < rsize; ++i) { + const uint8_t byte(expected.getrange(bitindex, 8)); + fail = (byte != reg[i]) || fail; + bitindex += 8; + } + + if (fail) { + os << std::hex << std::setfill('0'); + os << " expected: "; + for (auto i(rsize); i > 0; --i) { + uint8_t byte(expected.getrange((i - 1) * 8, 8)); + os << ' ' << std::setw(2) << static_cast(byte); + bitindex += 8; + } + os << '\n'; + os << " actual: "; + for (auto i(rsize); i > 0; --i) { + os << ' ' << std::setw(2) << static_cast(reg[i - 1]); + } + os << '\n'; + } + + return !fail; +} + +} // namespace Test + +} // namespace BfAsm diff --git a/backends/tofino/bf-asm/gtest/register-matcher.h b/backends/tofino/bf-asm/gtest/register-matcher.h 
new file mode 100644 index 00000000000..aa47a8203fd --- /dev/null +++ b/backends/tofino/bf-asm/gtest/register-matcher.h @@ -0,0 +1,68 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_GTEST_REGISTER_MATCHER_H_ +#define BACKENDS_TOFINO_BF_ASM_GTEST_REGISTER_MATCHER_H_ + +#include + +#include +#include +#include + +#include "backends/tofino/bf-asm/ubits.h" +#include "lib/bitvec.h" + +namespace BfAsm { + +namespace Test { + +class RegisterMatcher { + private: + bitvec expected; + uint32_t bitsize; + + public: + explicit RegisterMatcher(const char *spec); + + bool checkRegister(std::ostream &os, const uint8_t reg[], uint32_t size) const; + + template + bool checkRegister(std::ostream &os, const ubits &bits) const { + static_assert(N > 0 && N <= 64); + const uint64_t value(bits); + return checkRegister(os, reinterpret_cast(&value), (N + 7) / 8); + } + + private: + void pushBits(const bitvec &bits, uint32_t width); +}; + +} // namespace Test + +} // namespace BfAsm + +#define EXPECT_REGISTER(reg, expected) \ + do { \ + RegisterMatcher matcher(expected); \ + std::ostringstream oss; \ + if (!matcher.checkRegister(oss, reg)) { \ + ADD_FAILURE() << "check of the register " << #reg << " has failed:\n" << oss.str(); \ + } \ + } while (false) + +#endif /* BACKENDS_TOFINO_BF_ASM_GTEST_REGISTER_MATCHER_H_ */ diff --git 
a/backends/tofino/bf-asm/hash_action.cpp b/backends/tofino/bf-asm/hash_action.cpp new file mode 100644 index 00000000000..2c99bbfd4f6 --- /dev/null +++ b/backends/tofino/bf-asm/hash_action.cpp @@ -0,0 +1,231 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "action_bus.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" +#include "misc.h" + +// target specific instantiatitions + +Table::Format::Field *HashActionTable::lookup_field(const std::string &n, + const std::string &act) const { + auto *rv = format ? 
format->field(n) : nullptr; + if (!rv && gateway) rv = gateway->lookup_field(n, act); + if (!rv && !act.empty()) { + if (auto call = get_action()) { + rv = call->lookup_field(n, act); + } + } + return rv; +} + +void HashActionTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::MatchEntry); + for (auto &kv : MapIterChecked(data, {"meter", "stats", "stateful"})) { + if (kv.key == "search_bus" || kv.key == "result_bus") { + // already dealt with in Table::setup_layout via common_init_setup + } else if (!common_setup(kv, data, P4Table::MatchEntry)) { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (!action.set() && !actions) + error(lineno, "Table %s has neither action table nor immediate actions", name()); + if (action.args.size() > 2) + error(lineno, "Unexpected number of action table arguments %zu", action.args.size()); + if (actions && !action_bus) action_bus = ActionBus::create(); +} + +void HashActionTable::pass1() { + LOG1("### Hash Action " << name() << " pass1 " << loc()); + MatchTable::pass1(); + for (auto &hd : hash_dist) { + if (hd.xbar_use == 0) hd.xbar_use |= HashDistribution::ACTION_DATA_ADDRESS; + hd.pass1(this, HashDistribution::OTHER, false); + } + if (!gateway && !hash_dist.empty()) + warning(hash_dist[0].lineno, "No gateway in hash_action means hash_dist can't be used"); +} + +void HashActionTable::pass2() { + LOG1("### Hash Action " << name() << " pass2 " << loc()); + if (logical_id < 0) choose_logical_id(); + if (Target::GATEWAY_NEEDS_SEARCH_BUS()) { // FIXME -- misnamed param? 
+ if (layout.size() != 1 || layout[0].bus.empty()) { + error(lineno, "Need explicit row/bus in hash_action table"); + } else if (layout[0].bus.size() > 1) { + error(lineno, "Can't have both bus and result_bus in hash_action table"); + } else { + BUG_CHECK(layout[0].bus.count(Layout::RESULT_BUS), "should have result bus (only)"); + } + } + allocate_physical_ids(); + determine_word_and_result_bus(); + for (auto &ixb : input_xbar) ixb->pass2(); + if (actions) actions->pass2(this); + if (action_bus) action_bus->pass2(this); + if (gateway) gateway->pass2(); + if (idletime) idletime->pass2(); + for (auto &hd : hash_dist) hd.pass2(this); +} + +/** + * Again by definition, the bus of the hash action table by definition is the result bus + */ +void HashActionTable::determine_word_and_result_bus() { + for (auto &row : layout) { + row.word = 0; + } +} + +void HashActionTable::pass3() { + LOG1("### Hash Action " << name() << " pass3 " << loc()); + MatchTable::pass3(); + if (action_bus) action_bus->pass3(this); +} + +template +void HashActionTable::write_merge_regs_vt(REGS ®s, int type, int bus) { + attached.write_merge_regs(regs, this, type, bus); +} + +template +void HashActionTable::write_regs_vt(REGS ®s) { + LOG1("### Hash Action " << name() << " write_regs " << loc()); + /* FIXME -- setup layout with no rams so other functions can write registers properly */ + int bus_type = layout[0].bus[Layout::RESULT_BUS] >> 1; + MatchTable::write_regs(regs, bus_type, this); + auto &merge = regs.rams.match.merge; + merge.exact_match_logical_result_en |= 1 << logical_id; + if (stage->tcam_delay(gress)) merge.exact_match_logical_result_delay |= 1 << logical_id; + if (actions) actions->write_regs(regs, this); + if (idletime) idletime->write_regs(regs); + if (gateway) gateway->write_regs(regs); + for (auto &hd : hash_dist) hd.write_regs(regs, this); + if (options.match_compiler && !enable_action_data_enable && + (!gateway || gateway->empty_match())) { + /* this seems unneeded? 
(won't actually be used...) */ + merge.next_table_format_data[logical_id].match_next_table_adr_default = + merge.next_table_format_data[logical_id].match_next_table_adr_miss_value.value; + } +} + +/** + * Unlike the hash functions for exact match tables, the hash action table does not require + * the Galois position. On the contrary, the hash action just requires an identity matrix + * of what the address that is to be generated, as they simply use this address as a baseline + * for generating the corresponding address. + * + * Thus, the hash function that is provided starts at bit 0, and is in reverse p4 param order. + * This is under the guarantee that the compiler will allocate the hash in reverse p4 param + * order as well. + * + * FIXME: Possibly this should be validated before this is the output, but currently the + * compiler will set up the hash in that order + */ +void HashActionTable::add_hash_functions(json::map &stage_tbl) const { + json::vector &hash_functions = stage_tbl["hash_functions"] = json::vector(); + + if (input_xbar.empty()) return; + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + auto &ht = input_xbar[0]->get_hash_tables(); + if (ht.size() == 0) return; + + int hash_bit_index = 0; + json::map hash_function; + json::vector &hash_bits = hash_function["hash_bits"] = json::vector(); + for (auto it = p4_params_list.rbegin(); it != p4_params_list.rend(); it++) { + auto &p4_param = *it; + for (size_t i = p4_param.start_bit; i < p4_param.start_bit + p4_param.bit_width; i++) { + // Check if the param bit is used in hash function before adding to + // json. E.g. 
The param can have a mask which will exclude some bits + // to not be a part of the hash function + if (!input_xbar[0]->is_p4_param_bit_in_hash(p4_param.name, i)) continue; + + json::map hash_bit; + hash_bit["hash_bit"] = hash_bit_index; + hash_bit["seed"] = 0; + json::vector &bits_to_xor = hash_bit["bits_to_xor"] = json::vector(); + json::map field; + std::string field_name, global_name; + field_name = p4_param.key_name.empty() ? p4_param.name : p4_param.key_name; + global_name = p4_param.name; + field["field_bit"] = i; + field["field_name"] = field_name; + field["global_name"] = global_name; + field["hash_match_group"] = 0; + field["hash_match_group_bit"] = 0; + bits_to_xor.push_back(std::move(field)); + hash_bits.push_back(std::move(hash_bit)); + + hash_bit_index++; + } + } + hash_function["hash_function_number"] = 0; + hash_functions.push_back(std::move(hash_function)); +} + +void HashActionTable::gen_tbl_cfg(json::vector &out) const { + // FIXME: Support multiple hash_dist's + int size = hash_dist.empty() ? 
1 : 1 + hash_dist[0].mask; + json::map &tbl = *base_tbl_cfg(out, "match_entry", size); + std::string_view stage_tbl_type = "match_with_no_key"; + size = 1; + if (p4_table && p4_table->p4_stage_table_type() == "gateway_with_entries") { + stage_tbl_type = "gateway_with_entries"; + size = p4_size(); + } else if (!p4_params_list.empty()) { + stage_tbl_type = "hash_action"; + size = p4_size(); + } + json::map &match_attributes = tbl["match_attributes"]; + json::vector &stage_tables = match_attributes["stage_tables"]; + json::map &stage_tbl = *add_stage_tbl_cfg(match_attributes, stage_tbl_type.data(), size); + stage_tbl["memory_resource_allocation"] = nullptr; + if (!match_attributes.count("match_type")) + match_attributes["match_type"] = stage_tbl_type.data(); + // This is a only a glass required field, as it is only required when no default action + // is specified, which is impossible for Brig through p4-16 + stage_tbl["default_next_table"] = Stage::end_of_pipe(); + add_pack_format(stage_tbl, 0, 0, hash_dist.empty() ? 
1 : 0); + add_result_physical_buses(stage_tbl); + if (actions) { + actions->gen_tbl_cfg(tbl["actions"]); + actions->add_action_format(this, stage_tbl); + } else if (action && action->actions) { + action->actions->gen_tbl_cfg(tbl["actions"]); + action->actions->add_action_format(this, stage_tbl); + } + common_tbl_cfg(tbl); + if (stage_tbl_type == "hash_action" && !p4_params_list.empty()) add_hash_functions(stage_tbl); + if (idletime) + idletime->gen_stage_tbl_cfg(stage_tbl); + else if (options.match_compiler) + stage_tbl["stage_idletime_table"] = nullptr; + add_all_reference_tables(tbl); + gen_idletime_tbl_cfg(stage_tbl); + merge_context_json(tbl, stage_tbl); +} + +DEFINE_TABLE_TYPE(HashActionTable) +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void HashActionTable::write_merge_regs, + (mau_regs & regs, int type, int bus), + { write_merge_regs_vt(regs, type, bus); }) diff --git a/backends/tofino/bf-asm/hash_dist.cpp b/backends/tofino/bf-asm/hash_dist.cpp new file mode 100644 index 00000000000..f447841020f --- /dev/null +++ b/backends/tofino/bf-asm/hash_dist.cpp @@ -0,0 +1,227 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "hash_dist.h"
+
+#include "backends/tofino/bf-asm/config.h"
+#include "backends/tofino/bf-asm/stage.h"
+#include "lib/range.h"
+
+// Decode one symbolic output name from the asm source into the matching
+// HashDistribution::xbar_use bit.
+static void set_output_bit(unsigned &xbar_use, value_t &v) {
+    if (CHECKTYPE(v, tSTR)) {
+        if (v == "immediate_lo" || v == "lo")
+            xbar_use |= HashDistribution::IMMEDIATE_LOW;
+        else if (v == "immediate_hi" || v == "hi")
+            xbar_use |= HashDistribution::IMMEDIATE_HIGH;
+        else if (v == "meter" || v == "meter_address")
+            xbar_use |= HashDistribution::METER_ADDRESS;
+        else if (v == "stats" || v == "stats_address")
+            xbar_use |= HashDistribution::STATISTICS_ADDRESS;
+        else if (v == "action" || v == "action_address")
+            xbar_use |= HashDistribution::ACTION_DATA_ADDRESS;
+        else if (v == "hashmod")
+            xbar_use |= HashDistribution::HASHMOD_DIVIDEND;
+        else
+            error(v.lineno, "Unrecognized hash_dist output %s", v.s);
+    }
+}
+
+// Render a set of xbar_use bits as a human-readable list for diagnostics.
+// Returns a pointer to a static buffer, so the result is not reentrant and is
+// invalidated by the next call.
+static const char *xbar_use_string(unsigned xbar_use) {
+    static char buffer[256];
+    static const char *bits[] = {"immed hi",   "immed lo",    "meter addr",
+                                 "stats addr", "action addr", "hashmod-div"};
+    char *p = buffer, *e = buffer + sizeof(buffer);
+    // unsigned loop index avoids the signed/unsigned comparison with sizeof
+    for (unsigned bit = 0; bit < sizeof(bits) / sizeof(bits[0]); ++bit) {
+        if (!(xbar_use & (1U << bit))) continue;
+        xbar_use &= ~(1U << bit);
+        if (p != buffer) p += snprintf(p, p < e ? e - p : 0, xbar_use ? ", " : " and ");
+        p += snprintf(p, p < e ? e - p : 0, "%s", bits[bit]);
+    }
+    if (xbar_use) {
+        // any leftover bits have no symbolic name -- print them raw
+        if (p != buffer) p += snprintf(p, p < e ? e - p : 0, " and ");
+        p += snprintf(p, p < e ? e - p : 0, "<0x%x>", xbar_use);
+    }
+    return buffer;
+}
+
+HashDistribution::HashDistribution(int id_, value_t &data, unsigned u)
+    : lineno(data.lineno), id(id_), xbar_use(u) {
+    if (id < 0 || id >= 6) error(data.lineno, "Invalid hash_dist unit id %d", id);
+    if (CHECKTYPE(data, tMAP)) {
+        for (auto &kv : MapIterChecked(data.map)) {
+            if (kv.key == "hash") {
+                if (CHECKTYPE(kv.value, tINT) && (unsigned)(hash_group = kv.value.i) >= 8U)
+                    error(kv.value.lineno, "Invalid hash group");
+            } else if (kv.key == "mask") {
+                if (CHECKTYPE(kv.value, tINT)) mask = kv.value.i;
+            } else if (kv.key == "shift") {
+                if (CHECKTYPE(kv.value, tINT)) shift = kv.value.i;
+            } else if (kv.key == "expand") {
+                if (CHECKTYPE(kv.value, tINT)) expand = kv.value.i;
+            } else if (kv.key == "output") {
+                if (kv.value.type == tVEC)
+                    for (auto &s : kv.value.vec) set_output_bit(xbar_use, s);
+                else
+                    set_output_bit(xbar_use, kv.value);
+            } else {
+                warning(kv.key.lineno, "ignoring unknown item %s in hash_dist", value_desc(kv.key));
+            }
+        }
+    }
+}
+
+// NOTE(review): the container element type was stripped by markup mangling
+// ("std::vector &out"); restored from the declaration in hash_dist.h.
+void HashDistribution::parse(std::vector<HashDistribution> &out, const value_t &data,
+                             unsigned xbar_use) {
+    if (CHECKTYPE(data, tMAP))
+        for (auto &kv : data.map)
+            if (CHECKTYPE(kv.key, tINT)) out.emplace_back(kv.key.i, kv.value, xbar_use);
+}
+
+// Two uses of the same hash_dist unit are compatible when all the per-unit
+// configuration they would program is identical (masks may differ only in the
+// directions allowed by meter pre-coloring).
+bool HashDistribution::compatible(HashDistribution *a) {
+    if (hash_group != a->hash_group) return false;
+    if (id != a->id) return false;
+    if (shift != a->shift) return false;
+    if (expand != a->expand) return false;
+    if (delay_type != a->delay_type) return false;
+    if (non_linear != a->non_linear) return false;
+    if (meter_pre_color && !a->meter_pre_color && (mask & ~a->mask)) return false;
+    if (!meter_pre_color && a->meter_pre_color && (~mask & a->mask)) return false;
+    return true;
+}
+
+void HashDistribution::pass1(Table *tbl, delay_type_t delay_type, bool non_linear) {
+    LOG1("Hash dist pass1");
+    this->tbl = tbl;
+    this->delay_type = delay_type;
+    this->non_linear = non_linear;
+    bool err = false;
+    for (auto *use : tbl->stage->hash_dist_use[id]) {
+        if (!compatible(use)) {
+            err = true;
+            error(lineno, "hash_dist unit %d in table %s not compatible with", id, tbl->name());
+            warning(use->lineno, "previous use in table %s", use->tbl->name());
+        }
+    }
+    if (expand >= 0) {
+        int min_shift = 7, diff = 7, other = id - 1;
+        switch (id % 3) {
+            case 0:
+                min_shift = 0;
+                diff = -7;
+                other = id + 1;
+                // fall through
+            case 1:
+                if (expand < min_shift || expand >= min_shift + 16) {
+                    error(lineno, "hash_dist unit %d expand can't pull from bit %d", id, expand);
+                    err = true;
+                }
+                break;
+            case 2:
+                error(lineno, "hash_dist unit %d cannot be expanded", id);
+                err = true;
+                break;
+            default:
+                error(lineno,
+                      "a mod 3 check should only hit these particular cases, of 0, 1, and 2");
+                BUG();
+        }
+        if (!err) {
+            for (auto *use : tbl->stage->hash_dist_use[other])
+                if (use->expand != -1 && use->expand != expand - diff) {
+                    error(lineno, "hash_dist unit %d in table %s expand not compatible with", id,
+                          tbl->name());
+                    warning(use->lineno, "previous use in table %s", use->tbl->name());
+                }
+        }
+    }
+    if (err) return;
+    tbl->stage->hash_dist_use[id].push_back(this);
+    for (int i = 0; i < 3; i++) {
+        if (id % 3 == i) continue;
+        int m = 3 * (id / 3) + i;
+        // FIXED(review): this loop previously iterated hash_dist_use[id],
+        // which only re-compares this unit against uses of its own slot (whose
+        // hash_group already matches after the compatibility check above), so
+        // the cross-unit check was dead.  Units in the same group of three
+        // share a single hash_group_sel subfield (see write_regs), so the
+        // uses of the sibling unit m must be checked instead.
+        for (auto *use : tbl->stage->hash_dist_use[m]) {
+            if (use->hash_group != hash_group) {
+                error(lineno, "hash_dist %d and %d use different hash groups", id, m);
+                warning(use->lineno, "previous use here");
+            }
+        }
+    }
+}
+
+void HashDistribution::pass2(Table *tbl) {
+    for (auto &hd : tbl->hash_dist) {
+        if (&hd == this) return;
+        if (id == hd.id) {
+            error(lineno, "multiple definitions for hash_dist %d in table %s", id, tbl->name());
+            error(hd.lineno, "previous definition");
+            break;
+        }
+        if (xbar_use & hd.xbar_use)
+            error(lineno, "conflicting output use between hash_dist %d and %d in table %s %s", id,
+                  hd.id, tbl->name(), xbar_use_string(xbar_use & hd.xbar_use));
+    }
+}
+
+// NOTE(review): "template" parameter list and "&regs" were destroyed by the
+// markup mangling ("template " / "REGS (R)s"); restored to match the
+// FOR_ALL_REGISTER_SETS instantiation that follows this definition.
+template <class REGS>
+void HashDistribution::write_regs(REGS &regs, Table *tbl) {
+    /* from HashDistributionResourceAllocation.write_config: */
+    auto &merge = regs.rams.match.merge;
+    if (non_linear) merge.mau_selector_hash_sps_enable |= 1 << id;
+    if (tbl->gress == EGRESS) merge.mau_hash_group_config.hash_group_egress |= 1 << id;
+    merge.mau_hash_group_config.hash_group_enable |= 1 << id;
+    merge.mau_hash_group_config.hash_group_sel.set_subfield(hash_group | 8U, 4 * (id / 3), 4);
+    merge.mau_hash_group_config.hash_group_ctl.set_subfield(delay_type, 2 * id, 2);
+    merge.mau_hash_group_shiftcount.set_subfield(shift, 3 * id, 3);
+    merge.mau_hash_group_mask[id] |= mask;
+    if (expand >= 0) {
+        switch (id % 3) {
+            case 0:
+                merge.mau_hash_group_expand[id / 3].hash_slice_group0_expand = 1;
+                merge.mau_hash_group_expand[id / 3].hash_slice_group2_expand = expand;
+                merge.mau_hash_group_config.hash_group_enable |= 1 << (id + 2);
+                merge.mau_hash_group_config.hash_group_ctl.set_subfield(delay_type, 2 * (id + 2),
+                                                                        2);
+                break;
+            case 1:
+                merge.mau_hash_group_expand[id / 3].hash_slice_group1_expand = 1;
+                merge.mau_hash_group_expand[id / 3].hash_slice_group2_expand = expand - 7;
+                merge.mau_hash_group_config.hash_group_enable |= 1 << (id + 1);
+                merge.mau_hash_group_config.hash_group_ctl.set_subfield(delay_type, 2 * (id + 1),
+                                                                        2);
+                break;
+            default:
+                BUG();
+        }
+    }
+    for (int oxbar : Range(0, 4))
+        if ((xbar_use >> oxbar) & 1)
+            merge.mau_hash_group_xbar_ctl[oxbar][tbl->logical_id / 8U].set_subfield(
+                8 | id, 4 * (tbl->logical_id % 8U), 4);
+    if (xbar_use & HASHMOD_DIVIDEND) {
+        int mgroup = tbl->get_selector()->meter_group();
+        merge.mau_hash_group_xbar_ctl[5][mgroup / 8U].set_subfield(8 | id, 4 * (mgroup % 8U), 4);
+    }
+    if (meter_pre_color) {
+        merge.mau_meter_precolor_hash_sel.set_subfield(8 | id, 4 * (id / 3), 4);
+        int ctl = 16 | meter_mask_index;
+        if (id >= 3) ctl |= 8;
+        merge.mau_meter_precolor_hash_map_to_logical_ctl[tbl->logical_id / 4U].set_subfield(
+            ctl, 5 * (tbl->logical_id % 4U), 5);
+    }
+}
+FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void HashDistribution::write_regs, mau_regs &, + Table *) diff --git a/backends/tofino/bf-asm/hash_dist.h b/backends/tofino/bf-asm/hash_dist.h new file mode 100644 index 00000000000..de8005d754c --- /dev/null +++ b/backends/tofino/bf-asm/hash_dist.h @@ -0,0 +1,61 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_HASH_DIST_H_ +#define BACKENDS_TOFINO_BF_ASM_HASH_DIST_H_ + +#include + +#include "asm-types.h" + +class Stage; +class Table; + +/* config for a hash distribution unit in match central. 
+ * FIXME -- need to abstract this away rather than have it be explicit + * FIXME -- in the asm code */ + +struct HashDistribution { + // FIXME -- need less 'raw' data for this */ + Table *tbl = 0; + int lineno = -1; + int hash_group = -1, id = -1; + int shift = 0, mask = 0, expand = -1; + bool meter_pre_color = false; + int meter_mask_index = 0; + enum { + IMMEDIATE_HIGH = 1 << 0, + IMMEDIATE_LOW = 1 << 1, + METER_ADDRESS = 1 << 2, + STATISTICS_ADDRESS = 1 << 3, + ACTION_DATA_ADDRESS = 1 << 4, + HASHMOD_DIVIDEND = 1 << 5 + }; + unsigned xbar_use = 0; + enum delay_type_t { SELECTOR = 0, OTHER = 1 }; + delay_type_t delay_type = SELECTOR; + bool non_linear = false; + HashDistribution(int id, value_t &data, unsigned u = 0); + static void parse(std::vector &out, const value_t &v, unsigned u = 0); + bool compatible(HashDistribution *a); + void pass1(Table *tbl, delay_type_t dt, bool nl); + void pass2(Table *tbl); + template + void write_regs(REGS ®s, Table *); +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_HASH_DIST_H_ */ diff --git a/backends/tofino/bf-asm/hashdump.cpp b/backends/tofino/bf-asm/hashdump.cpp new file mode 100644 index 00000000000..572e4e4e31e --- /dev/null +++ b/backends/tofino/bf-asm/hashdump.cpp @@ -0,0 +1,132 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "backends/tofino/bf-asm/json.h" +#include "gen/tofino/disas.regs.mau_addrmap.h" +#include "lib/hex.h" + +static Tofino::regs_mau_addrmap regs; + +static void dump_hashtables(std::ostream &out); + +int verbose = 0; +int get_file_log_level(const char *file, int *level) { return *level = verbose; } + +int main(int ac, char **av) { + for (int i = 1; i < ac; i++) { + if (av[i][0] == '-' || av[i][0] == '+') { + bool flag = av[i][0] == '+'; + for (char *arg = av[i] + 1; *arg;) switch (*arg++) { + case 'v': + verbose++; + break; + default: + std::cerr << "Unknown option " << (flag ? '+' : '-') << arg[-1] + << std::endl; + std::cerr << "usage: " << av[0] << " file" << std::endl; + } + } else { + std::ifstream in(av[i]); + if (!in) { + std::cerr << "Can't open " << av[i] << std::endl; + continue; + } + std::unique_ptr data; + in >> data; + if (!in || regs.unpack_json(data.get())) { + std::cerr << "Can't read/unpack json from " << av[i] << std::endl; + continue; + } + dump_hashtables(std::cout); + } + } +} + +static bool col_nonzero(int i, int col) { + for (int word = i * 8; word < i * 8 + 8; word++) { + auto &x = regs.dp.xbar_hash.hash.galois_field_matrix[word][col]; + if (x.byte0 || x.byte1) return true; + } + return false; +} + +static bool col_valid_nonzero(int i, int col) { + for (int word = i * 8; word < i * 8 + 8; word++) { + auto &x = regs.dp.xbar_hash.hash.galois_field_matrix[word][col]; + if (x.valid0 || x.valid1) return true; + } + return false; +} + +static bool ht_nonzero(int i) { + for (int col = 0; col < 52; col++) { + if ((regs.dp.xbar_hash.hash.hash_seed[col] >> i) & 1) return true; + if (col_nonzero(i, col)) return true; + if (col_valid_nonzero(i, col)) return true; + } + return false; +} + +static void dump_ht(std::ostream &out, int i) { + for (int col = 0; col < 52; col++) { + if (col_nonzero(i, col)) { + out << " " << col << ": 0x"; + bool pfx = true; + for (int word = 
8 * i + 7; word >= 8 * i; word--) { + auto &w = regs.dp.xbar_hash.hash.galois_field_matrix[word][col]; + if (!pfx || w.byte1) { + out << hex(w.byte1, pfx ? 0 : 2, '0'); + pfx = false; + } + if (!pfx || w.byte0) { + out << hex(w.byte0, pfx ? 0 : 2, '0'); + pfx = false; + } + } + out << '\n'; + } + if (col_valid_nonzero(i, col)) { + out << " valid " << col << ": 0b"; + bool pfx = true; + for (int word = 8 * i + 7; word >= 8 * i; word--) { + auto &w = regs.dp.xbar_hash.hash.galois_field_matrix[word][col]; + if (!pfx || w.valid1) { + out << (w.valid1 ? '1' : '0'); + pfx = false; + } + if (!pfx || w.valid0) { + out << (w.valid0 ? '1' : '0'); + pfx = false; + } + } + out << '\n'; + } + if ((regs.dp.xbar_hash.hash.hash_seed[col] >> i) & 1) out << " seed " << col << ": 1\n"; + } +} + +static void dump_hashtables(std::ostream &out) { + for (int i = 0; i < 8; i++) { + if (ht_nonzero(i)) { + out << "hash " << i << ":\n"; + dump_ht(out, i); + } + } +} diff --git a/backends/tofino/bf-asm/hashexpr.cpp b/backends/tofino/bf-asm/hashexpr.cpp new file mode 100644 index 00000000000..00bd2b0bc52 --- /dev/null +++ b/backends/tofino/bf-asm/hashexpr.cpp @@ -0,0 +1,837 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "hashexpr.h"
+
+#include "input_xbar.h"
+#include "lib/bitops.h"
+#include "lib/bitvec.h"
+
+// Validate that a phv reference is usable in the mau and (when a specific
+// hash table is given) actually present on that hash table's input.
+static bool check_ixbar(Phv::Ref &ref, InputXbar *ix, InputXbar::HashTable hash_table) {
+    if (!ref.check()) return false;
+    if (ref->reg.mau_id() < 0) {
+        error(ref.lineno, "%s not accessible in mau", ref->reg.name);
+        return false;
+    }
+    if (!hash_table) return true;
+    for (auto in : ix->find_hash_inputs(*ref, hash_table)) {
+        BUG_CHECK(in->lo >= 0, "invalid lo in IXBar::Input");
+        return true;
+    }
+    error(ref.lineno, "%s not in %s input", ref.name(), hash_table.toString().c_str());
+    return false;
+}
+
+/**
+ * Generating a list of ixbar_input_t and hash_matrix_output_t to be sent to the
+ * dynamic_hash library. The vectors are part of the function call as they
+ * must be on the stack to avoid using new and delete
+ */
+// NOTE(review): the vector element types were stripped by markup mangling;
+// ixbar_input_t / hash_matrix_output_t restored from the dynamic_hash usage
+// in the body.
+void HashExpr::gen_ixbar_init(ixbar_init_t *ixbar_init, std::vector<ixbar_input_t> &inputs,
+                              std::vector<hash_matrix_output_t> &outputs, int logical_hash_bit,
+                              InputXbar *ix, InputXbar::HashTable hash_table) {
+    inputs.clear();
+    outputs.clear();
+
+    gen_ixbar_inputs(inputs, ix, hash_table);
+    hash_matrix_output_t hmo;
+    hmo.p4_hash_output_bit = logical_hash_bit;
+    hmo.gfm_start_bit = 0;
+    hmo.bit_size = 1;
+    outputs.push_back(hmo);
+
+    ixbar_init->ixbar_inputs = inputs.data();
+    ixbar_init->inputs_sz = inputs.size();
+    ixbar_init->hash_matrix_outputs = outputs.data();
+    ixbar_init->outputs_sz = outputs.size();
+}
+
+/**
+ * The function call for PhvRef, Random, Identity, and Crc functions. The input xbar is
+ * initialized, and the data returned writes out a vector of inputs. For Stripe,
+ * Slice, and others, they recursively will call this function
+ */
+void HashExpr::gen_data(bitvec &data, int logical_hash_bit, InputXbar *ix,
+                        InputXbar::HashTable hash_table) {
+    ixbar_init_t ixbar_init;
+    hash_column_t hash_matrix[PARITY_GROUPS_DYN][HASH_MATRIX_WIDTH_DYN] = {};
+    std::vector<ixbar_input_t> inputs;
+    std::vector<hash_matrix_output_t> outputs;
+
+    gen_ixbar_init(&ixbar_init, inputs, outputs, logical_hash_bit, ix, hash_table);
+
+    bool non_zero = false;
+    int loops = 0;
+    // It is possible that a hash column can be generated as all 0s if using RANDOM_DYN algo, so
+    // regeneration is required if a hash column is all 0s and using RANDOM_DYN.
+    while (!non_zero) {
+        determine_hash_matrix(&ixbar_init, ixbar_init.ixbar_inputs, ixbar_init.inputs_sz,
+                              &hash_algorithm, hash_matrix);
+        if (hash_algorithm.hash_alg != RANDOM_DYN ||
+            ix->global_column0_extract(hash_table, hash_matrix)) {
+            non_zero = true;
+        }
+        BUG_CHECK(loops++ < 1000, "Looping trying to get a valid RANDOM_DYN matrix");
+    }
+    data |= ix->global_column0_extract(hash_table, hash_matrix);
+}
+
+// A hash expression that is just a single phv field slice (identity hash).
+class HashExpr::PhvRef : HashExpr {
+    Phv::Ref what;
+    PhvRef(gress_t gr, int stg, const value_t &v) : HashExpr(v.lineno), what(gr, stg, v) {}
+    friend class HashExpr;
+    bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override {
+        return ::check_ixbar(what, ix, hash_table);
+    }
+    int width() override { return what.size(); }
+    int input_size() override { return what.size(); }
+    bool match_phvref(const Phv::Ref &ref) override {
+        if (what->reg != ref->reg || what->lo != ref->lo) return false;
+        return true;
+    }
+    bool operator==(const HashExpr &a_) const override {
+        if (typeid(*this) != typeid(a_)) return false;
+        auto &a = static_cast<const PhvRef &>(a_);
+        return *what == *a.what;
+    }
+    void build_algorithm() override {
+        hash_algorithm.hash_alg = IDENTITY_DYN;
+        hash_algorithm.msb = false;
+        hash_algorithm.extend = false;
+        hash_algorithm.final_xor = 0ULL;
+        hash_algorithm.poly = 0ULL;
+        hash_algorithm.init = 0ULL;
+        hash_algorithm.reverse = false;
+    }
+
+    void gen_ixbar_inputs(std::vector<ixbar_input_t> &inputs, InputXbar *ix,
+                          InputXbar::HashTable hash_table) override;
+    void get_sources(int bit, std::vector<Phv::Ref> &rv) const override {
+        if (bit >= 0)
+            rv.emplace_back(what, bit, bit);
+        else
+            rv.emplace_back(what);
+    }
+    Phv::Ref *get_ghost_slice() override { return &what; }
+    void dbprint(std::ostream &out) const override {
+        out << "HashExpr: PhvRef" << std::endl;
+        out << "hash algorithm: [ algo : " << hash_algorithm.hash_alg
+            << ", msb : " << hash_algorithm.msb << ", extend : " << hash_algorithm.extend
+            << ", final_xor : " << hash_algorithm.final_xor << ", poly : " << hash_algorithm.poly
+            << ", init : " << hash_algorithm.init << ", reverse : " << hash_algorithm.reverse
+            << std::endl;
+        if (what) out << "Phv: " << what << std::endl;
+    }
+};
+
+// A random hash over a list of phv fields (RANDOM_DYN in the dynamic_hash lib).
+class HashExpr::Random : HashExpr {
+    std::vector<Phv::Ref> what;
+    explicit Random(int lineno) : HashExpr(lineno) {}
+    friend class HashExpr;
+    bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override {
+        bool rv = true;
+        for (auto &ref : what) rv &= ::check_ixbar(ref, ix, hash_table);
+        return rv;
+    }
+    int width() override { return 0; }
+    int input_size() override {
+        int rv = 0;
+        for (auto &ref : what) rv += ref->size();
+        return rv;
+    }
+    bool operator==(const HashExpr &a_) const override {
+        if (typeid(*this) != typeid(a_)) return false;
+        auto &a = static_cast<const Random &>(a_);
+        if (what.size() != a.what.size()) return false;
+        auto it = a.what.begin();
+        for (auto &el : what)
+            if (*el != **it++) return false;
+        return true;
+    }
+    void build_algorithm() override {
+        hash_algorithm.hash_alg = RANDOM_DYN;
+        hash_algorithm.msb = false;
+        hash_algorithm.extend = false;
+        hash_algorithm.final_xor = 0ULL;
+        hash_algorithm.poly = 0ULL;
+        hash_algorithm.init = 0ULL;
+        hash_algorithm.reverse = false;
+    }
+    void gen_ixbar_inputs(std::vector<ixbar_input_t> &inputs, InputXbar *ix,
+                          InputXbar::HashTable hash_table) override;
+    void get_sources(int, std::vector<Phv::Ref> &rv) const override {
+        rv.insert(rv.end(), what.begin(), what.end());
+    }
+    void dbprint(std::ostream &out) const override {
+        out << "HashExpr: Random" << std::endl;
+        out << "hash algorithm: [ algo : " << hash_algorithm.hash_alg
+            << ", msb : " << hash_algorithm.msb << ", extend : " << hash_algorithm.extend
+            << ", final_xor : " << hash_algorithm.final_xor << ", poly : " << hash_algorithm.poly
+            << ", init : " << hash_algorithm.init << ", reverse : " << hash_algorithm.reverse
+            << std::endl;
+        for (auto &e : what) {
+            out << "Phv: " << e << std::endl;
+        }
+    }
+};
+
+// A CRC hash (CRC_DYN) with explicit polynomial/init/final-xor.
+class HashExpr::Crc : HashExpr {
+    bitvec poly;
+    bitvec init;
+    bitvec final_xor;
+    ///> It is a multimap to allow two fields to have the exact same hash matrix requirements
+    std::multimap<unsigned, Phv::Ref> what;
+    std::map<unsigned, bitvec> constants;
+    std::vector<Phv::Ref> vec_what;
+    bool reverse = false;
+    int total_input_bits = -1;
+    explicit Crc(int lineno) : HashExpr(lineno) {}
+    friend class HashExpr;
+    bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override;
+    int width() override { return poly.max().index(); }
+    int input_size() override {
+        if (total_input_bits >= 0) return total_input_bits;
+        if (what.empty()) {
+            int rv = 0;
+            for (auto &ref : vec_what) rv += ref->size();
+            return rv;
+        } else {
+            // highest keyed offset plus that field's width
+            return what.rbegin()->first + what.rbegin()->second->size();
+        }
+    }
+    bool operator==(const HashExpr &a_) const override {
+        if (typeid(*this) != typeid(a_)) return false;
+        auto &a = static_cast<const Crc &>(a_);
+        if (what.size() != a.what.size()) return false;
+        if (vec_what.size() != a.vec_what.size()) return false;
+        auto it = a.what.begin();
+        for (auto &el : what)
+            if (el.first != it->first || *el.second != *(it++)->second) return false;
+        auto it2 = a.vec_what.begin();
+        for (auto &el : vec_what)
+            if (*el != **it2++) return false;
+        return true;
+    }
+    void build_algorithm() override {
+        hash_algorithm.hash_bit_width = poly.max().index();
+        hash_algorithm.hash_alg = CRC_DYN;
+        hash_algorithm.reverse = reverse;
+        hash_algorithm.poly = poly.getrange(32, 32) << 32;
+        hash_algorithm.poly |= poly.getrange(0, 32);
+        hash_algorithm.init = init.getrange(32, 32) << 32;
+        hash_algorithm.init |= init.getrange(0, 32);
+        hash_algorithm.final_xor = final_xor.getrange(0, 32);
+        hash_algorithm.final_xor |= final_xor.getrange(32, 32) << 32;
+        hash_algorithm.extend = false;
+        hash_algorithm.msb = false;
+    }
+
+    void gen_ixbar_inputs(std::vector<ixbar_input_t> &inputs, InputXbar *ix,
+                          InputXbar::HashTable hash_table) override;
+    void get_sources(int, std::vector<Phv::Ref> &rv) const override {
+        rv.insert(rv.end(), vec_what.begin(), vec_what.end());
+    }
+};
+
+/**
+ * @brief XOR hashing algorithm implemented on the hashing matrix
+ *
+ * This expression implements XOR over the hashing matrix. The input
+ * message is handled as a big integer number - the highest bit is
+ * the begining, the zero-th bit is the end. The message is split
+ * from the begining into blocks of length bit_width and these blocks
+ * are bitwise XORed together.
+ */
+class HashExpr::XorHash : public HashExpr {
+  private:
+    std::multimap<unsigned, Phv::Ref> what;
+    int bit_width;
+    friend class HashExpr;
+
+  public:
+    explicit XorHash(int lineno, int bit_width_);
+
+    /* -- avoid copying */
+    XorHash &operator=(XorHash &&) = delete;
+
+    bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override;
+    int width() override;
+    int input_size() override;
+    bool operator==(const HashExpr &a_) const override;
+    void build_algorithm() override;
+    void gen_ixbar_inputs(std::vector<ixbar_input_t> &inputs, InputXbar *ix,
+                          InputXbar::HashTable hash_table) override;
+    void get_sources(int, std::vector<Phv::Ref> &rv) const override;
+};
+
+// An XOR of several sub-expressions.
+class HashExpr::Xor : HashExpr {
+    std::vector<HashExpr *> what;
+    explicit Xor(int lineno) : HashExpr(lineno) {}
+    friend class HashExpr;
+    bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override {
+        bool rv = true;
+        // FIXED(review): was 'rv |= ...', which with rv initialized to true
+        // always returned true regardless of child failures; every sibling
+        // expression class accumulates its children's results with &=.
+        for (auto *e : what) rv &= e->check_ixbar(ix, hash_table);
+        return rv;
+    }
+    void gen_data(bitvec &data, int logical_hash_bit,
InputXbar *ix, + InputXbar::HashTable hash_table) override; + int width() override { + int rv = 0; + for (auto *e : what) { + int w = e->width(); + if (w > rv) rv = w; + } + return rv; + } + int input_size() override { + int rv = 0; + for (auto *e : what) rv += e->input_size(); + return rv; + } + bool operator==(const HashExpr &a_) const override { + if (typeid(*this) != typeid(a_)) return false; + auto &a = static_cast(a_); + if (what.size() != a.what.size()) return false; + auto it = a.what.begin(); + for (auto &el : what) + if (*el != **it++) return false; + return true; + } + void build_algorithm() override { + for (auto *e : what) { + if (e) e->build_algorithm(); + } + } + + void gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) override {} + void get_sources(int bit, std::vector &rv) const override { + for (auto *e : what) e->get_sources(bit, rv); + } + Phv::Ref *get_ghost_slice() override { + for (auto *e : what) { + auto g = e->get_ghost_slice(); + if (g) return g; + } + return nullptr; + } + void dbprint(std::ostream &out) const override { + out << "HashExpr: Xor" << std::endl; + for (auto *e : what) { + e->dbprint(out); + } + } +}; + +class HashExpr::Mask : HashExpr { + HashExpr *what; + bitvec mask; + Mask(int lineno, HashExpr *w, bitvec m) : HashExpr(lineno), what(w), mask(m) {} + friend class HashExpr; + bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override { + return what->check_ixbar(ix, hash_table); + } + void gen_data(bitvec &data, int bit, InputXbar *ix, InputXbar::HashTable hash_table) override { + if (mask[bit]) what->gen_data(data, bit, ix, hash_table); + } + int width() override { return what->width(); } + int input_size() override { return what->input_size(); } + bool operator==(const HashExpr &a_) const override { + if (typeid(*this) != typeid(a_)) return false; + auto &a = static_cast(a_); + return mask == a.mask && *what == *a.what; + } + void build_algorithm() override { 
what->build_algorithm(); } + + void gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) override {} + void get_sources(int bit, std::vector &rv) const override { + if (mask[bit]) what->get_sources(bit, rv); + } + Phv::Ref *get_ghost_slice() override { return what->get_ghost_slice(); } + void dbprint(std::ostream &out) const override { + out << "HashExpr: Mask " << mask << std::endl; + what->dbprint(out); + } +}; + +class HashExpr::Stripe : HashExpr { + std::vector what; + bool supress_error_cascade = false; + explicit Stripe(int lineno) : HashExpr(lineno) {} + friend class HashExpr; + bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override { + bool rv = true; + for (auto *e : what) rv |= e->check_ixbar(ix, hash_table); + return rv; + } + void gen_data(bitvec &data, int logical_hash_bit, InputXbar *ix, + InputXbar::HashTable hash_table) override; + int width() override { return 0; } + int input_size() override { + int rv = 0; + for (auto *e : what) rv += e->input_size(); + return rv; + } + bool operator==(const HashExpr &a_) const override { + if (typeid(*this) != typeid(a_)) return false; + auto &a = static_cast(a_); + if (what.size() != a.what.size()) return false; + auto it = a.what.begin(); + for (auto &el : what) + if (*el != **it++) return false; + return true; + } + void build_algorithm() override { + for (auto *e : what) { + e->build_algorithm(); + } + // Does not set the extend algorithm, as the gen_data for extend does this + // in the source + } + + void gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) override {} + void get_sources(int bit, std::vector &rv) const override { + for (auto *e : what) { + if (bit >= 0) { + int width = e->width(); + if (bit < width) { + e->get_sources(bit, rv); + break; + } + bit -= width; + } else { + e->get_sources(bit, rv); + } + } + } + void dbprint(std::ostream &out) const override { + out << "HashExpr: Stripe" << std::endl; + for 
(auto *e : what) { + e->dbprint(out); + } + } +}; + +class HashExpr::Slice : HashExpr { + HashExpr *what = nullptr; + int start = 0, _width = 0; + explicit Slice(int lineno) : HashExpr(lineno) {} + friend class HashExpr; + bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override { + return what->check_ixbar(ix, hash_table); + } + void gen_data(bitvec &data, int logical_hash_bit, InputXbar *ix, + InputXbar::HashTable hash_table) override { + what->gen_data(data, logical_hash_bit + start, ix, hash_table); + } + int width() override { + if (_width == 0) { + _width = what->width(); + if (_width > 0) { + _width -= start; + if (_width <= 0) _width = -1; + } + } + return _width; + } + int input_size() override { return what->input_size(); } + bool operator==(const HashExpr &a_) const override { + if (typeid(*this) != typeid(a_)) return false; + auto &a = static_cast(a_); + if (start != a.start || _width != a._width) return false; + return *what == *a.what; + } + void build_algorithm() override { what->build_algorithm(); } + void gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) override {} + void get_sources(int bit, std::vector &rv) const override { + if (bit >= start) + what->get_sources(bit - start, rv); + else if (bit < 0) + what->get_sources(bit, rv); + } + void dbprint(std::ostream &out) const override { + out << "HashExpr: Slice" << std::endl; + if (what) out << what << std::endl; + out << "start: " << start << " ,width: " << _width << std::endl; + } +}; + +class HashExpr::SExtend : HashExpr { + HashExpr *what; + SExtend(int lineno, HashExpr *w) : HashExpr(lineno), what(w) {} + friend class HashExpr; + bool check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) override { + return what->check_ixbar(ix, hash_table); + } + void gen_data(bitvec &data, int bit, InputXbar *ix, InputXbar::HashTable hash_table) override { + int width = what->width(); + if (width > 0 && bit >= width) bit = width - 1; + 
what->gen_data(data, bit, ix, hash_table); + } + int width() override { return 0; } + int input_size() override { return what->input_size(); } + bool operator==(const HashExpr &a_) const override { + if (typeid(*this) != typeid(a_)) return false; + auto &a = static_cast(a_); + return *what == *a.what; + } + void build_algorithm() override { what->build_algorithm(); } + void gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) override {} + void get_sources(int bit, std::vector &rv) const override { + int width = what->width(); + if (width > 0 && bit >= width) bit = width - 1; + what->get_sources(bit, rv); + } + void dbprint(std::ostream &out) const override { + out << "HashExpr: SExtend" << std::endl; + if (what) out << what << std::endl; + } +}; + +// The ordering for crc expression is: +// crc(poly, @optional init, @optional input_bits, map) +HashExpr *HashExpr::create(gress_t gress, int stage, const value_t &what) { + if (what.type == tCMD) { + if (what[0] == "random") { + Random *rv = new Random(what.lineno); + for (int i = 1; i < what.vec.size; i++) rv->what.emplace_back(gress, stage, what[i]); + return rv; + } else if (what[0] == "xor") { + if (what.vec.size != 3) { + error(what[1].lineno, + "Syntax error, invalid number of parameters for 'xor' hash expression"); + return nullptr; + } + if (!CHECKTYPE(what[1], tINT)) { + return nullptr; + } + if (!CHECKTYPE(what[2], tMAP)) { + return nullptr; + } + std::unique_ptr rv(new XorHash(what.lineno, what[1].i)); + for (auto &kv : what[2].map) { + if (CHECKTYPE(kv.key, tINT)) { + rv->what.emplace(kv.key.i, Phv::Ref(gress, stage, kv.value)); + } else { + return nullptr; + } + } + + return rv.release(); + } else if ((what[0] == "crc" || what[0] == "crc_rev" || what[0] == "crc_reverse") && + CHECKTYPE2(what[1], tBIGINT, tINT)) { + Crc *rv = new Crc(what.lineno); + if (what[0] != "crc") rv->reverse = true; + rv->poly = get_bitvec(what[1]); + // Shift and set LSB to 1 to generate 
polynomial from Koopman number + // provided in assembly + rv->poly <<= 1; + rv->poly[0] = 1; + int i = 2; + + if (what.vec.size > i && (what[i].type == tINT || what[i].type == tBIGINT)) + rv->init = get_bitvec(what[i++]); + if (what.vec.size > i && (what[i].type == tINT || what[i].type == tBIGINT)) + rv->final_xor = get_bitvec(what[i++]); + if (what.vec.size > i && what[i].type == tINT) rv->total_input_bits = what[i++].i; + + if (what.vec.size > i && what[i].type == tMAP) { + for (auto &kv : what[i].map) { + if (CHECKTYPE(kv.key, tINT)) { + rv->what.emplace(kv.key.i, Phv::Ref(gress, stage, kv.value)); + } + } + } else { + for (; i < what.vec.size; i++) { + rv->vec_what.emplace_back(gress, stage, what[i]); + } + } + return rv; + } else if (what[0] == "^") { + Xor *rv = new Xor(what.lineno); + for (int i = 1; i < what.vec.size; i++) + rv->what.push_back(create(gress, stage, what[i])); + return rv; + } else if (what[0] == "&") { + HashExpr *op = nullptr; + bitvec mask; + bool have_mask = false; + for (int i = 1; i < what.vec.size; i++) { + if (what[i].type == tINT || what[i].type == tBIGINT) { + if (have_mask) { + mask &= get_bitvec(what[i]); + } else { + mask = get_bitvec(what[i]); + have_mask = true; + } + } else if (op) { + error(what.lineno, "Invalid mask operation"); + return nullptr; + } else { + op = create(gress, stage, what[i]); + } + } + if (!op) { + error(what.lineno, "Invalid mask operation"); + return nullptr; + } else if (have_mask) { + return new Mask(what.lineno, op, mask); + } else { + return op; + } + } else if (what[0] == "stripe") { + Stripe *rv = new Stripe(what.lineno); + for (int i = 1; i < what.vec.size; i++) + rv->what.push_back(create(gress, stage, what[i])); + return rv; + } else if (what[0] == "slice") { + if (what.vec.size < 3 || what[2].type == tRANGE + ? 
what.vec.size > 3 || what[2].range.hi < what[2].range.lo + : what[2].type != tINT || what.vec.size > 4 || + (what.vec.size == 4 && what[3].type != tINT)) { + error(what.lineno, "Invalid slice operation"); + return nullptr; + } + Slice *rv = new Slice(what.lineno); + rv->what = create(gress, stage, what[1]); + if (what[2].type == tRANGE) { + rv->start = what[2].range.lo; + rv->_width = what[2].range.hi - what[2].range.lo + 1; + } else { + rv->start = what[2].i; + if (what.vec.size > 3) rv->_width = what[3].i; + } + return rv; + } else if (what[0] == "sextend" || what[0] == "sign_extend") { + if (what.vec.size != 2) { + error(what.lineno, "Invalid sign extension"); + return nullptr; + } + return new SExtend(what.lineno, create(gress, stage, what[1])); + } else if (what.vec.size == 2) { + return new PhvRef(gress, stage, what); + } else { + error(what.lineno, "Unsupported hash operation '%s'", what[0].s); + } + } else if (what.type == tSTR) { + return new PhvRef(gress, stage, what); + } else { + error(what.lineno, "Syntax error, expecting hash expression"); + } + return nullptr; +} + +void HashExpr::find_input(Phv::Ref what, std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) { + bool found = false; + auto vec = ix->find_hash_inputs(*what, hash_table); + for (auto *in : vec) { + int group_bit_position = in->lo + (what->lo - in->what->lo); + ixbar_input_t input; + input.type = ixbar_input_type::tPHV; + input.ixbar_bit_position = group_bit_position + ix->global_bit_position_adjust(hash_table); + input.bit_size = what->size(); + input.u.valid = true; + input.symmetric_info.is_symmetric = false; + inputs.push_back(input); + found = true; + break; + } + if (!found) { + error(ix->lineno, "Cannot find associated field %s[%d:%d] in %s", what->reg.name, what->hi, + what->lo, hash_table.toString().c_str()); + } +} + +void HashExpr::generate_ixbar_inputs_with_gaps(const std::multimap &what, + std::vector &inputs, InputXbar *ix, + InputXbar::HashTable 
hash_table) { + unsigned previous_range_hi = 0; + for (auto &entry : what) { + if (previous_range_hi != entry.first) { + ixbar_input_t invalid_input = { + ixbar_input_type::tPHV, // type + 0, // ixbar_bit_position + entry.first - previous_range_hi, // bit_size + {}, // symmetric_info + false // u.valid + }; + inputs.push_back(invalid_input); + } + + auto &ref = entry.second; + find_input(ref, inputs, ix, hash_table); + previous_range_hi = entry.first + ref->size(); + } + if (previous_range_hi != input_size()) { + ixbar_input_t invalid_input = { + ixbar_input_type::tPHV, // type + 0, // ixbar_bit_position + input_size() - previous_range_hi, // bit_size + {}, // symmetric_info + false // u.valid + }; + inputs.push_back(invalid_input); + } +} + +/** + * Creates a vector with a single entry corresponding to the identity input + */ +void HashExpr::PhvRef::gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) { + find_input(what, inputs, ix, hash_table); +} + +/** + * Iterates through the list of references to build a corresponding vector for the + * dynamic hash library + */ +void HashExpr::Random::gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) { + for (auto &ref : what) { + find_input(ref, inputs, ix, hash_table); + } +} + +/** + * Iterates through the crc map, and will generate ixbar_input_t inputs for the holes. 
+ * These are marked as invalid, so that the hash calculation will be correct + */ +void HashExpr::Crc::gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) { + generate_ixbar_inputs_with_gaps(what, inputs, ix, hash_table); +} + +bool HashExpr::Crc::check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) { + bool rv = true; + if (!vec_what.empty()) { + int off = 0; + for (auto &ref : vec_what) { + rv &= ::check_ixbar(ref, ix, InputXbar::HashTable()); + if (ref) { + for (auto *in : ix->find_hash_inputs(*ref, hash_table)) { + if (in->lo >= 0) { + what.emplace(off, ref); + break; + } + } + off += ref.size(); + } + } + vec_what.clear(); + } else { + int max = -1; + for (auto &ref : what) { + rv &= ::check_ixbar(ref.second, ix, hash_table); + } + } + return rv; +} + +HashExpr::XorHash::XorHash(int lineno, int bit_width_) : HashExpr(lineno), bit_width(bit_width_) {} + +bool HashExpr::XorHash::check_ixbar(InputXbar *ix, InputXbar::HashTable hash_table) { + bool rv(true); + for (auto &ref : what) { + rv = ::check_ixbar(ref.second, ix, hash_table) && rv; + } + return rv; +} + +int HashExpr::XorHash::width() { return bit_width; } + +int HashExpr::XorHash::input_size() { + if (what.empty()) return 0; + return what.rbegin()->first + what.rbegin()->second->size(); +} + +bool HashExpr::XorHash::operator==(const HashExpr &a_) const { + if (typeid(*this) != typeid(a_)) return false; + auto &a = static_cast(a_); + + if (what.size() != a.what.size()) return false; + if (bit_width != a.bit_width) return false; + + auto iter1(what.begin()); + auto iter2(a.what.begin()); + while (iter1 != what.end()) { + if (*iter1 != *iter2) return false; + ++iter1; + ++iter2; + } + return true; +} + +void HashExpr::XorHash::build_algorithm() { + memset(&hash_algorithm, 0, sizeof(hash_algorithm)); + hash_algorithm.hash_alg = XOR_DYN; + hash_algorithm.extend = false; + hash_algorithm.msb = false; + hash_algorithm.hash_bit_width = bit_width; +} + +void 
HashExpr::XorHash::gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) { + generate_ixbar_inputs_with_gaps(what, inputs, ix, hash_table); +} + +void HashExpr::XorHash::get_sources(int, std::vector &rv) const {} + +void HashExpr::Xor::gen_data(bitvec &data, int bit, InputXbar *ix, + InputXbar::HashTable hash_table) { + for (auto *e : what) e->gen_data(data, bit, ix, hash_table); +} + +void HashExpr::Stripe::gen_data(bitvec &data, int bit, InputXbar *ix, + InputXbar::HashTable hash_table) { + while (1) { + int total_size = 0; + for (auto *e : what) { + int sz = e->width(); + if (bit < total_size + sz) { + e->gen_data(data, bit - total_size, ix, hash_table); + return; + } + total_size += sz; + } + if (total_size == 0) { + if (!supress_error_cascade) { + error(lineno, "Can't stripe unsized data"); + supress_error_cascade = true; + } + break; + } + bit %= total_size; + } +} + +void dump(const HashExpr *h) { + if (h) + h->dbprint(std::cout); + else + std::cout << "(null)"; + std::cout << std::endl; +} +void dump(const HashExpr &h) { + h.dbprint(std::cout); + std::cout << std::endl; +} diff --git a/backends/tofino/bf-asm/hashexpr.h b/backends/tofino/bf-asm/hashexpr.h new file mode 100644 index 00000000000..cb63eca5e34 --- /dev/null +++ b/backends/tofino/bf-asm/hashexpr.h @@ -0,0 +1,77 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_HASHEXPR_H_ +#define BACKENDS_TOFINO_BF_ASM_HASHEXPR_H_ + +#include "backends/tofino/bf-utils/dynamic_hash/dynamic_hash.h" +#include "input_xbar.h" +#include "phv.h" + +class HashExpr : public IHasDbPrint { + class PhvRef; + class Random; + class Crc; + class XorHash; + class Xor; + class Mask; + class Stripe; + class Slice; + class SExtend; + + protected: + explicit HashExpr(int l) : lineno(l) {} + + public: + int lineno; + bfn_hash_algorithm_t hash_algorithm = {}; // Zero-init to make Klockwork happy + static HashExpr *create(gress_t, int stage, const value_t &); + virtual void build_algorithm() = 0; + virtual bool check_ixbar(InputXbar *ix, InputXbar::HashTable ht) = 0; + virtual void gen_data(bitvec &data, int bit, InputXbar *ix, InputXbar::HashTable hash_table); + void gen_ixbar_init(ixbar_init_t *ixbar_init, std::vector &inputs, + std::vector &outputs, int logical_hash_bit, + InputXbar *ix, InputXbar::HashTable hash_table); + virtual void gen_ixbar_inputs(std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table) = 0; + virtual void get_sources(int bit, std::vector &) const = 0; + std::vector get_sources(int bit) const { + std::vector rv; + get_sources(bit, rv); + return rv; + } + virtual int width() = 0; + virtual int input_size() = 0; + virtual bool match_phvref(const Phv::Ref &ref) { return false; } + virtual bool operator==(const HashExpr &) const = 0; + void find_input(Phv::Ref what, std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table); + bool operator!=(const HashExpr &a) const { return !operator==(a); } + virtual void dbprint(std::ostream &out) const {} + virtual Phv::Ref *get_ghost_slice() { return nullptr; } + virtual ~HashExpr() {} + + private: + void generate_ixbar_inputs_with_gaps(const std::multimap &what, + std::vector &inputs, InputXbar *ix, + InputXbar::HashTable hash_table); +}; + +extern void dump(const HashExpr *); +extern 
void dump(const HashExpr &); + +#endif /* BACKENDS_TOFINO_BF_ASM_HASHEXPR_H_ */ diff --git a/backends/tofino/bf-asm/idletime.cpp b/backends/tofino/bf-asm/idletime.cpp new file mode 100644 index 00000000000..297988c4621 --- /dev/null +++ b/backends/tofino/bf-asm/idletime.cpp @@ -0,0 +1,217 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "misc.h" + +void IdletimeTable::setup(VECTOR(pair_t) & data) { + setup_layout(layout, data); + for (auto &kv : MapIterChecked(data, true)) { + if (kv.key == "precision") { + if (CHECKTYPE(kv.value, tINT)) { + precision = kv.value.i; + if (precision != 1 && precision != 2 && precision != 3 && precision != 6) + error(kv.value.lineno, "Invalid idletime precision %d", precision); + } + } else if (kv.key == "sweep_interval") { + if (CHECKTYPE(kv.value, tINT)) sweep_interval = kv.value.i; + } else if (kv.key == "notification") { + if (kv.value == "disable") + disable_notification = true; + else if (kv.value == "two_way") + two_way_notification = true; + else if (kv.value != "enable") + error(kv.value.lineno, "Unknown notification style '%s'", value_desc(kv.value)); + } else if (kv.key == "per_flow_enable") { + per_flow_enable = get_bool(kv.value); + } else if (kv.key == "context_json") { + setup_context_json(kv.value); + } else if (kv.key == 
"row" || kv.key == "column" || kv.key == "bus") { + /* already done in setup_layout */ + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + alloc_rams(false, stage->mapram_use); + for (auto &r : layout) { + if (!r.bus.count(Layout::IDLE_BUS)) continue; + int &idle_bus = r.bus.at(Layout::IDLE_BUS); + if (idle_bus >= IDLETIME_BUSSES) { + error(r.lineno, "bus %d invalid", idle_bus); + continue; + } + if (r.row >= 4 && idle_bus < 10) + idle_bus += 10; + else if (r.row < 4 && idle_bus >= 10) + error(r.lineno, "idletime bus %d not accessable on row %d", idle_bus, r.row); + if (Table *old = stage->idletime_bus_use[idle_bus]) { + if (old != this) + error(r.lineno, + "Table %s trying to use idletime bus %d which is already in " + "use by table %s", + name(), idle_bus, old->name()); + } else { + stage->idletime_bus_use[idle_bus] = this; + } + } +} + +void IdletimeTable::pass1() { + LOG1("### Idletime table " << name() << " pass1 " << loc()); + alloc_vpns(); +} + +void IdletimeTable::pass2() { LOG1("### Idletime table " << name() << " pass2 " << loc()); } + +void IdletimeTable::pass3() { LOG1("### Idletime table " << name() << " pass3 " << loc()); } + +// This is the same as AttachedTable::json_memunit, but IdletimeTable is not a derived class +// of AttachedTable, so we duplicate it +int IdletimeTable::json_memunit(const MemUnit &r) const { + if (r.stage >= 0) { + return r.stage * Target::SRAM_STRIDE_STAGE() + r.row * Target::SRAM_STRIDE_ROW() + + r.col * Target::SRAM_STRIDE_COLUMN(); + } else if (r.row >= 0) { + // per-stage logical sram + return r.row * Target::SRAM_LOGICAL_UNITS_PER_ROW() + r.col; + } else { + // lamb + return r.col; + } +} + +static int precision_bits[] = {0, 0, 1, 2, 0, 0, 3}; + +template +void IdletimeTable::write_merge_regs_vt(REGS ®s, int type, int bus) { + auto &merge = regs.rams.match.merge; + merge.mau_payload_shifter_enable[type][bus].idletime_adr_payload_shifter_en = 1; + 
merge.mau_idletime_adr_mask[type][bus] = + (~1U << precision_bits[precision]) & ((1U << IDLETIME_ADDRESS_BITS) - 1); + merge.mau_idletime_adr_default[type][bus] = + (1U << IDLETIME_ADDRESS_PER_FLOW_ENABLE_START_BIT) | ((1 << precision_bits[precision]) - 1); +} + +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void IdletimeTable::write_merge_regs, + (mau_regs & regs, int type, int bus), + { write_merge_regs_vt(regs, type, bus); }) + +int IdletimeTable::precision_shift() const { return precision_bits[precision] + 1; } +int IdletimeTable::direct_shiftcount() const { return 67 - precision_bits[precision]; } + +template +void IdletimeTable::write_regs_vt(REGS ®s) { + LOG1("### Idletime table " << name() << " write_regs " << loc()); + auto &map_alu = regs.rams.map_alu; + auto &adrdist = regs.rams.match.adrdist; + int minvpn = 1000000, maxvpn = -1; + for (Layout &logical_row : layout) + for (auto v : logical_row.vpns) { + if (v < minvpn) minvpn = v; + if (v > maxvpn) maxvpn = v; + } + // regs.cfg_regs.mau_cfg_lt_has_idle |= 1 << logical_id; + for (Layout &row : layout) { + int idle_bus = row.bus.at(Layout::IDLE_BUS); + auto &map_alu_row = map_alu.row[row.row]; + auto &adrmux = map_alu_row.adrmux; + auto vpn = row.vpns.begin(); + for (auto &memunit : row.memunits) { + int col = memunit.col; + BUG_CHECK(memunit.stage == INT_MIN && memunit.row == row.row, "bogus %s in row %d", + memunit.desc(), row.row); + setup_muxctl(map_alu_row.vh_xbars.adr_dist_idletime_adr_xbar_ctl[col], idle_bus % 10); + auto &mapram_cfg = adrmux.mapram_config[col]; + // auto &mapram_ctl = adrmux.mapram_ctl[col]; + if (disable_notification) mapram_cfg.idletime_disable_notification = 1; + if (two_way_notification) mapram_cfg.two_way_idletime_notification = 1; + if (per_flow_enable) mapram_cfg.per_flow_idletime = 1; + mapram_cfg.idletime_bitwidth = precision_bits[precision]; + mapram_cfg.mapram_type = MapRam::IDLETIME; + mapram_cfg.mapram_logical_table = logical_id; + mapram_cfg.mapram_vpn_members = 0; // FIXME 
+ mapram_cfg.mapram_vpn = *vpn++; + if (gress == INGRESS) + mapram_cfg.mapram_ingress = 1; + else + mapram_cfg.mapram_egress = 1; + mapram_cfg.mapram_enable = 1; + if ((precision == 1) || (precision == 2)) { + mapram_cfg.mapram_parity_generate = 1; + mapram_cfg.mapram_parity_check = 1; + } else { + if ((precision != 3) && (precision != 6)) + error(lineno, "Unknown idletime precision = %d", precision); + mapram_cfg.mapram_ecc_generate = 1; + mapram_cfg.mapram_ecc_check = 1; + } + auto &adrmux_ctl = adrmux.ram_address_mux_ctl[1][col]; + adrmux_ctl.map_ram_wadr_mux_select = MapRam::Mux::IDLETIME; + adrmux_ctl.map_ram_wadr_mux_enable = 1; + adrmux_ctl.map_ram_radr_mux_select_smoflo = 1; + adrmux_ctl.ram_ofo_stats_mux_select_statsmeter = 1; + adrmux_ctl.ram_stats_meter_adr_mux_select_idlet = 1; + setup_muxctl(adrmux.idletime_logical_to_physical_sweep_grant_ctl[col], logical_id); + setup_muxctl(adrmux.idletime_physical_to_logical_req_inc_ctl[col], logical_id); + unsigned clear_val = ~(~0U << precision); + if (per_flow_enable || precision == 1) clear_val &= ~1U; + for (unsigned i = 0; i < 8U / precision; i++) + adrmux.idletime_cfg_rd_clear_val[col].set_subfield(clear_val, i * precision, + precision); + if (gress) + regs.cfg_regs.mau_cfg_mram_thread[col / 3U] |= 1U << (col % 3U * 8U + row.row); + } + adrdist.adr_dist_idletime_adr_oxbar_ctl[idle_bus / 4].set_subfield(logical_id | 0x10, + 5 * (idle_bus % 4), 5); + } + // don't enable initially -- runtime will enable + // adrdist.idletime_sweep_ctl[logical_id].idletime_en = 1; + adrdist.idletime_sweep_ctl[logical_id].idletime_sweep_offset = minvpn; + adrdist.idletime_sweep_ctl[logical_id].idletime_sweep_size = layout_size() - 1; + adrdist.idletime_sweep_ctl[logical_id].idletime_sweep_remove_hole_pos = 0; // TODO + adrdist.idletime_sweep_ctl[logical_id].idletime_sweep_remove_hole_en = 0; // TODO + adrdist.idletime_sweep_ctl[logical_id].idletime_sweep_interval = sweep_interval; + auto &idle_dump_ctl = 
regs.cfg_regs.idle_dump_ctl[logical_id]; + idle_dump_ctl.idletime_dump_offset = minvpn; + idle_dump_ctl.idletime_dump_size = maxvpn; + idle_dump_ctl.idletime_dump_remove_hole_pos = 0; // TODO + idle_dump_ctl.idletime_dump_remove_hole_en = 0; // TODO + adrdist.movereg_idle_ctl[logical_id].movereg_idle_ctl_size = precision_bits[precision]; + adrdist.movereg_idle_ctl[logical_id].movereg_idle_ctl_direct = 1; + adrdist.movereg_ad_direct[MoveReg::IDLE] |= 1 << logical_id; + adrdist.idle_bubble_req[timing_thread(gress)].bubble_req_1x_class_en |= 1 << logical_id; +} +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void IdletimeTable::write_regs, (mau_regs & regs), + { write_regs_vt(regs); }) + +void IdletimeTable::gen_stage_tbl_cfg(json::map &out) const { + unsigned number_entries = layout_size() * (8U / precision) * SRAM_DEPTH; + json::map &tbl = out["idletime_stage_table"] = json::map(); + tbl["stage_number"] = stage->stageno; + tbl["size"] = number_entries; + tbl["stage_table_type"] = "idletime"; + tbl["precision"] = precision; + tbl["disable_notification"] = disable_notification; + tbl["two_way_notification"] = two_way_notification; + // ?? + tbl["logical_table_id"] = match_table->logical_id; + tbl["enable_pfe"] = per_flow_enable; + add_pack_format(tbl, 11, 1, 8U / precision); + tbl["memory_resource_allocation"] = gen_memory_resource_allocation_tbl_cfg("map_ram", layout); +} diff --git a/backends/tofino/bf-asm/input_xbar.cpp b/backends/tofino/bf-asm/input_xbar.cpp new file mode 100644 index 00000000000..cdee0a81cc8 --- /dev/null +++ b/backends/tofino/bf-asm/input_xbar.cpp @@ -0,0 +1,1137 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "input_xbar.h" + +#include + +#include + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "hashexpr.h" +#include "lib/log.h" +#include "lib/range.h" +#include "misc.h" +#include "power_ctl.h" + +// template specialization declarations +#include "backends/tofino/bf-asm/jbay/input_xbar.h" +#include "backends/tofino/bf-asm/tofino/input_xbar.h" + +void HashCol::dbprint(std::ostream &out) const { + out << "HashCol: " << " lineno: " << lineno << " bit: " << bit << " data: " << data + << " valid: " << valid; + if (fn) out << " fn: " << *fn << std::endl; +} + +DynamicIXbar::DynamicIXbar(const Table *tbl, const pair_t &data) { + if (CHECKTYPE(data.key, tINT)) { + bit = data.key.i; + if (bit < 0 || bit >= Target::DYNAMIC_CONFIG_INPUT_BITS()) + error(data.key.lineno, "Invalid dynamic config bit %d", bit); + } + if (CHECKTYPE2(data.value, tMAP, tMATCH)) { + if (data.value.type == tMAP) { + for (auto &kv : data.value.map) + if (CHECKTYPE(kv.value, tMATCH)) + match_phv.emplace_back(Phv::Ref(tbl->gress, tbl->stage->stageno, data.key), + data.value.m); + } else { + match = data.value.m; + } + } +} + +int InputXbar::group_max_index(Group::type_t t) const { + switch (t) { + case Group::EXACT: + return EXACT_XBAR_GROUPS; + case Group::TERNARY: + return TCAM_XBAR_GROUPS; + case Group::BYTE: + return BYTE_XBAR_GROUPS; + default: + BUG("invalid group type for %s: %s", Target::name(), group_type(t)); + } + return 0; +} + 
+InputXbar::Group InputXbar::group_name(bool tern, const value_t &key) const { + if (CHECKTYPE(key, tCMD)) { + int index = 1; + if (key[0] != "group" && (key[1] == "group" || key[1] == "table")) ++index; + if (PCHECKTYPE(key.vec.size == index + 1, key[index], tINT)) { + index = key[index].i; + if (key[0] == "group") return Group(tern ? Group::TERNARY : Group::EXACT, index); + if (key[0] == "exact" && key[1] == "group") return Group(Group::EXACT, index); + if (key[0] == "ternary" && key[1] == "group") return Group(Group::TERNARY, index); + if (key[0] == "byte" && key[1] == "group") return Group(Group::BYTE, index); + } + } + return Group(Group::INVALID, 0); +} + +int InputXbar::group_size(Group::type_t t) const { + switch (t) { + case Group::EXACT: + return EXACT_XBAR_GROUP_SIZE; + case Group::TERNARY: + return TCAM_XBAR_GROUP_SIZE; + case Group::BYTE: + return BYTE_XBAR_GROUP_SIZE; + default: + BUG("invalid group type for %s: %s", Target::name(), group_type(t)); + } + return 0; +} + +const char *InputXbar::group_type(Group::type_t t) const { + switch (t) { + case Group::EXACT: + return "exact"; + case Group::TERNARY: + return "ternary"; + case Group::BYTE: + return "byte"; + case Group::GATEWAY: + return "gateway"; + case Group::XCMP: + return "xcmp"; + default: + return ""; + } +} + +void InputXbar::parse_group(Table *t, Group gr, const value_t &value) { + BUG_CHECK(gr.index >= 0, "invalid group"); + auto &group = groups[gr]; + if (value.type == tVEC) { + for (auto ® : value.vec) group.emplace_back(Phv::Ref(t->gress, t->stage->stageno, reg)); + } else if (value.type == tMAP) { + for (auto ® : value.map) { + if (!CHECKTYPE2(reg.key, tINT, tRANGE)) continue; + int lo = -1, hi = -1; + if (reg.key.type == tINT) { + lo = reg.key.i; + } else { + lo = reg.key.range.lo; + hi = reg.key.range.hi; + } + if (lo < 0 || lo >= group_size(gr.type)) { + error(reg.key.lineno, "Invalid offset for %s group", group_type(gr.type)); + } else if (gr.type == Group::TERNARY && lo >= 40) { 
+ if (hi >= lo) hi -= 40; + groups[Group(Group::BYTE, gr.index / 2)].emplace_back( + Phv::Ref(t->gress, t->stage->stageno, reg.value), lo - 40, hi); + } else { + group.emplace_back(Phv::Ref(t->gress, t->stage->stageno, reg.value), lo, hi); + } + } + } else { + group.emplace_back(Phv::Ref(t->gress, t->stage->stageno, value)); + } +} + +void InputXbar::parse_hash_group(HashGrp &hash_group, const value_t &value) { + if (value.type == tINT && (unsigned)value.i < Target::EXACT_HASH_TABLES()) { + hash_group.tables |= 1U << value.i; + return; + } + if (!CHECKTYPE2(value, tVEC, tMAP)) return; + const VECTOR(value_t) *tbl = 0; + if (value.type == tMAP) { + for (auto &el : MapIterChecked(value.map)) { + if (el.key == "seed") { + if (!CHECKTYPE2(el.value, tINT, tBIGINT)) continue; + if (el.value.type == tBIGINT) { + int shift = 0; + for (int i = 0; i < el.value.bigi.size; ++i) { + if (shift >= 64) { + error(el.key.lineno, "Invalid seed %s too large", + value_desc(&el.value)); + break; + } + hash_group.seed |= el.value.bigi.data[i] << shift; + shift += CHAR_BIT * sizeof(el.value.bigi.data[i]); + } + } else { + hash_group.seed |= el.value.i & 0xFFFFFFFF; + } + } else if (el.key == "table") { + if (el.value.type == tINT) { + if (el.value.i < 0 || el.value.i >= Target::EXACT_HASH_TABLES()) + error(el.value.lineno, "invalid hash group descriptor"); + else + hash_group.tables |= 1U << el.value.i; + } else if (CHECKTYPE(el.value, tVEC)) { + tbl = &el.value.vec; + } + } else if (el.key == "seed_parity") { + if (el.value.type == tSTR && el.value == "true") hash_group.seed_parity = true; + } else { + error(el.key.lineno, "invalid hash group descriptor"); + } + } + } else { + tbl = &value.vec; + } + if (tbl) { + for (auto &v : *tbl) { + if (!CHECKTYPE(v, tINT)) continue; + if (v.i < 0 || v.i >= Target::EXACT_HASH_TABLES()) { + error(v.lineno, "invalid hash group descriptor"); + } else { + hash_group.tables |= 1U << v.i; + } + } + } +} + +void InputXbar::parse_hash_table(Table *t, 
HashTable ht, const value_t &value) { + if (!CHECKTYPE(value, tMAP)) return; + for (auto &c : value.map) { + if (c.key.type == tINT) { + setup_hash(hash_tables[ht], ht, t->gress, t->stage->stageno, c.value, c.key.lineno, + c.key.i, c.key.i); + } else if (c.key.type == tRANGE) { + setup_hash(hash_tables[ht], ht, t->gress, t->stage->stageno, c.value, c.key.lineno, + c.key.range.lo, c.key.range.hi); + } else if (CHECKTYPEM(c.key, tCMD, "hash column decriptor")) { + if (c.key.vec.size != 2 || c.key[0] != "valid" || c.key[1].type != tINT || + options.target != TOFINO) { + error(c.key.lineno, "Invalid hash column descriptor"); + continue; + } + int col = c.key[1].i; + if (col < 0 || col >= 52) { + error(c.key.lineno, "Hash column out of range"); + continue; + } + if (!CHECKTYPE(c.value, tINT)) continue; + if (hash_tables[ht][col].valid) + error(c.key.lineno, "Hash table %d column %d valid duplicated", ht.index, col); + else if (c.value.i >= 0x10000) + error(c.value.lineno, "Hash valid value out of range"); + else + hash_tables[ht][col].valid = c.value.i; + } + } +} + +void InputXbar::setup_hash(std::map &hash_table, HashTable ht, gress_t gress, + int stage, value_t &what, int lineno, int lo, int hi) { + if (lo < 0 || lo >= hash_num_columns(ht) || hi < 0 || hi >= hash_num_columns(ht)) { + error(lineno, "Hash column out of range"); + return; + } + if (lo == hi) { + if (what.type == tINT || what.type == tBIGINT) { + hash_table[lo].data = get_bitvec(what, 64, "Hash column value out of range"); + return; + } else if ((what.type == tSTR) && (what == "parity")) { + options.disable_gfm_parity = false; + hash_table_parity[ht] = lo; + return; + } + } else if (what.type == tINT && what.i == 0) { + for (int i = lo; i <= hi; ++i) { + hash_table[i].data.setraw(what.i); + } + return; + } + HashExpr *fn = HashExpr::create(gress, stage, what); // TODO Set the crcSize. 
+ if (!fn) return; + fn->build_algorithm(); + int width = fn->width(); + if (width && width != abs(hi - lo) + 1) + error(what.lineno, "hash expression width mismatch (%d != %d)", width, abs(hi - lo) + 1); + int bit = 0; + int errlo = -1; + bool fn_assigned = false; + for (int col : Range(lo, hi)) { + if (hash_table[col].data || hash_table[col].fn) { + if (errlo < 0) errlo = col; + } else { + if (errlo >= 0) { + if (errlo == col - 1) { + error(lineno, "%s column %d duplicated", ht.toString().c_str(), errlo); + } else { + error(lineno, "%s column %d..%d duplicated", ht.toString().c_str(), errlo, + col - 1); + } + errlo = -1; + } + hash_table[col].lineno = what.lineno; + hash_table[col].fn = fn; + hash_table[col].bit = bit++; + fn_assigned = true; + } + } + + if (!fn_assigned) delete fn; + + if (errlo >= 0) { + error(lineno, "%s column %d..%d duplicated", ht.toString().c_str(), errlo, hi); + } +} + +void InputXbar::input(Table *t, bool tern, const VECTOR(pair_t) & data) { + for (auto &kv : data) { + if ((kv.key.type == tSTR) && (kv.key == "random_seed")) { + random_seed = kv.value.i; + continue; + } + if (kv.key.type == tCMD && kv.key.vec.size == 2 && kv.key[1] == "unit" && + parse_unit(t, kv)) { + continue; + } + if (auto grp = group_name(tern, kv.key)) { + if (grp.index >= group_max_index(grp.type)) { + error(kv.key.lineno, "invalid group descriptor"); + continue; + } + parse_group(t, grp, kv.value); + } else if (kv.key.type == tCMD && kv.key[0] == "hash") { + if (!CHECKTYPE(kv.key.vec.back(), tINT)) continue; + int index = kv.key.vec.back().i; + if (kv.key[1] == "group") { + if (index >= Target::EXACT_HASH_GROUPS()) { + error(kv.key.lineno, "invalid hash group descriptor"); + continue; + } + if (hash_groups[index].lineno >= 0) { + // FIXME -- should be an error? 
but the compiler generates it this way + warning(kv.key.lineno, "duplicate hash group %d, will merge with", index); + warning(hash_groups[index].lineno, "previous definition here"); + } + hash_groups[index].lineno = kv.key.lineno; + parse_hash_group(hash_groups[index], kv.value); + } else if (index >= Target::EXACT_HASH_TABLES()) { + error(kv.key.lineno, "invalid hash descriptor"); + } else { + parse_hash_table(t, HashTable(HashTable::EXACT, index), kv.value); + } + } else if (kv.key.type == tCMD && kv.key[1] == "hash" && parse_hash(t, kv)) { + continue; + } else { + error(kv.key.lineno, "expecting a group or hash descriptor"); + } + } +} + +std::unique_ptr InputXbar::create(Table *table, const value_t *key) { + if (key && key->type != tSTR) + error(key->lineno, "%s does not support dynamic key mux", Target::name()); + return std::unique_ptr(new InputXbar(table, key ? key->lineno : -1)); +} + +std::unique_ptr InputXbar::create(Table *table, bool tern, const value_t &key, + const VECTOR(pair_t) & data) { + auto rv = create(table, &key); + rv->input(table, tern, data); + return rv; +} + +unsigned InputXbar::tcam_width() { + unsigned words = 0, bytes = 0; + for (auto &group : groups) { + if (group.first.type != Group::TERNARY) { + if (group.first.type == Group::BYTE) ++bytes; + continue; + } + unsigned in_word = 0, in_byte = 0; + for (auto &input : group.second) { + if (input.lo < 40) in_word = 1; + if (input.lo >= 40 || input.hi >= 40) in_byte = 1; + } + words += in_word; + bytes += in_byte; + } + if (bytes * 2 > words) error(lineno, "Too many byte groups in tcam input xbar"); + return words; +} + +int InputXbar::tcam_byte_group(int idx) { + for (auto &group : groups) { + if (group.first.type != Group::TERNARY) continue; + for (auto &input : group.second) + if (input.lo >= 40 || input.hi >= 40) { + if (--idx < 0) return group.first.index / 2; + break; + } + } + return -1; +} + +int InputXbar::tcam_word_group(int idx) { + for (auto &group : groups) { + if 
(group.first.type != Group::TERNARY) continue; + for (auto &input : group.second) + if (input.lo < 40) { + if (--idx < 0) return group.first.index; + break; + } + } + return -1; +} + +const std::map &InputXbar::get_hash_table(HashTable id) { + for (auto &ht : hash_tables) + if (ht.first == id) return ht.second; + warning(lineno, "%s does not exist in table %s", id.toString().c_str(), table->name()); + static const std::map empty_hash_table = {}; + return empty_hash_table; +} + +bool InputXbar::conflict(const std::vector &a, const std::vector &b) { + for (auto &i1 : a) { + if (i1.lo < 0) continue; + for (auto &i2 : b) { + if (i2.lo < 0) continue; + if (i2.lo > i1.hi || i1.lo > i2.hi) continue; + if (i1.what->reg != i2.what->reg) return true; + if (i1.lo - i1.what->lo != i2.lo - i2.what->lo) return true; + } + } + return false; +} + +bool InputXbar::conflict(const std::map &a, const std::map &b, + int *col) { + for (auto &acol : a) { + if (auto bcol = ::getref(b, acol.first)) { + if (acol.second.data != bcol->data || acol.second.valid != bcol->valid) { + if (col) *col = acol.first; + return true; + } + } + } + return false; +} + +bool InputXbar::conflict(const HashGrp &a, const HashGrp &b) { + if (a.tables != b.tables) return true; + if (a.seed && b.seed && a.seed != b.seed) return true; + return false; +} + +uint64_t InputXbar::hash_columns_used(HashTable hash) { + uint64_t rv = 0; + if (hash_tables.count(hash)) + for (auto &col : hash_tables[hash]) rv |= UINT64_C(1) << col.first; + return rv; +} + +/* FIXME -- this is questionable, but the compiler produces hash groups that conflict + * FIXME -- so we try to tag ones that may be ok as merely warnings */ +bool InputXbar::can_merge(HashGrp &a, HashGrp &b) { + unsigned both = a.tables & b.tables; + uint64_t both_cols = 0, a_cols = 0, b_cols = 0; + for (unsigned i = 0; i < 16; i++) { + unsigned mask = 1U << i; + if (!((a.tables | b.tables) & mask)) continue; + for (InputXbar *other : table->stage->hash_table_use[i]) { 
+ if (both & mask) both_cols |= other->hash_columns_used(i); + if (a.tables & mask) a_cols |= other->hash_columns_used(i); + if (b.tables & mask) b_cols |= other->hash_columns_used(i); + for (auto htp : hash_table_parity) { + if (other->hash_table_parity.count(htp.first) && + other->hash_table_parity.at(htp.first) != htp.second) + return false; + } + } + } + a_cols &= ~both_cols; + b_cols &= ~both_cols; + if (a_cols & b_cols) return false; + if ((a_cols & b.seed & ~a.seed) || (b_cols & a.seed & ~b.seed)) return false; + if (a.tables && b.tables) { + a.tables |= b.tables; + b.tables |= a.tables; + } + if (a.seed && b.seed) { + a.seed |= b.seed; + b.seed |= a.seed; + } + return true; +} + +static int tcam_swizzle_offset[4][4] = { + {0, +1, -2, -1}, + {+3, 0, +1, -2}, + {+2, -1, 0, -3}, + {+1, +2, -1, 0}, +}; + +// FIXME -- when swizlling 16 bit PHVs, there are 2 places we could copy from, but +// FIXME -- we only consider the closest/easiest +static int tcam_swizzle_16[2][2]{{0, -1}, {+1, 0}}; + +int InputXbar::tcam_input_use(int out_byte, int phv_byte, int phv_size) { + int rv = out_byte; + BUG_CHECK(phv_byte >= 0 && phv_byte < phv_size / 8); + switch (phv_size) { + case 8: + break; + case 32: + rv += tcam_swizzle_offset[out_byte & 3][phv_byte]; + break; + case 16: + rv += tcam_swizzle_16[out_byte & 1][phv_byte]; + break; + default: + BUG(); + } + return rv; +} + +void InputXbar::tcam_update_use(TcamUseCache &use) { + if (use.ixbars_added.count(this)) return; + use.ixbars_added.insert(this); + for (auto &group : groups) { + if (group.first.type == Group::EXACT) continue; + for (auto &input : group.second) { + if (input.lo < 0) continue; + int group_base = (group.first.index * 11 + 1) / 2U; + int half_byte = 5 + 11 * (group.first.index / 2U); + if (group.first.type == Group::BYTE) { + group_base = 5 + 11 * group.first.index; + half_byte = -1; + } + int group_byte = input.lo / 8; + for (int phv_byte = input.what->lo / 8; phv_byte <= input.what->hi / 8; + phv_byte++, 
group_byte++) { + BUG_CHECK(group_byte <= 5); + int out_byte = group_byte == 5 ? half_byte : group_base + group_byte; + int in_byte = tcam_input_use(out_byte, phv_byte, input.what->reg.size); + use.tcam_use.emplace(in_byte, std::pair(input, phv_byte)); + } + } + } +} + +void InputXbar::check_input(InputXbar::Group group, Input &input, TcamUseCache &use) { + if (group.type == Group::EXACT) { + if (input.lo % input.what->reg.size != input.what->lo) + error(input.what.lineno, "%s misaligned on input_xbar", input.what.name()); + return; + } + unsigned bit_align_mask = input.lo >= 40 ? 3 : 7; + unsigned byte_align_mask = (input.what->reg.size - 1) >> 3; + int group_base = (group.index * 11 + 1) / 2U; + int half_byte = 5 + 11 * (group.index / 2U); + if (group.type == Group::BYTE) { + bit_align_mask = 3; + group_base = 5 + 11 * group.index; + half_byte = -1; + } + int group_byte = input.lo / 8; + if ((input.lo ^ input.what->lo) & bit_align_mask) { + error(input.what.lineno, "%s misaligned on input_xbar", input.what.name()); + return; + } + for (int phv_byte = input.what->lo / 8; phv_byte <= input.what->hi / 8; + phv_byte++, group_byte++) { + BUG_CHECK(group_byte <= 5); + int out_byte = group_byte == 5 ? 
half_byte : group_base + group_byte; + int in_byte = tcam_input_use(out_byte, phv_byte, input.what->reg.size); + if (in_byte < 0 || in_byte >= TCAM_XBAR_INPUT_BYTES) { + error(input.what.lineno, "%s misaligned on input_xbar", input.what.name()); + break; + } + auto *tbl = table->stage->tcam_ixbar_input[in_byte]; + if (tbl) { + BUG_CHECK(tbl->input_xbar.size() == 1, "%s does not have one input xbar", tbl->name()); + tbl->input_xbar[0]->tcam_update_use(use); + } + if (use.tcam_use.count(in_byte)) { + if (use.tcam_use.at(in_byte).first.what->reg != input.what->reg || + use.tcam_use.at(in_byte).second != phv_byte) { + error(input.what.lineno, "Use of tcam ixbar for %s", input.what.name()); + error(use.tcam_use.at(in_byte).first.what.lineno, "...conflicts with %s", + use.tcam_use.at(in_byte).first.what.name()); + break; + } + } else { + use.tcam_use.emplace(in_byte, std::pair(input, phv_byte)); + table->stage->tcam_ixbar_input[in_byte] = tbl; + } + } +} + +bool InputXbar::copy_existing_hash(HashTable ht, std::pair &col) { + for (InputXbar *other : table->stage->hash_table_use[ht.index]) { + if (other == this) continue; + if (other->hash_tables.count(ht)) { + auto &o = other->hash_tables.at(ht); + if (o.count(col.first)) { + auto ocol = o.at(col.first); + if (ocol.fn && *ocol.fn == *col.second.fn) { + col.second.data = ocol.data; + return true; + } + } + } + } + return false; +} + +void InputXbar::gen_hash_column(std::pair &col, + std::pair> &hash) { + col.second.fn->gen_data(col.second.data, col.second.bit, this, hash.first); +} + +void InputXbar::pass1() { + TcamUseCache tcam_use; + tcam_use.ixbars_added.insert(this); + if (random_seed >= 0) srandom(random_seed); + for (auto &group : groups) { + for (auto &input : group.second) { + if (!input.what.check()) continue; + if (input.what->reg.ixbar_id() < 0) + error(input.what.lineno, "%s not accessable in input xbar", input.what->reg.name); + table->stage->match_use[table->gress][input.what->reg.uid] = 1; + if (input.lo < 
0 && group.first.type == Group::BYTE) input.lo = input.what->lo % 8U; + if (input.lo >= 0) { + if (input.hi >= 0) { + if (input.size() != input.what->size()) + error(input.what.lineno, "Input xbar size doesn't match register size"); + } else { + input.hi = input.lo + input.what->size() - 1; + } + if (input.lo >= group_size(group.first.type)) + error(input.what.lineno, "placing %s off the top of the input xbar", + input.what.name()); + } + check_input(group.first, input, tcam_use); + } + auto &use = table->stage->ixbar_use; + for (InputXbar *other : use[group.first]) { + if (other->groups.count(group.first) && + conflict(other->groups.at(group.first), group.second)) { + error(lineno, "Input xbar group %d conflict in stage %d", group.first.index, + table->stage->stageno); + warning(other->lineno, "conflicting group definition here"); + } + } + use[group.first].push_back(this); + } + for (auto &hash : hash_tables) { + bool ok = true; + HashExpr *prev = 0; + for (auto &col : hash.second) { + if (col.second.fn && col.second.fn != prev) + ok = (prev = col.second.fn)->check_ixbar(this, hash.first); + if (ok && col.second.fn && !copy_existing_hash(hash.first, col)) { + gen_hash_column(col, hash); + } + } + bool add_to_use = true; + for (InputXbar *other : table->stage->hash_table_use[hash.first.uid()]) { + if (other == this) { + add_to_use = false; + continue; + } + int column; + if (other->hash_tables.count(hash.first) && + conflict(other->hash_tables[hash.first], hash.second, &column)) { + error(hash.second.at(column).lineno, "%s column %d conflict in stage %d", + hash.first.toString().c_str(), column, table->stage->stageno); + error(other->hash_tables[hash.first].at(column).lineno, + "conflicting hash definition here"); + } + } + if (add_to_use) table->stage->hash_table_use[hash.first.uid()].push_back(this); + } + for (auto &group : hash_groups) { + bool add_to_use = true; + for (InputXbar *other : table->stage->hash_group_use[group.first]) { + if (other == this) { + 
add_to_use = false; + break; + } + if (other->hash_groups.count(group.first) && + conflict(other->hash_groups[group.first], group.second)) { + if (can_merge(other->hash_groups[group.first], group.second)) + warning(group.second.lineno, + "Input xbar hash group %d mergeable conflict " + "in stage %d", + group.first, table->stage->stageno); + else + error(group.second.lineno, "Input xbar hash group %d conflict in stage %d", + group.first, table->stage->stageno); + warning(other->hash_groups[group.first].lineno, + "conflicting hash group definition here"); + } + } + if (add_to_use) table->stage->hash_group_use[group.first].push_back(this); + } +} + +void InputXbar::add_use(unsigned &byte_use, std::vector &inputs) { + for (auto &i : inputs) { + if (i.lo < 0) continue; + for (int byte = i.lo / 8; byte <= i.hi / 8; byte++) byte_use |= 1 << byte; + ; + } +} + +const InputXbar::Input *InputXbar::GroupSet::find(Phv::Slice sl) const { + for (InputXbar *i : use) + if (auto rv = i->find(sl, group)) return rv; + return 0; +} + +std::vector InputXbar::GroupSet::find_all(Phv::Slice sl) const { + std::vector rv; + for (const InputXbar *i : use) { + auto vec = i->find_all(sl, group); + rv.insert(rv.end(), vec.begin(), vec.end()); + } + return rv; +} + +void InputXbar::GroupSet::dbprint(std::ostream &out) const { + std::map byte_use; + for (const InputXbar *ixbar : use) { + if (ixbar->groups.count(group)) { + for (auto &i : ixbar->groups.at(group)) { + if (i.lo < 0) continue; + for (int byte = i.lo / 8; byte <= i.hi / 8; byte++) byte_use[byte] = &i; + } + } + } + const InputXbar::Input *prev = 0; + for (auto &in : byte_use) { + if (prev == in.second) continue; + if (prev) out << ", "; + prev = in.second; + out << prev->what << ':' << prev->lo << ".." 
<< prev->hi; + } +} + +void InputXbar::pass2() { + auto &use = table->stage->ixbar_use; + for (auto &group : groups) { + unsigned bytes_in_use = 0; + for (auto &input : group.second) { + if (input.lo >= 0) continue; + if (auto *at = GroupSet(use, group.first).find(*input.what)) { + input.lo = at->lo; + input.hi = at->hi; + LOG1(input.what << " found in bytes " << at->lo / 8 << ".." << at->hi / 8 << " of " + << group.first << " in stage " << table->stage->stageno); + continue; + } + if (bytes_in_use == 0) + for (InputXbar *other : table->stage->ixbar_use[group.first]) + if (other->groups.count(group.first)) + add_use(bytes_in_use, other->groups.at(group.first)); + int need = input.what->hi / 8U - input.what->lo / 8U + 1; + unsigned mask = (1U << need) - 1; + int max = (group_size(group.first.type) + 7) / 8 - need; + for (int i = 0; i <= max; i++, mask <<= 1) + if (!(bytes_in_use & mask)) { + input.lo = i * 8 + input.what->lo % 8U; + input.hi = (i + need - 1) * 8 + input.what->hi % 8U; + bytes_in_use |= mask; + LOG1("Putting " << input.what << " in bytes " << i << ".." 
<< i + need - 1 + << " of " << group.first << " in stage " + << table->stage->stageno); + break; + } + if (input.lo < 0) { + error(input.what.lineno, "No space in input xbar %s group %d for %s", + group_type(group.first.type), group.first.index, input.what.name()); + LOG1("Failed to put " << input.what << " into " << group.first << " in stage " + << table->stage->stageno); + LOG1(" inuse: " << GroupSet(use, group.first)); + } + } + } + for (auto &hash : hash_tables) { + for (auto &col : hash.second) { + if (!col.second.data && col.second.fn) { + gen_hash_column(col, hash); + } + } + } +} + +template +void InputXbar::write_regs(REGS ®s) { + LOG1("### Input xbar " << table->name() << " write_regs " << table->loc()); + auto &xbar = regs.dp.xbar_hash.xbar; + auto gress = timing_thread(table->gress); + for (auto &group : groups) { + if (group.second.empty()) continue; + LOG1(" # Input xbar group " << group.first); + unsigned group_base = 0; + unsigned half_byte = 0; + unsigned bytes_used = 0; + switch (group.first.type) { + case Group::EXACT: + group_base = group.first.index * 16U; + break; + case Group::TERNARY: + group_base = 128 + (group.first.index * 11 + 1) / 2U; + half_byte = 133 + 11 * (group.first.index / 2U); + xbar.mau_match_input_xbar_ternary_match_enable[gress] |= + 1 << (group.first.index) / 2U; + break; + case Group::BYTE: + group_base = 133 + 11 * group.first.index; + xbar.mau_match_input_xbar_ternary_match_enable[gress] |= 1 << (group.first.index); + break; + default: + BUG(); + } + for (auto &input : group.second) { + BUG_CHECK(input.lo >= 0); + unsigned word_group = 0, word_index = 0, swizzle_mask = 0; + bool hi_enable = false; + switch (input.what->reg.size) { + case 8: + word_group = (input.what->reg.ixbar_id() - 64) / 8U; + word_index = (input.what->reg.ixbar_id() - 64) % 8U + (word_group & 4) * 2; + swizzle_mask = 0; + break; + case 16: + word_group = (input.what->reg.ixbar_id() - 128) / 12U; + word_index = + (input.what->reg.ixbar_id() - 128) % 
12U + 16 + (word_group & 4) * 3; + swizzle_mask = 1; + break; + case 32: + word_group = input.what->reg.ixbar_id() / 8U; + word_index = input.what->reg.ixbar_id() % 8U; + hi_enable = word_group & 4; + swizzle_mask = 3; + break; + default: + BUG(); + } + word_group &= 3; + unsigned phv_byte = input.what->lo / 8U; + unsigned phv_size = input.what->reg.size / 8U; + for (unsigned byte = input.lo / 8U; byte <= input.hi / 8U; byte++, phv_byte++) { + bytes_used |= 1U << byte; + unsigned i = group_base + byte; + if (half_byte && byte == 5) i = half_byte; + if (i % phv_size != phv_byte) { + if (group.first.type != Group::EXACT) { + int off; + if (phv_size == 2) + off = (i & 2) ? -1 : 1; + else + off = tcam_swizzle_offset[i & 3][phv_byte]; + xbar.tswizzle.tcam_byte_swizzle_ctl[(i & 0x7f) / 4U].set_subfield( + off & 3U, 2 * (i % 4U), 2); + i += off; + } else { + error(input.what.lineno, "misaligned phv access on input_xbar"); + } + } + if (input.what->reg.ixbar_id() < 64) { + BUG_CHECK(input.what->reg.size == 32); + xbar.match_input_xbar_32b_ctl[word_group][i].match_input_xbar_32b_ctl_address = + word_index; + if (hi_enable) + xbar.match_input_xbar_32b_ctl[word_group][i] + .match_input_xbar_32b_ctl_hi_enable = 1; + else + xbar.match_input_xbar_32b_ctl[word_group][i] + .match_input_xbar_32b_ctl_lo_enable = 1; + } else { + xbar.match_input_xbar_816b_ctl[word_group][i] + .match_input_xbar_816b_ctl_address = word_index; + xbar.match_input_xbar_816b_ctl[word_group][i].match_input_xbar_816b_ctl_enable = + 1; + } + if ((i ^ phv_byte) & swizzle_mask) + error(input.what.lineno, "Need tcam swizzle for %s", + input.what.toString().c_str()); + } + auto &power_ctl = regs.dp.match_input_xbar_din_power_ctl; + // we do in fact want mau_id, not ixbar_id here! 
+ set_power_ctl_reg(power_ctl, input.what->reg.mau_id()); + } + if (group.first.type == Group::EXACT) { + unsigned enable = 0; + if (bytes_used & 0xff) enable |= 1; + if (bytes_used & 0xff00) enable |= 2; + enable <<= group.first.index * 2; + regs.dp.mau_match_input_xbar_exact_match_enable[gress].rewrite(); + regs.dp.mau_match_input_xbar_exact_match_enable[gress] |= enable; + } + } + auto &hash = regs.dp.xbar_hash.hash; + for (auto &ht : hash_tables) { + if (ht.second.empty()) continue; + LOG1(" # Input xbar hash table " << ht.first); + write_galois_matrix(regs, ht.first, ht.second); + } + for (auto &hg : hash_groups) { + LOG1(" # Input xbar hash group " << hg.first); + int grp = hg.first; + if (hg.second.tables) { + hash.parity_group_mask[grp][0] = hg.second.tables & 0xff; + hash.parity_group_mask[grp][1] = (hg.second.tables >> 8) & 0xff; + regs.dp.mau_match_input_xbar_exact_match_enable[gress].rewrite(); + regs.dp.mau_match_input_xbar_exact_match_enable[gress] |= hg.second.tables; + } + if (hg.second.seed) { + for (int bit = 0; bit < 52; ++bit) { + if ((hg.second.seed >> bit) & 1) { + hash.hash_seed[bit] |= UINT64_C(1) << grp; + } + } + } + if (gress == INGRESS) + regs.dp.hashout_ctl.hash_group_ingress_enable |= 1 << grp; + else + regs.dp.hashout_ctl.hash_group_egress_enable |= 1 << grp; + // Set hash parity check if enabled. 
The hash parity column data is set + // in pass2 + if (hg.second.tables && !options.disable_gfm_parity) { + // Enable check if parity bit is set on all tables in hash group + int parity_bit = -1; + for (int index : bitvec(hg.second.tables)) { + HashTable ht(HashTable::EXACT, index); + if (!hash_table_parity.count(ht)) { + continue; + } else { + if (parity_bit == -1) { + parity_bit = hash_table_parity[ht]; + } else { + if (hash_table_parity[ht] != parity_bit) + error(hg.second.lineno, + "Hash tables within a hash group " + "do not have the same parity bit - %d", + grp); + } + } + } + if (parity_bit >= 0) { + regs.dp.hashout_ctl.hash_parity_check_enable |= 1 << grp; + // Hash seed must have even parity for the group. Loop through + // all bits set on the group for hash seed to determine if the + // parity bit must be set + int seed_parity = 0; + for (int bit = 0; bit < 52; ++bit) { + auto seed_bit = (hash.hash_seed[bit] >> grp) & 0x1; + seed_parity ^= seed_bit; + } + if (seed_parity) { // flip parity bit setup on group for even parity + if (!hg.second.seed_parity) + warning(hg.second.lineno, + "hash group %d has parity enabled, but setting seed_parity" + " is disabled, changing seed to even parity", + grp); + hash.hash_seed[parity_bit] ^= (1 << grp); + } + } + } + } +} + +template void InputXbar::write_regs(Target::Tofino::mau_regs &); +template void InputXbar::write_regs(Target::JBay::mau_regs &); + +template +void InputXbar::write_xmu_regs(REGS ®s) { + BUG("no XMU regs for %s", Target::name()); +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void InputXbar::write_xmu_regs, mau_regs &) + +const InputXbar::Input *InputXbar::find(Phv::Slice sl, Group grp, Group *found) const { + const InputXbar::Input *rv = nullptr; + if (groups.count(grp)) { + for (auto &in : groups.at(grp)) { + if (in.lo < 0) continue; + if (in.what->reg.uid != sl.reg.uid) continue; + if (in.what->lo / 8U > sl.lo / 8U) continue; + if (in.what->hi / 8U < sl.hi / 8U) continue; + rv = ∈ + if 
(in.what->lo > sl.lo) continue; + if (in.what->hi < sl.hi) continue; + if (found) *found = grp; + return ∈ + } + } else if (grp.index == -1) { + for (auto &g : Keys(groups)) { + if (g.type != grp.type) continue; + if ((rv = find(sl, g))) { + if (found) *found = g; + return rv; + } + } + } + return rv; +} + +int InputXbar::find_offset(const MatchSource *, Group, int) const { + BUG("find_offset should not be needed on %s", Target::name()); +} + +std::vector InputXbar::find_all(Phv::Slice sl, Group grp) const { + std::vector rv; + if (groups.count(grp)) { + for (auto &in : groups.at(grp)) { + if (in.lo < 0) continue; + if (in.what->reg.uid != sl.reg.uid) continue; + if (in.what->lo / 8U > sl.lo / 8U) continue; + if (in.what->hi / 8U < sl.hi / 8U) continue; + rv.push_back(&in); + } + } else if (grp.index == -1) { + for (auto &g : Keys(groups)) { + if (g.type != grp.type) continue; + auto tmp = find_all(sl, g); + rv.insert(rv.end(), tmp.begin(), tmp.end()); + } + } + return rv; +} + +/** + * InputXbar::find_hash_inputs: find all of the ixbar inputs that feed a particular phv slice + * to a hash table + * @param sl the PHV container slice we're interested in + * @param hash_table which hash table we want the input for (-1 for all hash tables) + */ +std::vector InputXbar::find_hash_inputs(Phv::Slice sl, + HashTable ht) const { + /* code for tofino1/2 -- all hash tables take input from exact ixbar groups, with + * two hash tables per group (even in lower bits and odd in upper bits) + */ + BUG_CHECK(ht.type == HashTable::EXACT, "not an exact hash table: %s", ht.toString().c_str()); + auto rv = find_all(sl, Group(Group::EXACT, ht.index >= 0 ? 
ht.index / 2 : -1)); + if (ht.index >= 0) { + unsigned upper = ht.index % 2; + for (auto it = rv.begin(); it != rv.end();) { + unsigned bit = (*it)->lo + (sl.lo - (*it)->what->lo); + if (bit / 64 != upper || (bit + sl.size() - 1) / 64 != upper) + it = rv.erase(it); + else + ++it; + } + } + return rv; +} + +bitvec InputXbar::hash_group_bituse(int grp) const { + bitvec rv; + unsigned tables = 0; + for (auto &g : hash_groups) { + if (grp == -1 || static_cast(g.first) == grp) { + tables |= g.second.tables; + rv |= g.second.seed; + } + } + for (auto &tbl : hash_tables) { + if (tbl.first.type != HashTable::EXACT) continue; + if (!((tables >> tbl.first.index) & 1)) continue; + // Skip parity bit if set on hash table + auto hash_parity_bit = -1; + if (hash_table_parity.count(tbl.first)) { + hash_parity_bit = hash_table_parity.at(tbl.first); + } + for (auto &col : tbl.second) { + if (col.first == hash_parity_bit) continue; + rv[col.first] = 1; + } + } + return rv; +} + +// Used by LPF/WRED meters to determine the bytemask input +bitvec InputXbar::bytemask() { + bitvec bytemask; + // Only one ixbar group allowed for a meter input + if (match_group() == -1) return bytemask; + for (auto group : groups) { + auto &inputs = group.second; + for (auto &input : inputs) { + int byte_lo = input.lo / 8; + int byte_hi = input.hi / 8; + int byte_size = byte_hi - byte_lo + 1; + bytemask.setrange(byte_lo, byte_size); + } + } + return bytemask; +} + +std::vector InputXbar::hash_column(int col, int grp) const { + unsigned tables = 0; + std::vector rv; + for (auto &g : hash_groups) + if (grp == -1 || static_cast(g.first) == grp) tables |= g.second.tables; + for (auto &tbl : hash_tables) { + if (tbl.first.type != HashTable::EXACT) continue; + if (!((tables >> tbl.first.index) & 1)) continue; + if (const HashCol *c = getref(tbl.second, col)) rv.push_back(c); + } + return rv; +} + +bool InputXbar::log_hashes(std::ofstream &out) const { + bool logged = false; + for (auto &ht : hash_tables) { + // 
ht.first is HashTable + // ht.second is std::map, key is col + if (ht.second.empty()) continue; + out << std::endl << ht.first << std::endl; + logged = true; + for (auto &col : ht.second) { + // col.first is hash result bit + // col.second is bits XOR'd in + out << "result[" << col.first << "] = "; + out << get_seed_bit(ht.first.index / 2, col.first); + for (const auto &bit : col.second.data) { + if (auto ref = get_hashtable_bit(ht.first, bit)) { + std::string field_name = ref.name(); + auto field_bit = remove_name_tail_range(field_name) + ref.lobit(); + out << " ^ " << field_name << "[" << field_bit << "]"; + } + } + out << std::endl; + } + } + return logged; +} + +std::string InputXbar::HashTable::toString() const { + std::stringstream tmp; + tmp << *this; + return tmp.str(); +} + +unsigned InputXbar::HashTable::uid() const { + switch (type) { + case EXACT: + BUG_CHECK(index < Target::EXACT_HASH_TABLES(), "index too large: %s", + toString().c_str()); + return index; + case XCMP: + return index + Target::EXACT_HASH_TABLES(); + default: + BUG("invalid type: %s", toString().c_str()); + } +} diff --git a/backends/tofino/bf-asm/input_xbar.h b/backends/tofino/bf-asm/input_xbar.h new file mode 100644 index 00000000000..f5d64a18d8c --- /dev/null +++ b/backends/tofino/bf-asm/input_xbar.h @@ -0,0 +1,367 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_INPUT_XBAR_H_ +#define BACKENDS_TOFINO_BF_ASM_INPUT_XBAR_H_ + +#include + +#include "backends/tofino/bf-utils/dynamic_hash/dynamic_hash.h" +#include "constants.h" +#include "lib/ordered_map.h" +#include "phv.h" + +class Table; +class HashExpr; + +struct HashCol { + int lineno = -1; + HashExpr *fn = 0; + int bit = 0; + bitvec data; + unsigned valid = 0; // Used only in Tofino + void dbprint(std::ostream &out) const; +}; + +inline std::ostream &operator<<(std::ostream &out, HashCol &col) { + col.dbprint(out); + return out; +} + +struct DynamicIXbar { + int bit = -1; + std::vector> match_phv; + match_t match; + + DynamicIXbar() = default; + DynamicIXbar(const DynamicIXbar &) = default; + DynamicIXbar(DynamicIXbar &&) = default; + DynamicIXbar &operator=(const DynamicIXbar &) = default; + DynamicIXbar &operator=(DynamicIXbar &&) = default; + DynamicIXbar(const Table *, const pair_t &); +}; + +class InputXbar { + public: + struct Group { + short index; + enum type_t { INVALID, EXACT, TERNARY, BYTE, GATEWAY, XCMP } type; + Group() : index(-1), type(INVALID) {} + Group(Group::type_t t, int i) : index(i), type(t) {} + explicit operator bool() const { return type != INVALID; } + bool operator==(const Group &a) const { return type == a.type && index == a.index; } + bool operator<(const Group &a) const { + return (type << 16) + index < (a.type << 16) + a.index; + } + }; + struct HashTable { + short index; + enum type_t { INVALID, EXACT, XCMP } type; + HashTable() : index(-1), type(INVALID) {} + HashTable(type_t t, int i) : index(i), type(t) {} + explicit operator bool() const { return type != INVALID; } + bool operator==(const HashTable &a) const { return type == a.type && index == a.index; } + bool operator<(const HashTable &a) const { + return (type << 16) + index < (a.type << 16) + a.index; + } + std::string toString() const; + unsigned uid() const; + }; + + protected: + struct Input { + 
Phv::Ref what; + int lo, hi; + explicit Input(const Phv::Ref &a) : what(a), lo(-1), hi(-1) {} + Input(const Phv::Ref &a, int s) : what(a), lo(s), hi(-1) {} + Input(const Phv::Ref &a, int l, int h) : what(a), lo(l), hi(h) {} + unsigned size() const { return hi - lo + 1; } + }; + struct HashGrp { + int lineno = -1; + unsigned tables = 0; // Bit set for table index + uint64_t seed = 0; + bool seed_parity = false; // Parity to be set on the seed value + }; + Table *table; + ordered_map> groups; + std::map> hash_tables; + // Map of hash table index to parity bit set on the table + std::map hash_table_parity; + std::map hash_groups; + static bool conflict(const std::vector &a, const std::vector &b); + static bool conflict(const std::map &, const std::map &, int * = 0); + static bool conflict(const HashGrp &a, const HashGrp &b); + bool copy_existing_hash(HashTable ht, std::pair &col); + uint64_t hash_columns_used(HashTable hash); + uint64_t hash_columns_used(unsigned id) { + BUG_CHECK(id < Target::EXACT_HASH_TABLES(), "%d out of range for exact hash", id); + return hash_columns_used(HashTable(HashTable::EXACT, id)); + } + bool can_merge(HashGrp &a, HashGrp &b); + void add_use(unsigned &byte_use, std::vector &a); + virtual int hash_num_columns(HashTable ht) const { return 52; } + virtual int group_max_index(Group::type_t t) const; + virtual Group group_name(bool ternary, const value_t &value) const; + virtual int group_size(Group::type_t t) const; + const char *group_type(Group::type_t t) const; + void parse_group(Table *t, Group gr, const value_t &value); + virtual bool parse_hash(Table *t, const pair_t &kv) { return false; } + void parse_hash_group(HashGrp &hash_group, const value_t &value); + void parse_hash_table(Table *t, HashTable ht, const value_t &value); + virtual bool parse_unit(Table *t, const pair_t &kv) { return false; } + void setup_hash(std::map &, HashTable ht, gress_t, int stage, value_t &, + int lineno, int lo, int hi); + struct TcamUseCache { + std::map> 
tcam_use; + std::set ixbars_added; + }; + virtual void check_input(Group group, Input &input, TcamUseCache &tcam_use); + int tcam_input_use(int out_byte, int phv_byte, int phv_size); + void tcam_update_use(TcamUseCache &use); + void gen_hash_column(std::pair &col, + std::pair> &hash); + + struct GroupSet : public IHasDbPrint { + Group group; + const std::vector &use; + GroupSet(const std::vector &u, Group g) : group(g), use(u) {} + GroupSet(ordered_map> &u, Group g) : group(g), use(u[g]) {} + void dbprint(std::ostream &) const; + const Input *find(Phv::Slice sl) const; + std::vector find_all(Phv::Slice sl) const; + }; + + InputXbar() = delete; + InputXbar(const InputXbar &) = delete; + void input(Table *table, bool ternary, const VECTOR(pair_t) & data); + InputXbar(Table *table, int lineno) : table(table), lineno(lineno) {} + + public: + const int lineno; + int random_seed = -1; + static std::unique_ptr create(Table *table, const value_t *key = nullptr); + static std::unique_ptr create(Table *table, bool tern, const value_t &key, + const VECTOR(pair_t) & data); + void pass1(); + virtual void pass2(); + template + void write_regs(REGS ®s); + template + void write_xmu_regs(REGS ®s); + template + void write_galois_matrix(REGS ®s, HashTable id, const std::map &mat); + bool have_exact() const { + for (auto &grp : groups) + if (grp.first.type == Group::EXACT) return true; + return false; + } + bool have_ternary() const { + for (auto &grp : groups) + if (grp.first.type != Group::EXACT) return true; + return false; + } + int hash_group() const { + /* used by gateways to get the associated hash group */ + if (hash_groups.size() != 1) return -1; + return hash_groups.begin()->first; + } + bitvec hash_group_bituse(int grp = -1) const; + std::vector hash_column(int col, int grp = -1) const; + int match_group() { + /* used by gateways and stateful to get the associated match group */ + if (groups.size() != 1 || groups.begin()->first.type != Group::EXACT) return -1; + return 
groups.begin()->first.index; + } + bitvec bytemask(); + /* functions for tcam ixbar that take into account funny byte/word group stuff */ + unsigned tcam_width(); + int tcam_byte_group(int n); + int tcam_word_group(int n); + std::map> &get_hash_tables() { return hash_tables; } + const std::map &get_hash_table(HashTable id); + const std::map &get_hash_table(unsigned id = 0) { + return get_hash_table(HashTable(HashTable::EXACT, id)); + } + + // which Group provides the input for a given HashTable + virtual Group hashtable_input_group(HashTable ht) const { + BUG_CHECK(ht.type == HashTable::EXACT, "not an exact hash table"); + return Group(Group::EXACT, ht.index / 2); + } + virtual Phv::Ref get_hashtable_bit(HashTable id, unsigned bit) const { + BUG_CHECK(id.type == HashTable::EXACT, "not an exact hash table"); + return get_group_bit(Group(Group::EXACT, id.index / 2), bit + 64 * (id.index & 0x1)); + } + Phv::Ref get_hashtable_bit(unsigned id, unsigned bit) const { + return get_hashtable_bit(HashTable(HashTable::EXACT, id), bit); + } + Phv::Ref get_group_bit(Group grp, unsigned bit) const { + if (groups.count(grp)) + for (auto &in : groups.at(grp)) + if (bit >= unsigned(in.lo) && bit <= unsigned(in.hi)) + return Phv::Ref(in.what, bit - in.lo, bit - in.lo); + return Phv::Ref(); + } + std::string get_field_name(int bit) { + for (auto &g : groups) { + for (auto &p : g.second) { + if (bit <= p.hi && bit >= p.lo) return p.what.name(); + } + } + return ""; + } + bool is_p4_param_bit_in_hash(std::string p4_param_name, unsigned bit) { + for (auto &g : groups) { + for (auto &p : g.second) { + std::string phv_field_name = p.what.name(); + auto phv_field_lobit = remove_name_tail_range(phv_field_name); + phv_field_lobit += p.what.fieldlobit(); + auto phv_field_hibit = phv_field_lobit + p.size() - 1; + if (p4_param_name == phv_field_name && bit <= phv_field_hibit && + bit >= phv_field_lobit) + return true; + } + } + return false; + } + unsigned get_seed_bit(unsigned group, unsigned 
bit) const { + if (hash_groups.count(group)) return ((hash_groups.at(group).seed >> bit) & 0x1); + return 0; + } + HashGrp *get_hash_group(unsigned group = -1) { return ::getref(hash_groups, group); } + HashGrp *get_hash_group_from_hash_table(int hash_table) { + if (hash_table < 0 || hash_table >= Target::EXACT_HASH_TABLES()) return nullptr; + for (auto &hg : hash_groups) { + if (hg.second.tables & (1U << hash_table)) return &hg.second; + } + return nullptr; + } + bool log_hashes(std::ofstream &out) const; + virtual unsigned exact_physical_ids() const { return -1; } + + class all_iter { + decltype(groups)::const_iterator outer, outer_end; + bool inner_valid; + std::vector::const_iterator inner; + void mk_inner_valid() { + if (!inner_valid) { + if (outer == outer_end) return; + inner = outer->second.begin(); + } + while (inner == outer->second.end()) { + if (++outer == outer_end) return; + inner = outer->second.begin(); + } + inner_valid = true; + } + struct iter_deref : public std::pair { + explicit iter_deref(const std::pair &a) + : std::pair(a) {} + iter_deref *operator->() { return this; } + }; + + public: + all_iter(decltype(groups)::const_iterator o, decltype(groups)::const_iterator oend) + : outer(o), outer_end(oend), inner_valid(false) { + mk_inner_valid(); + } + bool operator==(const all_iter &a) { + if (outer != a.outer) return false; + if (inner_valid != a.inner_valid) return false; + return inner_valid ? 
inner == a.inner : true; + } + all_iter &operator++() { + if (inner_valid && ++inner == outer->second.end()) { + ++outer; + inner_valid = false; + mk_inner_valid(); + } + return *this; + } + std::pair operator*() { + return std::pair(outer->first, *inner); + } + iter_deref operator->() { return iter_deref(**this); } + }; + all_iter begin() const { return all_iter(groups.begin(), groups.end()); } + all_iter end() const { return all_iter(groups.end(), groups.end()); } + + const Input *find(Phv::Slice sl, Group grp, Group *found = nullptr) const; + const Input *find_exact(Phv::Slice sl, int group) const { + return find(sl, Group(Group::EXACT, group)); + } + virtual int find_offset(const MatchSource *, Group grp, int offset) const; + int find_gateway_offset(const MatchSource *ms, int offset) const { + return find_offset(ms, Group(Group::GATEWAY, 0), offset); + } + int find_match_offset(const MatchSource *ms, int offset = -1) const { + return find_offset(ms, Group(Group::EXACT, -1), offset); + } + + std::vector find_all(Phv::Slice sl, Group grp) const; + virtual std::vector find_hash_inputs(Phv::Slice sl, HashTable ht) const; + virtual int global_bit_position_adjust(HashTable ht) const { + BUG_CHECK(ht.type == HashTable::EXACT, "not an exact hash table"); + return (ht.index / 2) * 128; + } + virtual bitvec global_column0_extract( + HashTable ht, const hash_column_t matrix[PARITY_GROUPS_DYN][HASH_MATRIX_WIDTH_DYN]) const { + BUG_CHECK(ht.type == HashTable::EXACT, "not an exact hash table"); + return bitvec(matrix[ht.index][0].column_value); + } + virtual void setup_match_key_cfg(const MatchSource *) {} // noop for tofino1/2 +}; + +inline std::ostream &operator<<(std::ostream &out, InputXbar::Group gr) { + switch (gr.type) { + case InputXbar::Group::EXACT: + out << "exact"; + break; + case InputXbar::Group::TERNARY: + out << "ternary"; + break; + case InputXbar::Group::BYTE: + out << "byte"; + break; + case InputXbar::Group::GATEWAY: + out << "gateway"; + break; + case 
InputXbar::Group::XCMP: + out << "xcmp"; + break; + default: + out << "(gr.type) << ">"; + } + return out << " ixbar group " << gr.index; +} + +inline std::ostream &operator<<(std::ostream &out, InputXbar::HashTable ht) { + switch (ht.type) { + case InputXbar::HashTable::EXACT: + out << "exact"; + break; + case InputXbar::HashTable::XCMP: + out << "xcmp"; + break; + default: + out << "(ht.type) << ">"; + } + return out << " hashtable " << ht.index; +} + +#endif /* BACKENDS_TOFINO_BF_ASM_INPUT_XBAR_H_ */ diff --git a/backends/tofino/bf-asm/instruction.cpp b/backends/tofino/bf-asm/instruction.cpp new file mode 100644 index 00000000000..415930d8d24 --- /dev/null +++ b/backends/tofino/bf-asm/instruction.cpp @@ -0,0 +1,1738 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "instruction.h" + +#include "action_bus.h" +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "depositfield.h" +#include "phv.h" +#include "power_ctl.h" + +namespace { +constexpr int RotationBits = 16; +} + +std::multimap + Instruction::Decode::opcode[Instruction::NUM_SETS]; + +Instruction::Decode::Decode(const char *name, int set, bool ts) : type_suffix(ts) { + targets = ~0U; + for (auto d : ValuesForKey(opcode[set], name)) { + BUG_CHECK(!(d->targets & 1)); + targets &= ~d->targets; + } + BUG_CHECK(targets > 1); + opcode[set].emplace(name, this); +} +Instruction::Decode::Decode(const char *name, target_t target, int set, bool ts) : type_suffix(ts) { + targets = 1 << target; + for (auto d : ValuesForKey(opcode[set], name)) { + if (d->targets & 1) { + d->targets &= ~targets; + BUG_CHECK(d->targets > 1); + } + } + opcode[set].emplace(name, this); +} +Instruction::Decode::Decode(const char *name, std::set target, int set, bool ts) + : type_suffix(ts), targets(0) { + for (auto t : target) targets |= 1 << t; + BUG_CHECK(targets > 1); + for (auto d : ValuesForKey(opcode[set], name)) { + if (d->targets & 1) { + d->targets &= ~targets; + BUG_CHECK(d->targets > 1); + } + } + opcode[set].emplace(name, this); +} + +Instruction *Instruction::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) { + for (auto d : ValuesForKey(Instruction::Decode::opcode[tbl->instruction_set()], op[0].s)) { + if ((d->targets >> Target::register_set()) & 1) { + auto inst = d->decode(tbl, act, op); + if (!inst) continue; + return inst; + } + } + if (auto p = strchr(op[0].s, '.')) { + std::string opname(op[0].s, p - op[0].s); + for (auto d : ValuesForKey(Instruction::Decode::opcode[tbl->instruction_set()], opname)) { + if (((d->targets >> options.target) & 1) && d->type_suffix) { + auto inst = d->decode(tbl, act, 
op); + if (!inst) continue; + return inst; + } + } + } + return 0; +} + +namespace VLIW { +static const int group_size[] = {32, 32, 32, 32, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16}; + +struct Operand : public IHasDbPrint { + /** A source operand to a VLIW instruction -- this can be a variety of things, so we + * have a pointer to an abstract base class and a number of derived concrete classes for + * the different kinds of operands. When we parse the operand, the type may be determined, + * or if it is just a name, we will have to wait to a later pass to resolve what the + * name refers to. At that point, the `Named' object created in parsing will be replaced + * with the actual operand type */ + static const int ACTIONBUS_OPERAND = 0x20; + struct Base { + int lineno; + explicit Base(int line) : lineno(line) {} + Base(const Base &a) : lineno(a.lineno) {} + virtual ~Base() {} + virtual Base *clone() = 0; + virtual Base *lookup(Base *&ref) { return this; } + virtual bool check() { return true; } + virtual int phvGroup() { return -1; } + virtual int bits(int group, int dest_size = -1) = 0; + virtual unsigned bitoffset(int group) const { return 0; } + virtual void dbprint(std::ostream &) const = 0; + virtual bool equiv(const Base *) const = 0; + virtual bool phvRead(std::function) { return false; } + /** pass1 called as part of pass1 processing of stage + * @param tbl table containing the action with the instruction with this operand + * @param group mau PHV group of the ALU (dest) for this instruction */ + virtual void pass1(Table *tbl, int group) {} + /** pass2 called as part of pass2 processing of stage + * @param group mau PHV group of the ALU (dest) for this instruction */ + virtual void pass2(int group) {} + } *op; + struct Const : Base { + int64_t value; + Const(int line, int64_t v) : Base(line), value(v) {} + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return value == a->value; + } else { + return false; + } + } + Const *clone() 
override { return new Const(*this); } + int32_t bits(int group, int dest_size = -1) override { + // assert(value <= 0xffffffffLL); + int32_t val = value; + if (val > 0 && ((val >> (group_size[group] - 1)) & 1)) + val |= UINT64_MAX << group_size[group]; + int minconst = Target::MINIMUM_INSTR_CONSTANT(); + + if (dest_size != -1) { // DepositField::encode() calling. + auto rotConst = + DepositField::discoverRotation(val, group_size[group], 8, minconst - 1); + if (rotConst.rotate) + return (rotConst.value + 24) | (rotConst.rotate << RotationBits); + } + + if (val >= minconst && val < 8) return val + 24; + error(lineno, "constant value %" PRId64 " out of range for immediate", value); + return -1; + } + void dbprint(std::ostream &out) const override { out << value; } + }; + struct Phv : Base { + ::Phv::Ref reg; + Phv(int line, gress_t g, int stage, const value_t &n) : Base(line), reg(g, stage, n) {} + Phv(int line, gress_t g, int stage, const std::string &n, int l, int h) + : Base(line), reg(g, stage, line, n, l, h) {} + explicit Phv(const ::Phv::Ref &r) : Base(r.lineno), reg(r) {} + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return reg == a->reg; + } else { + return false; + } + } + Phv *clone() override { return new Phv(*this); } + bool check() override { + if (!reg.check()) return false; + if (reg->reg.mau_id() < 0) { + error(reg.lineno, "%s not accessable in mau", reg->reg.name); + return false; + } + return true; + } + int phvGroup() override { return reg->reg.mau_id() / ::Phv::mau_groupsize(); } + int bits(int group, int dest_size = -1) override { + if (group != phvGroup()) { + error(lineno, "registers in an instruction must all be in the same phv group"); + return -1; + } + return reg->reg.mau_id() % ::Phv::mau_groupsize(); + } + unsigned bitoffset(int group) const override { return reg->lo; } + void pass1(Table *tbl, int) override { + tbl->stage->action_use[tbl->gress][reg->reg.uid] = true; + } + void dbprint(std::ostream 
&out) const override { out << reg; } + bool phvRead(std::function fn) override { + fn(*reg); + return true; + } + }; + struct Action : Base { + /* source referring to either an action data or immediate field OR an attached table + * output. All of these are accessed via the action data bus */ + std::string name; + std::string p4name; + TableOutputModifier mod = TableOutputModifier::NONE; + Table *table; + Table::Format::Field *field; + int lo, hi; + + Action(int line, const std::string &n, Table *tbl, Table::Format::Field *f, unsigned l, + unsigned h) + : Base(line), name(n), table(tbl), field(f), lo(l), hi(h) {} + Action(int line, const std::string &n, TableOutputModifier mod, Table *tbl, unsigned l, + unsigned h) + : Base(line), name(n), mod(mod), table(tbl), field(nullptr), lo(l), hi(h) {} + Action(int line, const std::string &n, Table *tbl, Table::Format::Field *f, unsigned l, + unsigned h, const std::string &m) + : Base(line), name(n), p4name(m), table(tbl), field(f), lo(l), hi(h) {} + Action(int line, const std::string &n, TableOutputModifier mod, Table *tbl, unsigned l, + unsigned h, const std::string &m) + : Base(line), name(n), p4name(m), mod(mod), table(tbl), field(nullptr), lo(l), hi(h) {} + bool equiv(const Base *a_) const override { + auto *a = dynamic_cast(a_); + if (!a || lo != a->lo || hi != a->hi) return false; + if (name == a->name && table == a->table && field == a->field && mod == a->mod) + return true; + if (field != a->field && (!field || !a->field)) return false; + int b1 = field ? table->find_on_actionbus(field, lo, hi, 0) + : table->find_on_actionbus(name, mod, lo, hi, 0); + int b2 = a->field ? 
a->table->find_on_actionbus(a->field, lo, hi, 0) + : a->table->find_on_actionbus(a->name, mod, lo, hi, 0); + return b1 == b2 && b1 >= 0; + } + Action *clone() override { return new Action(*this); } + int bits(int group, int dest_size = -1) override { + int size = group_size[group] / 8U; + BUG_CHECK(lo >= 0 && hi >= 0); + unsigned lo = this->lo, hi = this->hi; + if (dest_size > 0) { + // override size based on destination size for deposit-field + hi = lo + dest_size - 1; + unsigned mask = group_size[group] - 1; // group size is power of 2 (8, 16, or 32) + if ((hi | mask) != (lo | mask)) { + // crosses slot boundary, so is a wrap-around rotated source -- need all of it + lo &= ~mask; + hi = lo | mask; + } + } + int byte = field ? table->find_on_actionbus(field, lo, hi, size) + : table->find_on_actionbus(name, mod, lo, hi, size); + if (byte < 0) { + if (this->lo > 0 || (field && this->hi + 1 < int(field->size))) + error(lineno, "%s(%d..%d) is not on the action bus", name.c_str(), lo, hi); + else + error(lineno, "%s is not on the action bus", name.c_str()); + return -1; + } + int byte_value = byte; + if (size == 2) byte -= 32; + if (byte < 0 || byte > 32 * size) + error(lineno, "action bus entry %d(%s) out of range for %d-bit access", byte_value, + name.c_str(), size * 8); + // else if (byte % size != 0) + // error(lineno, "action bus entry %d(%s) misaligned for %d-bit access", + // byte_value, name.c_str(), size*8); + else + return ACTIONBUS_OPERAND + byte / size; + return -1; + } + void pass1(Table *tbl, int group) override { + if (field) field->flags |= Table::Format::Field::USED_IMMED; + if (lo >= 0 && hi >= 0 && lo / group_size[group] != hi / group_size[group]) { + error(lineno, + "action bus slice (%d..%d) can't fit in a single slot for %d bit " + "access", + lo, hi, group_size[group]); + // chop it down to be in range (avoid error cascade) + hi = lo | (group_size[group] - 1); + } + } + void pass2(int group) override { + int bits = group_size[group]; + unsigned 
bytes = bits / 8U; + if (lo < 0) lo = 0; + if (hi < 0) hi = lo + bits - 1; + if (hi > lo + bits - 1) { + warning(lineno, "%s(%d..%d) larger than %d bit access", name.c_str(), lo, hi, bits); + hi = lo + bits - 1; + } + if ((lo ^ hi) & ~(bits - 1)) + error(lineno, "%s(%d..%d) can't be accessed by %d bit PHV", name.c_str(), lo, hi, + bits); + if (field && table->find_on_actionbus(field, lo, hi, bytes) < 0) { + int immed_offset = 0; + if (table->format && table->format->immed) + immed_offset = table->format->immed->bit(0); + int l = field->bit(lo) - immed_offset, h = field->bit(hi) - immed_offset; + if (l % bits != 0 && l / bits != h / bits) + error(lineno, "%s misaligned for action bus", name.c_str()); + table->need_on_actionbus(field, lo, hi, bytes); + } else if (!field && table->find_on_actionbus(name, mod, lo, hi, bytes) < 0) { + if (auto *tbl = ::get(Table::all, name)) + table->need_on_actionbus(tbl, mod, lo, hi, bytes); + else + error(lineno, "Can't find any operand named %s", name.c_str()); + } + } + unsigned bitoffset(int group) const override { + int size = group_size[group] / 8U; + int byte = field ? table->find_on_actionbus(field, lo, hi, size) + : table->find_on_actionbus(name, lo, hi, size); + return 8 * (byte % size) + lo % 8; + } + void dbprint(std::ostream &out) const override { + out << name << mod << '(' << lo << ".." 
<< hi << ')'; + if (field) + out << '[' << field->bits[0].lo << ':' << field->size << ", " << field->group + << ']'; + } + }; + struct RawAction : Base { + int index; + unsigned offset; + + RawAction(int line, int idx, unsigned off) : Base(line), index(idx), offset(off) {} + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return index == a->index && offset == a->offset; + } else { + return false; + } + } + RawAction *clone() override { return new RawAction(*this); } + int bits(int group, int dest_size = -1) override { return ACTIONBUS_OPERAND + index; } + unsigned bitoffset(int group) const override { return offset; } + void dbprint(std::ostream &out) const override { out << 'A' << index; } + }; + struct HashDist : Base { + Table *table; + std::vector units; + int lo = -1, hi = -1; + + HashDist(int line, Table *t) : Base(line), table(t) {} + HashDist(int line, Table *t, int unit) : Base(line), table(t) { units.push_back(unit); } + unsigned bitoffset(int group) const override { return lo >= 0 ? lo : 0; } + static HashDist *parse(Table *tbl, const VECTOR(value_t) & v) { + if (v.size < 2 || v[0] != "hash_dist") return nullptr; + auto *rv = new HashDist(v[0].lineno, tbl); + for (int i = 1; i < v.size; ++i) { + if (v[i].type == tRANGE && rv->lo == -1) { + rv->lo = v[i].range.lo; + rv->hi = v[i].range.hi; + } else if (CHECKTYPE(v[i], tINT)) { + rv->units.push_back(v[i].i); + } else { + delete rv; + return nullptr; + } + } + return rv; + } + + HashDistribution *find_hash_dist(int unit) const { + if (auto rv = table->find_hash_dist(unit)) return rv; + for (auto *mtab : table->get_match_tables()) + if (auto rv = mtab->find_hash_dist(unit)) return rv; + return nullptr; + } + bool equiv(const Base *a_) const override { + auto *a = dynamic_cast(a_); + if (!a || units != a->units || lo != a->lo || hi != a->hi) return false; + if (table == a->table) return true; + int elo = this->lo < 0 ? 0 : lo; + int ehi = this->hi < 0 ? 
15 : hi; + for (auto unit : units) { + int b1 = table->find_on_actionbus(find_hash_dist(unit), elo, ehi, 0); + int b2 = a->table->find_on_actionbus(a->find_hash_dist(unit), elo, ehi, 0); + if (b1 != b2 || b1 < 0) return false; + } + return true; + } + HashDist *clone() override { return new HashDist(*this); } + void pass2(int group) override { + if (units.size() > 2) { + error(lineno, "Can't use more than 2 hash_dist units together in an action"); + return; + } + int size = group_size[group] / 8U; + if (lo < 0) lo = 0; + if (hi < 0) hi = 8 * size - 1; + if ((lo ^ hi) & ~(8 * size - 1)) + error(lineno, "hash dist slice(%d..%d) can't be accessed by %d bit PHV", lo, hi, + 8 * size); + if (units.size() == 2) { + if (size != 4) + error(lineno, "Can't combine hash_dist units in %d bit operation", size * 8); + auto xbar_use = HashDistribution::IMMEDIATE_LOW; + for (auto u : units) { + if (auto hd = find_hash_dist(u)) + hd->xbar_use |= xbar_use; + else + error(lineno, "No hash dist %d in table %s", u, table->name()); + xbar_use = HashDistribution::IMMEDIATE_HIGH; + } + } else if (auto hd = find_hash_dist(units.at(0))) { + if (hd->xbar_use & HashDistribution::IMMEDIATE_HIGH) { + if (size == 4) { + lo += 16; + hi += 16; + } + } else { + hd->xbar_use |= HashDistribution::IMMEDIATE_LOW; + } + } else { + error(lineno, "No hash dist %d in table %s", units.at(0), table->name()); + } + int lo = this->lo; + for (auto u : units) { + if (auto hd = find_hash_dist(u)) { + if (table->find_on_actionbus(hd, lo, hi, size) < 0) + table->need_on_actionbus(hd, lo, hi, size); + lo += 16; + } + } + } + int bits(int group, int dest_size = -1) override { + int size = group_size[group] / 8U; + auto hd = find_hash_dist(units.at(0)); + if (!hd) error(lineno, "could not find hash dist"); + int byte = table->find_on_actionbus(hd, lo, hi, size); + if (byte < 0) { + error(lineno, "hash dist %d is not on the action bus", (hd ? 
hd->id : -1)); + return -1; + } + if (units.size() == 2) { + auto hd1 = find_hash_dist(units.at(1)); + if (!hd1) error(lineno, "could not find hash dist"); + if (table->find_on_actionbus(ActionBusSource(hd, hd1), lo + 16, hi, size) < 0) + error(lineno, "hash dists %d and %d not contiguous on the action bus", + (hd ? hd->id : -1), (hd1 ? hd1->id : -1)); + } + if (size == 2) byte -= 32; + if (byte >= 0 && byte < 32 * size) return ACTIONBUS_OPERAND + byte / size; + error(lineno, "action bus entry %d(hash_dist %d) out of range for %d-bit access", + size == 2 ? byte + 32 : byte, hd->id, size * 8); + return -1; + } + void dbprint(std::ostream &out) const override { + out << "hash_dist("; + const char *sep = ""; + for (auto u : units) { + out << sep << u; + sep = ", "; + } + out << ")"; + } + }; + struct RandomGen : Base { + Table *table; + RandomNumberGen rng; + int lo = 0, hi = -1; + RandomGen(Table *t, const VECTOR(value_t) & v) : Base(v[0].lineno), table(t), rng(0) { + if (v.size > 1 && CHECKTYPE(v[1], tINT)) rng.unit = v[1].i; + if (rng.unit < 0 || rng.unit > 1) error(v[0].lineno, "invalid random number generator"); + if (v.size > 2 && CHECKTYPE(v[2], tRANGE)) { + lo = v[2].range.lo; + hi = v[2].range.hi; + if (lo < 0 || hi > 31 || hi < lo) + error(v[2].lineno, "invalid random number generator slice"); + } + } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return rng == a->rng && lo == a->lo && hi == a->hi; + } else { + return false; + } + } + RandomGen *clone() override { return new RandomGen(*this); } + void pass2(int group) override { + unsigned size = group_size[group]; + if (hi < 0) hi = lo + 8 * size - 1; + if ((lo ^ hi) & ~(8 * size - 1)) + error(lineno, "invalid slice(%d..%d) of rng %d for use with %d bit PHV", lo, hi, + rng.unit, size); + if (table->find_on_actionbus(rng, lo, hi, size / 8U)) + table->need_on_actionbus(rng, lo, hi, size / 8U); + } + int bits(int group, int dest_size = -1) override { + int size = 
group_size[group] / 8U; + int byte = table->find_on_actionbus(rng, lo, hi, size); + if (byte < 0) { + error(lineno, "rng %d is not on the action bus", rng.unit); + return -1; + } + if (size == 2) byte -= 32; + if (byte >= 0 && byte < 32 * size) return ACTIONBUS_OPERAND + byte / size; + error(lineno, "action bus entry %d(rng %d) out of range for %d-bit access", + size == 2 ? byte + 32 : byte, rng.unit, size * 8); + return -1; + } + unsigned bitoffset(int group) const override { return lo; } + void dbprint(std::ostream &out) const override { + out << "rng " << rng.unit << '(' << lo << ".." << hi << ')'; + } + }; + struct Named : Base { + std::string name; + std::string p4name; + TableOutputModifier mod = TableOutputModifier::NONE; + int lo, hi; + Table *tbl; + std::string action; + + Named(int line, const std::string &n, int l, int h, Table *t, const std::string &act) + : Base(line), name(n), lo(l), hi(h), tbl(t), action(act) {} + Named(int line, const std::string &n, TableOutputModifier m, int l, int h, Table *t, + const std::string &act) + : Base(line), name(n), mod(m), lo(l), hi(h), tbl(t), action(act) {} + Named(int line, const std::string &n, int l, int h, Table *t, const std::string &act, + std::string &m) + : Base(line), name(n), p4name(m), lo(l), hi(h), tbl(t), action(act) {} + Named(int line, const std::string &n, TableOutputModifier mod, int l, int h, Table *t, + const std::string &act, std::string &m) + : Base(line), name(n), p4name(m), mod(mod), lo(l), hi(h), tbl(t), action(act) {} + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return name == a->name && lo == a->lo && hi == a->hi && tbl == a->tbl && + action == a->action; + } else { + return false; + } + } + Base *lookup(Base *&ref) override; + Named *clone() override { return new Named(*this); } + bool check() override { + BUG(); + return true; + } + int phvGroup() override { + BUG(); + return -1; + } + int bits(int group, int dest_size = -1) override { + BUG(); + 
return 0; + } + unsigned bitoffset(int group) const override { + BUG(); + return 0; + } + void pass1(Table *, int) override { BUG(); } + void dbprint(std::ostream &out) const override { + out << name; + if (lo >= 0) { + out << '(' << lo; + if (hi >= 0 && hi != lo) out << ".. " << hi; + out << ')'; + } + out << '[' << tbl->name() << ':' << action << ']'; + } + }; + Operand() : op(0) {} + Operand(const Operand &a) : op(a.op ? a.op->clone() : 0) {} + Operand(Operand &&a) : op(a.op) { a.op = 0; } + Operand &operator=(const Operand &a) { + if (&a != this) { + delete op; + op = a.op ? a.op->clone() : 0; + } + return *this; + } + Operand &operator=(Operand &&a) { + if (&a != this) { + delete op; + op = a.op; + a.op = 0; + } + return *this; + } + ~Operand() { delete op; } + Operand(Table *tbl, const Table::Actions::Action *act, const value_t &v); + Operand(gress_t gress, int stage, const value_t &v) : op(new Phv(v.lineno, gress, stage, v)) {} + explicit Operand(const ::Phv::Ref &r) : op(new Phv(r)) {} + bool valid() const { return op != 0; } + bool operator==(Operand &a) { + return op == a.op || (op && a.op && op->lookup(op)->equiv(a.op->lookup(a.op))); + } + unsigned bitoffset(int group) { return op->lookup(op)->bitoffset(group); } + bool check() { return op && op->lookup(op) ? 
op->check() : false; } + int phvGroup() { return op->lookup(op)->phvGroup(); } + bool phvRead(std::function fn) { + return op->lookup(op)->phvRead(fn); + } + int bits(int group, int dest_size = -1) { return op->lookup(op)->bits(group, dest_size); } + void dbprint(std::ostream &out) const { op->dbprint(out); } + Base *operator->() { return op->lookup(op); } + template + T *to() { + return dynamic_cast(op->lookup(op)); + } +}; + +static void parse_slice(const VECTOR(value_t) & vec, int idx, int &lo, int &hi) { + if (PCHECKTYPE2(vec.size == idx + 1, vec[idx], tINT, tRANGE)) { + if (vec[idx].type == tINT) { + lo = hi = vec[idx].i; + } else { + lo = vec[idx].range.lo; + hi = vec[idx].range.hi; + } + } +} + +Operand::Operand(Table *tbl, const Table::Actions::Action *act, const value_t &v) : op(0) { + if (v.type == tINT) { + op = new Const(v.lineno, v.i); + } else if (CHECKTYPE2(v, tSTR, tCMD)) { + std::string name = v.type == tSTR ? v.s : v[0].s; + std::string p4name = name; + TableOutputModifier mod = TableOutputModifier::NONE; + int lo = -1, hi = -1; + if (v.type == tCMD) { + if (v == "hash_dist" && (op = HashDist::parse(tbl, v.vec))) return; + if (v == "rng" && (op = new RandomGen(tbl, v.vec))) return; + if (v.vec.size > 1 && (v[1] == "color" || v[1] == "address")) { + if (v[1] == "color") mod = TableOutputModifier::Color; + if (v[1] == "address") mod = TableOutputModifier::Address; + if (v[1].type == tCMD) + parse_slice(v[1].vec, 1, lo, hi); + else if (v.vec.size > 2) + parse_slice(v.vec, 2, lo, hi); + } else { + parse_slice(v.vec, 1, lo, hi); + } + } + name = act->alias_lookup(v.lineno, name, lo, hi); + if (name == "hash_dist" && lo == hi) { + auto hd = new HashDist(v.lineno, tbl, lo); + if (v.type == tCMD && v[1].type == tRANGE) { + hd->lo = v[1].range.lo; + hd->hi = v[1].range.hi; + } + op = hd; + return; + } + op = new Named(v.lineno, name, mod, lo, hi, tbl, act->name, p4name); + } +} + +auto Operand::Named::lookup(Base *&ref) -> Base * { + int slot, len = -1; + 
if (tbl->action) tbl = tbl->action; + int lo = this->lo >= 0 ? this->lo : 0; + if (auto *field = tbl->lookup_field(name, action)) { + if (!options.match_compiler) { + /* FIXME -- The glass compiler generates refs past the end of action table fields + * like these, and just accesses whatever bits happen to be there. So we + * supress these error checks for compatibility (ex: tests/action_bus1.p4) */ + if ((unsigned)lo >= field->size) { + error(lineno, "Bit %d out of range for field %s", lo, name.c_str()); + ref = 0; + } else if (hi >= 0 && (unsigned)hi >= field->size) { + error(lineno, "Bit %d out of range for field %s", hi, name.c_str()); + ref = 0; + } + } + if (ref) { + ref = new Action(lineno, name, tbl, field, lo, hi >= 0 ? hi : field->size - 1, p4name); + } + } else if (tbl->find_on_actionbus(name, mod, lo, hi >= 0 ? hi : 7, 0, &len) >= 0) { + ref = new Action(lineno, name, mod, tbl, lo, hi >= 0 ? hi : len - 1, p4name); + } else if (::Phv::get(tbl->gress, tbl->stage->stageno, name)) { + ref = new Phv(lineno, tbl->gress, tbl->stage->stageno, name, lo, hi); + } else if (sscanf(name.c_str(), "A%d%n", &slot, &len) >= 1 && + len == static_cast(name.size()) && slot >= 0 && slot < 32) { + ref = new RawAction(lineno, slot, lo); + } else if (name == "hash_dist" && (lo == hi || hi < 0)) { + ref = new HashDist(lineno, tbl, lo); + } else if (Table::all->count(name)) { + ref = new Action(lineno, name, mod, tbl, lo, hi, p4name); + } else { + ref = new Phv(lineno, tbl->gress, tbl->stage->stageno, name, this->lo, hi); + } + if (ref != this) delete this; + return ref; +} + +struct VLIWInstruction : Instruction { + explicit VLIWInstruction(int l) : Instruction(l) {} + virtual int encode() = 0; + template + void write_regs_2(REGS ®s, Table *tbl, Table::Actions::Action *act); + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +// target specific template specializations +#include "jbay/instruction.cpp" // NOLINT(build/include) +#include 
"tofino/instruction.cpp" // NOLINT(build/include) + +struct AluOP : VLIWInstruction { + enum special_flags { + Commutative = 1, + IgnoreSrc1 = 2, + IgnoreSrc2 = 4, + IgnoreSrcs = 6, + CanSliceWithConst = 8 + }; + const struct Decode : Instruction::Decode { + std::string name; + unsigned opcode; + const Decode *swap_args; + int flags = 0; + Decode(const char *n, unsigned opc, int flgs = 0, const char *alias_name = 0) + : Instruction::Decode(n), + name(n), + opcode(opc), + swap_args(flgs & Commutative ? this : 0), + flags(flgs) { + if (alias_name) alias(alias_name); + } + Decode(const char *n, target_t targ, unsigned opc, int flgs = 0) + : Instruction::Decode(n, targ), + name(n), + opcode(opc), + swap_args(flgs & Commutative ? this : 0), + flags(flgs) {} + Decode(const char *n, std::set targ, unsigned opc, int flgs = 0, + const char *alias_name = 0) + : Instruction::Decode(n, targ), + name(n), + opcode(opc), + swap_args(flgs & Commutative ? this : 0), + flags(flgs) { + if (alias_name) alias(alias_name); + } + Decode(const char *n, unsigned opc, int flgs, Decode *sw, const char *alias_name = 0) + : Instruction::Decode(n), name(n), opcode(opc), swap_args(sw), flags(flgs) { + if (sw && !sw->swap_args) sw->swap_args = this; + if (alias_name) alias(alias_name); + } + Decode(const char *n, unsigned opc, Decode *sw, const char *alias_name = 0) + : Instruction::Decode(n), name(n), opcode(opc), swap_args(sw) { + if (sw && !sw->swap_args) sw->swap_args = this; + if (alias_name) alias(alias_name); + } + Decode(const char *n, target_t targ, unsigned opc, Decode *sw, const char *alias_name = 0) + : Instruction::Decode(n, targ), name(n), opcode(opc), swap_args(sw) { + if (sw && !sw->swap_args) sw->swap_args = this; + if (alias_name) alias(alias_name); + } + Decode(const char *n, std::set targ, unsigned opc, Decode *sw, + const char *alias_name = 0) + : Instruction::Decode(n, targ), name(n), opcode(opc), swap_args(sw) { + if (sw && !sw->swap_args) sw->swap_args = this; + if 
(alias_name) alias(alias_name); + } + Decode(const char *n, std::set targ, unsigned opc, int flgs, Decode *sw, + const char *alias_name = 0) + : Instruction::Decode(n, targ), name(n), opcode(opc), swap_args(sw), flags(flgs) { + if (sw && !sw->swap_args) sw->swap_args = this; + if (alias_name) alias(alias_name); + } + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + Phv::Ref dest; + Operand src1, src2; + bool ignoreSrc1 = false, ignoreSrc2 = false; + AluOP(const Decode *op, Table *tbl, const Table::Actions::Action *act, const value_t &d, + const value_t &s1, const value_t &s2) + : VLIWInstruction(d.lineno), + opc(op), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src1(tbl, act, s1), + src2(tbl, act, s2) {} + std::string name() override { return opc->name; } + Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *tbl, Table::Actions::Action *) override { + if (!ignoreSrc1) src1->pass2(slot / Phv::mau_groupsize()); + if (!ignoreSrc2) src2->pass2(slot / Phv::mau_groupsize()); + } + int encode() override; + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { + bool rv = false; + if (!ignoreSrc1) rv |= src1.phvRead(fn); + if (!ignoreSrc2) rv |= src2.phvRead(fn); + return rv; + } + void dbprint(std::ostream &out) const override { + out << "INSTR: " << opc->name << ' ' << dest << ", " << src1 << ", " << src2; + } +}; + +struct AluOP3Src : AluOP { + struct Decode : AluOP::Decode { + Decode(const char *n, unsigned opc) : AluOP::Decode(n, opc) {} + Decode(const char *n, target_t t, unsigned opc) : AluOP::Decode(n, t, opc) {} + Decode(const char *n, std::set t, unsigned opc) : AluOP::Decode(n, t, opc) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + }; + Operand src3; + AluOP3Src(const Decode *op, Table *tbl, const Table::Actions::Action *act, const value_t &d, + 
const value_t &s1, const value_t &s2, const value_t &s3) + : AluOP(op, tbl, act, d, s1, s2), src3(tbl, act, s3) {} + Instruction *pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *tbl, Table::Actions::Action *); +}; + +Instruction *AluOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + AluOP *rv; + if (op.size == 4) { + rv = new AluOP(this, tbl, act, op.data[1], op.data[2], op.data[3]); + } else if (op.size == 3) { + if (!(flags & IgnoreSrc1) && (flags & IgnoreSrc2)) { + rv = new AluOP(this, tbl, act, op.data[1], op.data[2], op.data[2]); + rv->ignoreSrc2 = true; + } else { + rv = new AluOP(this, tbl, act, op.data[1], op.data[1], op.data[2]); + rv->ignoreSrc1 = (flags & IgnoreSrc1) != 0; + } + } else if (op.size == 3 && (flags & IgnoreSrc1) && (flags & IgnoreSrc2)) { + rv = new AluOP(this, tbl, act, op.data[1], op.data[1], op.data[1]); + rv->ignoreSrc1 = rv->ignoreSrc2 = true; + } else { + error(op[0].lineno, "%s requires 2 or 3 operands", op[0].s); + return 0; + } + if (!rv->src1.valid()) + error(op[2].lineno, "invalid src1"); + else if (!rv->src2.valid()) + error(op[3].lineno, "invalid src2"); + else + return rv; + delete rv; + return 0; +} +Instruction *AluOP3Src::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 5) { + if (op.size < 3 || op.size > 5) { + error(op[0].lineno, "%s requires 2, 3 or 4 operands", op[0].s); + return 0; + } else { + } + return AluOP::Decode::decode(tbl, act, op); + } + auto rv = new AluOP3Src(this, tbl, act, op.data[1], op.data[2], op.data[3], op.data[4]); + if (!rv->src1.valid()) + error(op[2].lineno, "invalid src1"); + else if (!rv->src2.valid()) + error(op[3].lineno, "invalid src2"); + else if (!rv->src3.valid()) + error(op[3].lineno, "invalid src3"); + else + return rv; + delete rv; + return 0; +} + +static bool will_pad_with_zeros(const Phv::Slice &dest, Table::Actions::Action *, + Operand::Action 
*ad) { + if (ad->lo != dest.lo || ad->hi != dest.hi) { + // need to line up with the destination, if it doesn't reject + // FIXME could we rotate the data in the field if everything else was ok? The + // compiler should have done that already + return false; + } + if (ad->field->bits.size() != 1) { + // punt for split fields. Not sure this can ever happen + return false; + } + // If Operand::Action is for immediate, check if the immediate is at the top end of the + // immediate overhead. The immediate extract mask in these cases will set the additional bits to + // zero (zero extend). Hence we dont need to check if the size is the same as destination + // register size. + // This check will be false for cases when Operand::Action is not immediate as immed_size will + // be zero + if (ad->field->immed_bit(ad->field->size) == ad->field->fmt->immed_size) return true; + if (ad->field->size < dest.reg.size) { + // field not big enough + return false; + } + // FIXME -- should check that the action has no other uses of this AD operand that uses + // other bits? 
Not trivial to do + return true; +} + +Instruction *AluOP::pass1(Table *tbl, Table::Actions::Action *act) { + if (!dest.check()) return this; + if (!ignoreSrc1 && !src1.check()) return this; + if (!ignoreSrc2 && !src2.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->reg.type != Phv::Register::NORMAL) { + error(dest.lineno, "%s dest can't be dark or mocha phv", opc->name.c_str()); + return this; + } + slot = dest->reg.mau_id(); + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + if (!ignoreSrc1) src1->pass1(tbl, slot / Phv::mau_groupsize()); + if (!ignoreSrc2) src2->pass1(tbl, slot / Phv::mau_groupsize()); + if (!ignoreSrc2 && src2.phvGroup() < 0 && opc->swap_args) { + std::swap(src1, src2); + std::swap(ignoreSrc1, ignoreSrc2); + opc = opc->swap_args; + } + if (!ignoreSrc2 && src2.phvGroup() < 0) error(lineno, "src2 must be phv register"); + if (dest->lo || dest->hi != dest->reg.size - 1) { + if ((opc->flags & CanSliceWithConst) && Operand(dest) == src2) { + // special case -- bitwise op wih dest==src2 and src1 is a constant or action + // data that is padded with 0s can just operate on the whole container to get + // the right result + auto *k = src1.to(); + if (k && k->value >= 0 && (k->value << dest->lo) < 8) { + k->value <<= dest->lo; + // FIXME -- should rewrite dest and src2 to refer to the whole container for + // strict correctness? 
We don't actually look at the slice after this so maybe ok + return this; + } + auto *ad = src1.to(); + if (ad && will_pad_with_zeros(*dest, act, ad)) return this; + } + error(lineno, "ALU ops cannot operate on slices"); + } + return this; +} +Instruction *AluOP3Src::pass1(Table *tbl, Table::Actions::Action *act) { + AluOP::pass1(tbl, act); + src3->pass1(tbl, slot / Phv::mau_groupsize()); + if (!src3.to()) error(lineno, "src3 must be on the action bus"); + return this; +} +void AluOP3Src::pass2(Table *tbl, Table::Actions::Action *act) { + AluOP::pass2(tbl, act); + src3->pass2(slot / Phv::mau_groupsize()); + if (auto s1 = src1.to()) { + auto s3 = src3.to(); + if (s1->bits(slot / Phv::mau_groupsize()) + 1 != s3->bits(slot / Phv::mau_groupsize())) + error(lineno, "src1 and src3 must be adjacent on the action bus"); + } else { + error(lineno, "src1 must be on the action bus"); + } +} + +int AluOP::encode() { + int rv = (opc->opcode << 6); + if (!ignoreSrc1) rv |= src1.bits(slot / Phv::mau_groupsize()); + rv <<= Target::INSTR_SRC2_BITS(); + if (!ignoreSrc2) rv |= src2.bits(slot / Phv::mau_groupsize()); + return rv; +} +bool AluOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return opc == a->opc && dest == a->dest && src1 == a->src1 && src2 == a->src2 && + ignoreSrc1 == a->ignoreSrc1 && ignoreSrc2 == a->ignoreSrc2; + } else { + return false; + } +} + +struct LoadConst : VLIWInstruction { + struct Decode : Instruction::Decode { + Decode(const char *n, std::set targ) : Instruction::Decode(n, targ) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + }; + Phv::Ref dest; + int src; + LoadConst(Table *tbl, const Table::Actions::Action *act, const value_t &d, int s) + : VLIWInstruction(d.lineno), dest(tbl->gress, tbl->stage->stageno + 1, d), src(s) {} + LoadConst(int line, Phv::Ref &d, int v) : VLIWInstruction(line), dest(d), src(v) {} + std::string name() override { return ""; } + 
Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *, Table::Actions::Action *) override {} + int encode() override { return Target::encodeConst(src); } + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { return false; } + void dbprint(std::ostream &out) const override { out << "INSTR: set " << dest << ", " << src; } +}; + +Instruction *LoadConst::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 3) { + error(op[0].lineno, "%s requires 2 operands", op[0].s); + return 0; + } + if (!CHECKTYPE(op[2], tINT)) return 0; + return new LoadConst(tbl, act, op[1], op[2].i); +} + +Instruction *LoadConst::pass1(Table *tbl, Table::Actions::Action *) { + if (!dest.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->reg.type != Phv::Register::NORMAL) { + error(dest.lineno, "load-const dest can't be dark or mocha phv"); + return this; + } + if (dest->lo || dest->hi != dest->reg.size - 1) { + error(lineno, "load-const cannot operate on slices"); + return this; + } + slot = dest->reg.mau_id(); + int size = Phv::reg(slot)->size; + BUG_CHECK(size > 0, "bad register size"); + int minval = ~0u << (size - 1); + if (size > 21) { + size = 21; + minval = 0; + } + // For an 8 or 16 bit PHV, the constant to load is 8 (or 16) bits, so + // there's no need for sign extension to deal with a negative value. For + // 32 bit PHVs, the constant is 21 bits and zero-extended to 32 bits, so + // must be positive. 
+ if (src >= (1 << size) || src < minval) error(lineno, "Constant value %d out of range", src); + src &= (1 << size) - 1; + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + return this; +} + +bool LoadConst::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return dest == a->dest && src == a->src; + } else { + return false; + } +} + +struct CondMoveMux : VLIWInstruction { + const struct Decode : Instruction::Decode { + std::string name; + unsigned opcode, cond_size; + bool src2opt; + Decode(const char *name, unsigned opc, unsigned csize, bool s2opt, const char *alias_name) + : Instruction::Decode(name), name(name), opcode(opc), cond_size(csize), src2opt(s2opt) { + alias(alias_name); + } + Decode(const char *name, target_t targ, unsigned opc, unsigned csize, bool s2opt, + const char *alias_name) + : Instruction::Decode(name, targ), + name(name), + opcode(opc), + cond_size(csize), + src2opt(s2opt) { + alias(alias_name); + } + Decode(const char *name, std::set targ, unsigned opc, unsigned csize, bool s2opt, + const char *alias_name) + : Instruction::Decode(name, targ), + name(name), + opcode(opc), + cond_size(csize), + src2opt(s2opt) { + alias(alias_name); + } + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + Phv::Ref dest; + Operand src1, src2; + unsigned cond = 0; + CondMoveMux(Table *tbl, const Decode *op, const Table::Actions::Action *act, const value_t &d, + const value_t &s) + : VLIWInstruction(d.lineno), + opc(op), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src1(tbl, act, s), + src2(tbl->gress, tbl->stage->stageno, d) {} + CondMoveMux(Table *tbl, const Decode *op, const Table::Actions::Action *act, const value_t &d, + const value_t &s1, const value_t &s2) + : VLIWInstruction(d.lineno), + opc(op), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src1(tbl, act, s1), + src2(tbl, act, s2) {} + std::string name() { return opc->name; } + Instruction 
*pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *tbl, Table::Actions::Action *) { + src1->pass2(slot / Phv::mau_groupsize()); + src2->pass2(slot / Phv::mau_groupsize()); + } + int encode(); + bool equiv(Instruction *a_); + bool phvRead(std::function fn) { + bool rv = false; + if (cond & 1) { + fn(*dest); + rv = true; + } + rv |= src1.phvRead(fn); + if (!opc->src2opt || (cond & 4)) rv |= src2.phvRead(fn); + return rv; + } + void dbprint(std::ostream &out) const { + out << "INSTR: cmov " << dest << ", " << src1 << ", " << src2; + } +}; + +Instruction *CondMoveMux::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 5 && (op.size != 4 || !src2opt)) { + error(op[0].lineno, "%s requires %s4 operands", op[0].s, src2opt ? "3 or " : ""); + return 0; + } + if (!CHECKTYPE(op[op.size - 1], tINT)) { + if (op[op.size - 1].i < 0 || op[op.size - 1].i >= (1 << cond_size)) { + error(op[op.size - 1].lineno, "%s condition must be %d-bit constant", op[0].s, + cond_size); + return 0; + } + } + CondMoveMux *rv; + if (op.size == 5) + rv = new CondMoveMux(tbl, this, act, op[1], op[2], op[3]); + else + rv = new CondMoveMux(tbl, this, act, op[1], op[2]); + rv->cond = op[op.size - 1].i; + if (!rv->src1.valid()) + error(op[2].lineno, "invalid src1"); + else if (!rv->src2.valid()) + error(op[3].lineno, "invalid src2"); + else + return rv; + delete rv; + return 0; +} + +Instruction *CondMoveMux::pass1(Table *tbl, Table::Actions::Action *) { + if (!dest.check() || !src1.check() || !src2.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->reg.type != Phv::Register::NORMAL) { + error(dest.lineno, "%s dest can't be dark or mocha phv", opc->name.c_str()); + return this; + } + slot = dest->reg.mau_id(); + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + src1->pass1(tbl, slot / Phv::mau_groupsize()); + 
src2->pass1(tbl, slot / Phv::mau_groupsize()); + return this; +} +int CondMoveMux::encode() { + int rv = (cond << 11) | (opc->opcode << 6) | src1.bits(slot / Phv::mau_groupsize()); + rv <<= Target::INSTR_SRC2_BITS(); + /* funny cond test on src2 is to match the compiler output -- if we're not testing + * src2 validity, what we specify as src2 is irrelevant */ + return rv | (cond & 0x40 ? src2.bits(slot / Phv::mau_groupsize()) : 0); +} +bool CondMoveMux::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return opc == a->opc && dest == a->dest && src1 == a->src1 && src2 == a->src2 && + cond == a->cond; + } else { + return false; + } +} + +/** + * This instruction represents the Byte-Rotate-Merge instruction described in the + * uArch section 14.1.6.5 Byte-rotate-merge section. + */ +struct ByteRotateMerge : VLIWInstruction { + struct Decode : Instruction::Decode { + Decode() : Instruction::Decode("byte_rotate_merge") { alias("byte-rotate-merge"); } + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const; + }; + Phv::Ref dest; + Operand src1, src2; + int src1_shift, src2_shift; + bitvec byte_mask; + ByteRotateMerge(Table *tbl, const Table::Actions::Action *act, const value_t &d, + const value_t &s1, const value_t &s2, int s1s, int s2s, int bm) + : VLIWInstruction(d.lineno), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src1(tbl, act, s1), + src2(tbl, act, s2), + src1_shift(s1s), + src2_shift(s2s), + byte_mask(bm) {} + + std::string name() { return "byte_rotate_merge"; } + Instruction *pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *tbl, Table::Actions::Action *) { + src1->pass2(slot / Phv::mau_groupsize()); + src2->pass2(slot / Phv::mau_groupsize()); + } + int encode(); + bool equiv(Instruction *a_); + bool phvRead(std::function fn) { + return src1.phvRead(fn) | src2.phvRead(fn); + } + void dbprint(std::ostream &out) const { + out << "INSTR: byte_rotate_merge " << dest << ", " << src1 
<< ", " << src2 << " " + << byte_mask; + } +}; + +/** + * Unlike deposit-field, because of the non-contiguity of both sources possibly, the + * full instruction with both sources, shifts and byte mask are required + */ +Instruction *ByteRotateMerge::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 7) { + error(op[0].lineno, "%s requires 6 operands", op[0].s); + return 0; + } + if (!CHECKTYPE(op[4], tINT) || !CHECKTYPE(op[5], tINT) || !CHECKTYPE(op[6], tINT)) { + error(op[0].lineno, "%s requires operands 3-5 to be ints", op[0].s); + return 0; + } + + ByteRotateMerge *rv = + new ByteRotateMerge(tbl, act, op[1], op[2], op[3], op[4].i, op[5].i, op[6].i); + if (!rv->src1.valid()) + error(op[2].lineno, "invalid src1"); + else if (!rv->src2.valid()) + error(op[3].lineno, "invalid src2"); + else + return rv; + delete rv; + return 0; +} + +/** + * The shifts at most can be container.size / 8 and the byte mask bit count can be at most + * container.size / 8. 
+ */ +Instruction *ByteRotateMerge::pass1(Table *tbl, Table::Actions::Action *) { + if (!dest.check() || !src1.check() || !src2.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->reg.type != Phv::Register::NORMAL) { + error(dest.lineno, "byte-rotate-merge dest can't be dark or mocha phv"); + return this; + } + if (dest->reg.size == 8) { + error(dest.lineno, "byte-rotate-merge invalid on 8 bit containers"); + return this; + } + if (byte_mask.max().index() > dest->reg.size / 8) { + error(dest.lineno, "byte-rotate-merge mask beyond container size bounds"); + return this; + } + if (src1_shift > dest->reg.size / 8) { + error(dest.lineno, "byte-rotate-merge src1_shift beyond container size bounds"); + return this; + } + if (src2_shift > dest->reg.size / 8) { + error(dest.lineno, "byte-rotate-merge src2_shift beyond container size bounds"); + return this; + } + slot = dest->reg.mau_id(); + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + src1->pass1(tbl, slot / Phv::mau_groupsize()); + src2->pass1(tbl, slot / Phv::mau_groupsize()); + src2->pass1(tbl, slot / Phv::mau_groupsize()); + if (src2.phvGroup() < 0) { + std::swap(src1, src2); + std::swap(src1_shift, src2_shift); + byte_mask = bitvec(0, dest->reg.size / 8) - byte_mask; + } + if (src2.phvGroup() < 0) error(lineno, "src2 must be phv register"); + return this; +} + +int ByteRotateMerge::encode() { + int bits = (0xa << 6) | src1.bits(slot / Phv::mau_groupsize()); + bits |= (byte_mask.getrange(0, 4)) << 10; + bits |= (src1_shift << 17); + bits |= (src2_shift << 15); + bits <<= Target::INSTR_SRC2_BITS(); + return bits | src2.bits(slot / Phv::mau_groupsize()); +} + +bool ByteRotateMerge::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return dest == a->dest && src1 == a->src1 && src2 == a->src2 && byte_mask == a->byte_mask && + src1_shift == a->src1_shift && src2_shift == a->src2_shift; + } else { + 
return false; + } +} + +struct Set; + +struct DepositField : VLIWInstruction { + struct Decode : Instruction::Decode { + Decode() : Instruction::Decode("deposit_field") { alias("deposit-field"); } + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + }; + Phv::Ref dest; + Operand src1, src2; + DepositField(Table *tbl, const Table::Actions::Action *act, const value_t &d, const value_t &s) + : VLIWInstruction(d.lineno), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src1(tbl, act, s), + src2(tbl->gress, tbl->stage->stageno, d) {} + DepositField(Table *tbl, const Table::Actions::Action *act, const value_t &d, const value_t &s1, + const value_t &s2) + : VLIWInstruction(d.lineno), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src1(tbl, act, s1), + src2(tbl, act, s2) {} + DepositField(Table *tbl, const Set &); + std::string name() { return "deposit_field"; } + Instruction *pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *tbl, Table::Actions::Action *) { + src1->pass2(slot / Phv::mau_groupsize()); + src2->pass2(slot / Phv::mau_groupsize()); + } + int encode(); + bool equiv(Instruction *a_); + bool phvRead(std::function fn) { + return src1.phvRead(fn) | src2.phvRead(fn); + } + void dbprint(std::ostream &out) const { + out << "INSTR: deposit_field " << dest << ", " << src1 << ", " << src2; + } +}; + +Instruction *DepositField::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 4 && op.size != 3) { + error(op[0].lineno, "%s requires 2 or 3 operands", op[0].s); + return 0; + } + DepositField *rv; + if (op.size == 4) + rv = new DepositField(tbl, act, op[1], op[2], op[3]); + else + rv = new DepositField(tbl, act, op[1], op[2]); + if (!rv->src1.valid()) + error(op[2].lineno, "invalid src1"); + else if (!rv->src2.valid()) + error(op[3].lineno, "invalid src2"); + else + return rv; + delete rv; + return 0; +} + +Instruction 
*DepositField::pass1(Table *tbl, Table::Actions::Action *act) { + if (!dest.check() || !src1.check() || !src2.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->reg.type != Phv::Register::NORMAL) { + error(dest.lineno, "deposit-field dest can't be dark or mocha phv"); + return this; + } + slot = dest->reg.mau_id(); + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + src1->pass1(tbl, slot / Phv::mau_groupsize()); + src2->pass1(tbl, slot / Phv::mau_groupsize()); + return this; +} +int DepositField::encode() { + // If src1 is an Operand::Const (and we pass a valid dest_size), + // we will recieve the combined rotation + bits from DepositField::discoverRotation(). + // Otherwise the top 'RotationBits' will be zero. + int rotConst = src1.bits(slot / Phv::mau_groupsize(), dest.size()); + unsigned rot = rotConst >> RotationBits; + rot += dest->reg.size - dest->lo + src1.bitoffset(slot / Phv::mau_groupsize()); + rot %= dest->reg.size; + int bits = rotConst & ((1U << RotationBits) - 1); + bits |= (1 << 6); + bits |= dest->hi << 7; + bits |= rot << 12; + switch (Phv::reg(slot)->size) { + case 8: + bits |= (dest->lo & 3) << 10; + bits |= (dest->lo & ~3) << 13; + break; + case 16: + bits |= (dest->lo & 1) << 11; + bits |= (dest->lo & ~1) << 15; + break; + case 32: + bits |= dest->lo << 17; + break; + default: + BUG(); + } + bits <<= Target::INSTR_SRC2_BITS(); + return bits | src2.bits(slot / Phv::mau_groupsize()); +} +bool DepositField::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return dest == a->dest && src1 == a->src1 && src2 == a->src2; + } else { + return false; + } +} + +struct Set : VLIWInstruction { + struct Decode : Instruction::Decode { + std::string name; + Decode(const char *n, std::set targ) : Instruction::Decode(n, targ), name(n) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const 
override; + }; + Phv::Ref dest; + Operand src; + static AluOP::Decode *opA; + Set(Table *tbl, const Table::Actions::Action *act, const value_t &d, const value_t &s) + : VLIWInstruction(d.lineno), + dest(tbl->gress, tbl->stage->stageno + 1, d), + src(tbl, act, s) {} + std::string name() { return "set"; } + Instruction *pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *tbl, Table::Actions::Action *) { src->pass2(slot / Phv::mau_groupsize()); } + int encode(); + bool equiv(Instruction *a_); + bool phvRead(std::function fn) { return src.phvRead(fn); } + void dbprint(std::ostream &out) const { out << "INSTR: set " << dest << ", " << src; } +}; + +DepositField::DepositField(Table *tbl, const Set &s) + : VLIWInstruction(s), dest(s.dest), src1(s.src), src2(::Phv::Ref(s.dest->reg, tbl->gress)) {} + +Instruction *Set::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 3) { + error(op[0].lineno, "%s requires 2 operands", op[0].s); + return 0; + } + Set *rv = new Set(tbl, act, op[1], op[2]); + if (!rv->src.valid()) + error(op[2].lineno, "invalid src"); + else + return rv; + delete rv; + return 0; +} + +Instruction *Set::pass1(Table *tbl, Table::Actions::Action *act) { + if (!dest.check() || !src.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->lo || dest->hi != dest->reg.size - 1) + return (new DepositField(tbl, *this))->pass1(tbl, act); + if (auto *k = src.to()) { + if (dest->reg.type == Phv::Register::DARK) { + error(dest.lineno, "can't set dark phv to a constant"); + return this; + } + int minsignconst = Target::MINIMUM_INSTR_CONSTANT(); + // Translate large value with negative value, e.g. 
0xFFFE -> -2 on 16-bit PHV + int64_t maxvalue = 1LL << dest->reg.size; + int64_t delta = k->value - maxvalue; + if (delta >= minsignconst) k->value = delta; + if (k->value < minsignconst || k->value >= 8) + return (new LoadConst(lineno, dest, k->value))->pass1(tbl, act); + } + slot = dest->reg.mau_id(); + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + src->pass1(tbl, slot / Phv::mau_groupsize()); + return this; +} + +int Set::encode() { + int rv = src.bits(slot / Phv::mau_groupsize()); + switch (dest->reg.type) { + case Phv::Register::NORMAL: + rv |= (opA->opcode << 6); + rv <<= Target::INSTR_SRC2_BITS(); + rv |= (slot & 0xf); + break; + case Phv::Register::MOCHA: + rv |= 0x40; + break; + case Phv::Register::DARK: + rv |= 0x20; + break; + default: + BUG(); + } + return rv; +} + +bool Set::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return dest == a->dest && src == a->src; + } else { + return false; + } +} + +struct NulOP : VLIWInstruction { + const struct Decode : Instruction::Decode { + std::string name; + unsigned opcode; + Decode(const char *n, unsigned opc) : Instruction::Decode(n), name(n), opcode(opc) {} + Decode(const char *n, target_t targ, unsigned opc) + : Instruction::Decode(n, targ), name(n), opcode(opc) {} + Decode(const char *n, std::set targ, unsigned opc) + : Instruction::Decode(n, targ), name(n), opcode(opc) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + Phv::Ref dest; + NulOP(Table *tbl, const Table::Actions::Action *act, const Decode *o, const value_t &d) + : VLIWInstruction(d.lineno), opc(o), dest(tbl->gress, tbl->stage->stageno + 1, d) {} + std::string name() { return opc->name; } + Instruction *pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *, Table::Actions::Action *) {} + int encode(); + bool equiv(Instruction *a_); + bool phvRead(std::function fn) { return false; } + void dbprint(std::ostream &out) const { out 
<< "INSTR: " << opc->name << " " << dest; } +}; + +Instruction *NulOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != 2) { + error(op[0].lineno, "%s requires 1 operand", op[0].s); + return 0; + } + return new NulOP(tbl, act, this, op[1]); +} + +Instruction *NulOP::pass1(Table *tbl, Table::Actions::Action *) { + if (!dest.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + slot = dest->reg.mau_id(); + if (opc->opcode || !options.match_compiler) { + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + } + return this; +} +int NulOP::encode() { return opc->opcode; } +bool NulOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return opc == a->opc && dest == a->dest; + } else { + return false; + } +} + +struct ShiftOP : VLIWInstruction { + const struct Decode : Instruction::Decode { + std::string name; + unsigned opcode; + bool use_src1; + Decode(const char *n, std::set targ, unsigned opc, bool funnel = false) + : Instruction::Decode(n, targ), name(n), opcode(opc), use_src1(funnel) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + Phv::Ref dest; + Operand src1, src2; + int shift = 0; + ShiftOP(const Decode *d, Table *tbl, const Table::Actions::Action *act, const value_t *ops) + : VLIWInstruction(ops->lineno), + opc(d), + dest(tbl->gress, tbl->stage->stageno + 1, ops[0]), + src1(tbl, act, ops[1]), + src2(tbl, act, ops[2]) { + if (opc->use_src1) { + if (CHECKTYPE(ops[3], tINT)) shift = ops[3].i; + } else { + src2 = src1; + if (CHECKTYPE(ops[2], tINT)) shift = ops[2].i; + } + } + std::string name() { return opc->name; } + Instruction *pass1(Table *tbl, Table::Actions::Action *); + void pass2(Table *tbl, Table::Actions::Action *) { + src1->pass2(slot / Phv::mau_groupsize()); + src2->pass2(slot / 
Phv::mau_groupsize()); + } + int encode(); + bool equiv(Instruction *a_); + bool phvRead(std::function fn) { + return src1.phvRead(fn) | src2.phvRead(fn); + } + void dbprint(std::ostream &out) const { + out << "INSTR: " << opc->name << ' ' << dest << ", " << src1 << ", " << shift; + } +}; + +Instruction *ShiftOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + if (op.size != (use_src1 ? 5 : 4)) { + error(op[0].lineno, "%s requires %d operands", op[0].s, use_src1 ? 4 : 3); + return 0; + } + ShiftOP *rv = new ShiftOP(this, tbl, act, op.data + 1); + if (!rv->src1.valid()) + error(op[2].lineno, "invalid src1"); + else if (!rv->src2.valid()) + error(op[3].lineno, "invalid src2"); + else if (rv->shift < 0 || rv->shift > 0x1f) + error(op[3].lineno, "invalid shift"); + else + return rv; + delete rv; + return 0; +} + +Instruction *ShiftOP::pass1(Table *tbl, Table::Actions::Action *) { + if (!dest.check() || !src1.check() || !src2.check()) return this; + if (dest->reg.mau_id() < 0) { + error(dest.lineno, "%s not accessable in mau", dest->reg.name); + return this; + } + if (dest->reg.type != Phv::Register::NORMAL) { + error(dest.lineno, "%s dest can't be dark or mocha phv", opc->name.c_str()); + return this; + } + if (dest->lo) { + error(lineno, "shift ops cannot operate on slices"); + return this; + } + slot = dest->reg.mau_id(); + tbl->stage->action_set[tbl->gress][dest->reg.uid] = true; + src1->pass1(tbl, slot / Phv::mau_groupsize()); + src2->pass1(tbl, slot / Phv::mau_groupsize()); + if (src2.phvGroup() < 0) error(lineno, "src%s must be phv register", opc->use_src1 ? 
"2" : ""); + return this; +} +int ShiftOP::encode() { + int rv = (shift << 12) | (opc->opcode << 6); + if (opc->use_src1 || options.match_compiler) rv |= src1.bits(slot / Phv::mau_groupsize()); + rv <<= Target::INSTR_SRC2_BITS(); + return rv | src2.bits(slot / Phv::mau_groupsize()); +} +bool ShiftOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) { + return opc == a->opc && dest == a->dest && src1 == a->src1 && src2 == a->src2 && + shift == a->shift; + } else { + return false; + } +} + +static std::set tofino12 = std::set({ + TOFINO, + JBAY, +}); + +// lifted from MAU uArch 15.1.6 +// If the operation is commutative operand swap is enabled +// OPNAME OPCODE +static AluOP::Decode opADD("add", tofino12, 0x23e, AluOP::Commutative), // NOLINT + opADDC("addc", tofino12, 0x2be, AluOP::Commutative), // NOLINT + opSUB("sub", tofino12, 0x33e), // NOLINT + opSUBC("subc", tofino12, 0x3be), // NOLINT + opSADDU("saddu", tofino12, 0x03e, AluOP::Commutative), // NOLINT + opSADDS("sadds", tofino12, 0x07e, AluOP::Commutative), // NOLINT + opSSUBU("ssubu", tofino12, 0x0be), // NOLINT + opSSUBS("ssubs", tofino12, 0x0fe), // NOLINT + opMINU("minu", tofino12, 0x13e, AluOP::Commutative), // NOLINT + opMINS("mins", tofino12, 0x17e, AluOP::Commutative), // NOLINT + opMAXU("maxu", tofino12, 0x1be, AluOP::Commutative), // NOLINT + opMAXS("maxs", tofino12, 0x1fe, AluOP::Commutative), // NOLINT + opSETZ("setz", tofino12, 0x01e, AluOP::Commutative + AluOP::IgnoreSrcs), // NOLINT + opNOR("nor", tofino12, 0x05e, AluOP::Commutative), // NOLINT + opANDCA("andca", tofino12, 0x09e, AluOP::CanSliceWithConst), // NOLINT + opANDCB("andcb", tofino12, 0x11e, &opANDCA), // NOLINT + opNOTB("notb", tofino12, 0x15e, AluOP::IgnoreSrc1, "not"), // NOLINT + opNOTA("nota", tofino12, 0x0de, AluOP::IgnoreSrc2, &opNOTB), // NOLINT + opXOR("xor", tofino12, 0x19e, AluOP::Commutative + AluOP::CanSliceWithConst), // NOLINT + opNAND("nand", tofino12, 0x1de, AluOP::Commutative), // NOLINT + opAND("and", 
tofino12, 0x21e, AluOP::Commutative), // NOLINT + opXNOR("xnor", tofino12, 0x25e, AluOP::Commutative), // NOLINT + opB("alu_b", tofino12, 0x29e, AluOP::IgnoreSrc1), // NOLINT + opORCA("orca", tofino12, 0x2de), // NOLINT + opA("alu_a", tofino12, 0x31e, AluOP::IgnoreSrc2, &opB), // NOLINT + opORCB("orcb", tofino12, 0x35e, &opORCA), // NOLINT + opOR("or", tofino12, 0x39e, AluOP::Commutative + AluOP::CanSliceWithConst), // NOLINT + opSETHI("sethi", tofino12, 0x3de, AluOP::Commutative + AluOP::IgnoreSrcs); // NOLINT +static LoadConst::Decode opLoadConst("load-const", tofino12); // NOLINT +static Set::Decode opSet("set", tofino12); // NOLINT +static NulOP::Decode opNoop("noop", tofino12, 0x0); // NOLINT +static ShiftOP::Decode opSHL("shl", tofino12, 0x0c, false), // NOLINT + opSHRS("shrs", tofino12, 0x1c, false), // NOLINT + opSHRU("shru", tofino12, 0x14, false), // NOLINT + opFUNSHIFT("funnel-shift", tofino12, 0x4, true); // NOLINT +static DepositField::Decode opDepositField; +static ByteRotateMerge::Decode opByteRotateMerge; + +AluOP::Decode *Set::opA = &VLIW::opA; + +static AluOP3Src::Decode tf_opBMSET("bitmasked-set", TOFINO, 0x2e); // NOLINT +static CondMoveMux::Decode tf_opCondMove("cmov", TOFINO, 0x16, true, 5, + "conditional-move"); // NOLINT +static CondMoveMux::Decode tf_opCondMux("cmux", TOFINO, 0x6, false, 2, + "conditional-mux"); // NOLINT +static NulOP::Decode tf_opInvalidate("invalidate", TOFINO, 0x3800); // NOLINT + +static std::set jb_targets = std::set({ + JBAY, +}); + +static AluOP3Src::Decode jb_opBMSET("bitmasked-set", jb_targets, 0x0e); // NOLINT +static CondMoveMux::Decode jb_opCondMove("cmov", jb_targets, 0x6, true, 5, + "conditional-move"); // NOLINT +static AluOP::Decode jb_opGTEQU("gtequ", jb_targets, 0x02e), // NOLINT + jb_opGTEQS("gteqs", jb_targets, 0x06e), // NOLINT + jb_opLTU("ltu", jb_targets, 0x0ae), // NOLINT + jb_opLTS("lts", jb_targets, 0x0ee), // NOLINT + jb_opLEQU("lequ", jb_targets, 0x12e, &jb_opGTEQU), // NOLINT + 
jb_opLEQS("leqs", jb_targets, 0x16e, &jb_opGTEQS), // NOLINT + jb_opGTU("gtu", jb_targets, 0x1ae, &jb_opLTU), // NOLINT + jb_opGTS("gts", jb_targets, 0x1ee, &jb_opLTS), // NOLINT + jb_opEQ("eq", jb_targets, 0x22e, AluOP::Commutative), // NOLINT + jb_opNEQ("neq", jb_targets, 0x2ae, AluOP::Commutative), // NOLINT + jb_opEQ64("eq64", jb_targets, 0x26e, AluOP::Commutative), // NOLINT + jb_opNEQ64("neq64", jb_targets, 0x2ee, AluOP::Commutative); // NOLINT + +std::unique_ptr genNoopFill(Table *tbl, Table::Actions::Action *act, const char *op, + int slot) { + VECTOR(value_t) args; + VECTOR_init(args, 3); + args.add(op).add(Phv::reg(slot)->name).add(Phv::reg(slot)->name); + std::unique_ptr rv(Instruction::decode(tbl, act, args)); + VECTOR_fini(args); + return rv; +} + +} // end namespace VLIW + +void dump(const Instruction &inst) { std::cout << inst << std::endl; } diff --git a/backends/tofino/bf-asm/instruction.h b/backends/tofino/bf-asm/instruction.h new file mode 100644 index 00000000000..48d2eaa721b --- /dev/null +++ b/backends/tofino/bf-asm/instruction.h @@ -0,0 +1,74 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_INSTRUCTION_H_ +#define BACKENDS_TOFINO_BF_ASM_INSTRUCTION_H_ + +#include + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/tables.h" + +struct Instruction : public IHasDbPrint { + int lineno; + int slot; + explicit Instruction(int l) : lineno(l), slot(-1) {} + virtual ~Instruction() {} + virtual Instruction *pass1(Table *, Table::Actions::Action *) = 0; + virtual std::string name() = 0; + virtual void pass2(Table *, Table::Actions::Action *) = 0; + virtual void dbprint(std::ostream &) const = 0; + virtual bool equiv(Instruction *a) = 0; + bool equiv(const std::unique_ptr &a) { return equiv(a.get()); } + virtual bool salu_output() const { return false; } + virtual bool salu_alu() const { return false; } + virtual bool phvRead(std::function) = 0; + bool phvRead() { + return phvRead([](const Phv::Slice &sl) {}); + } +#define VIRTUAL_TARGET_METHODS(TARGET) \ + virtual void write_regs(Target::TARGET::mau_regs &, Table *, Table::Actions::Action *) = 0; + FOR_ALL_REGISTER_SETS(VIRTUAL_TARGET_METHODS) +#undef VIRTUAL_TARGET_METHODS +#define DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS(TARGET) \ + void write_regs(Target::TARGET::mau_regs ®s, Table *tbl, Table::Actions::Action *act) \ + override; + static Instruction *decode(Table *, const Table::Actions::Action *, const VECTOR(value_t) &); + + enum instruction_set_t { VLIW_ALU = 0, STATEFUL_ALU = 1, NUM_SETS = 2 }; + struct Decode { + static std::multimap opcode[NUM_SETS]; + bool type_suffix; + unsigned targets; + explicit Decode(const char *name, int set = VLIW_ALU, bool ts = false); + Decode(const char *name, target_t target, int set = VLIW_ALU, bool ts = false); + Decode(const char *name, std::set target, int set = VLIW_ALU, bool ts = false); + const Decode &alias(const char *name, int set = VLIW_ALU, bool ts = false) { + opcode[set].emplace(name, this); + return *this; + } + virtual Instruction 
*decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const = 0; + }; +}; + +namespace VLIW { +std::unique_ptr genNoopFill(Table *tbl, Table::Actions::Action *act, const char *op, + int slot); +} + +#endif /* BACKENDS_TOFINO_BF_ASM_INSTRUCTION_H_ */ diff --git a/backends/tofino/bf-asm/j2b.cpp b/backends/tofino/bf-asm/j2b.cpp new file mode 100644 index 00000000000..3888f460d62 --- /dev/null +++ b/backends/tofino/bf-asm/j2b.cpp @@ -0,0 +1,48 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "bson.h" + +int main(int ac, char **av) { + if (ac != 3) { + std::cerr << "usage " << av[0] << " " << std::endl; + return 1; + } + std::ifstream in(av[1]); + if (!in) { + std::cerr << "failed to open " << av[1] << std::endl; + return 1; + } + json::obj *data = nullptr; + if (!(in >> data)) { + std::cerr << "failed to read json" << std::endl; + return 1; + } + std::ofstream out(av[2]); + if (!out) { + std::cerr << "failed to open " << av[2] << std::endl; + return 1; + } + if (!(out << json::binary(data))) { + std::cerr << "failed to write bson" << std::endl; + return 1; + } + return 0; +} diff --git a/backends/tofino/bf-asm/jbay/CMakeLists.txt b/backends/tofino/bf-asm/jbay/CMakeLists.txt new file mode 100644 index 00000000000..c285f9ade6f --- /dev/null +++ b/backends/tofino/bf-asm/jbay/CMakeLists.txt @@ -0,0 +1,56 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +# +# SPDX-License-Identifier: Apache-2.0 + +set (GEN_JBAY + memories.jbay_mem + memories.pipe_addrmap + memories.prsr_mem_main_rspec + regs.dprsr_reg + regs.epb_prsr4_reg + regs.ipb_prsr4_reg + regs.jbay_reg + regs.mau_addrmap + regs.pipe_addrmap + regs.pmerge_reg + regs.prsr_reg_main_rspec + ) + +foreach(f IN LISTS GEN_JBAY) + list (APPEND GEN_JBAY_SRCS ${BFASM_BINARY_DIR}/gen/jbay/${f}.cpp) + list (APPEND GEN_JBAY_HDRS ${BFASM_BINARY_DIR}/gen/jbay/${f}.h) +endforeach() + +add_custom_command(OUTPUT ${GEN_JBAY_HDRS} ${GEN_JBAY_SRCS} + COMMAND ${BFASM_WALLE} --schema chip.schema --generate-cpp template_objects.yaml -o ${BFASM_BINARY_DIR}/gen/jbay + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS template_objects.yaml chip.schema ${WALLE_SOURCES} + COMMENT "Generating cpp code for jbay from jbay/chip.schema") + +set_source_files_properties(${GEN_JBAY_SRCS} ${GEN_JBAY_HDRS} PROPERTIES GENERATED TRUE) + +set (BFAS_JBAY_SRCS + jbay/gateway.cpp + jbay/input_xbar.cpp + jbay/stateful.cpp + jbay/parser.cpp + PARENT_SCOPE + ) + +add_library (regs_jbay ${GEN_JBAY_SRCS}) +target_link_libraries (regs_jbay p4ctoolkit) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(regs_jbay PUBLIC -Wno-error -Wno-unused-parameter -Wno-unused-variable -Wno-type-limits -Wno-sign-compare) diff --git a/backends/tofino/bf-asm/jbay/chip.schema b/backends/tofino/bf-asm/jbay/chip.schema new file mode 100644 index 00000000000..5afef775e2d Binary files /dev/null and b/backends/tofino/bf-asm/jbay/chip.schema differ diff --git a/backends/tofino/bf-asm/jbay/counter.h b/backends/tofino/bf-asm/jbay/counter.h new file mode 100644 index 00000000000..0b6655964ab --- /dev/null +++ b/backends/tofino/bf-asm/jbay/counter.h @@ -0,0 +1,121 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JBAY_COUNTER_H_ +#define BACKENDS_TOFINO_BF_ASM_JBAY_COUNTER_H_ + +template +void CounterTable::setup_teop_regs_2(REGS ®s, int stats_group_index) { + BUG_CHECK(teop >= 0 && teop < 4); + BUG_CHECK(gress == EGRESS); + + auto &adrdist = regs.rams.match.adrdist; + + if (!teop_initialized) { + // assume this stage driving teop + auto delay = stage->pipelength(gress) - stage->pred_cycle(gress) - 7; + adrdist.teop_bus_ctl[teop].teop_bus_ctl_delay = delay; + adrdist.teop_bus_ctl[teop].teop_bus_ctl_delay_en = 1; + adrdist.teop_bus_ctl[teop].teop_bus_ctl_stats_en = 1; + + adrdist.stats_to_teop_adr_oxbar_ctl[teop].enabled_2bit_muxctl_select = stats_group_index; + adrdist.stats_to_teop_adr_oxbar_ctl[teop].enabled_2bit_muxctl_enable = 1; + teop_initialized = true; + } + + adrdist.teop_to_stats_adr_oxbar_ctl[stats_group_index].enabled_2bit_muxctl_select = teop; + adrdist.teop_to_stats_adr_oxbar_ctl[stats_group_index].enabled_2bit_muxctl_enable = 1; + + // count all tEOP events + adrdist.dp_teop_stats_ctl[stats_group_index].dp_teop_stats_ctl_err = 0; + // XXX is this always 2? 
+ adrdist.dp_teop_stats_ctl[stats_group_index].dp_teop_stats_ctl_rx_shift = 2; + adrdist.dp_teop_stats_ctl[stats_group_index].dp_teop_stats_ctl_rx_en = 1; + + auto &stats = regs.rams.map_alu.stats_wrap[stats_group_index].stats; + stats.statistics_ctl_teop_en = 1; +} + +template +void CounterTable::write_alu_vpn_range_2(REGS ®s) { + auto &adrdist = regs.rams.match.adrdist; + int minvpn, sparevpn; + + // Used to validate the BFA VPN configuration + std::set vpn_processed; + bitvec vpn_range; + + // Get Spare VPN + layout_vpn_bounds(minvpn, sparevpn, false); + + for (int home_row : home_rows) { + bool block_start = false; + bool block_end = false; + int min = 1000000; + int max = -1; + for (Layout &logical_row : layout) { + // Block Start with the home row and End with the Spare VPN + if (logical_row.row == home_row) block_start = true; + + if (block_start) { + for (auto v : logical_row.vpns) { + if (v == sparevpn) { + block_end = true; + break; + } + if (vpn_processed.count(v)) + error(home_lineno, "Multiple instance of the VPN %d detected", v); + else + vpn_processed.insert(v); + + if (v < min) min = v; + if (v > max) max = v; + } + } + if (block_end) { + BUG_CHECK(min != 1000000 && max != -1); + + bitvec block_range(min, max - min + 1); + if (vpn_range.intersects(block_range)) + error(home_lineno, "Overlapping of VPN range detected"); + else + vpn_range |= block_range; + + adrdist.mau_stats_alu_vpn_range[home_row / 4].stats_vpn_base = min; + adrdist.mau_stats_alu_vpn_range[home_row / 4].stats_vpn_limit = max; + adrdist.mau_stats_alu_vpn_range[home_row / 4].stats_vpn_range_check_enable = 1; + break; + } + } + BUG_CHECK(block_start && block_end); + } + + if (vpn_range != bitvec(minvpn, sparevpn - minvpn)) + error(home_lineno, "VPN range not entirely covered"); +} + +template <> +void CounterTable::setup_teop_regs(Target::JBay::mau_regs ®s, int stats_group_index) { + setup_teop_regs_2(regs, stats_group_index); +} + +template <> +void 
CounterTable::write_alu_vpn_range(Target::JBay::mau_regs ®s) { + write_alu_vpn_range_2(regs); +} + +#endif /* BACKENDS_TOFINO_BF_ASM_JBAY_COUNTER_H_ */ diff --git a/backends/tofino/bf-asm/jbay/deparser.cpp b/backends/tofino/bf-asm/jbay/deparser.cpp new file mode 100644 index 00000000000..e30402ac9b2 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/deparser.cpp @@ -0,0 +1,1092 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* deparser template specializations for jbay -- #included directly in top-level deparser.cpp */ + +#define YES(X) X +#define NO(X) + +#define JBAY_POV(GRESS, VAL, REG) \ + if (VAL.pov.size() == 1) \ + REG.pov = deparser.pov[GRESS].at(&VAL.pov.front()->reg) + VAL.pov.front()->lo; \ + else \ + error(VAL.val.lineno, "POV bit required for Tofino2"); + +#define JBAY_SIMPLE_INTRINSIC(GRESS, VAL, REG, IFSHIFT) \ + REG.phv = VAL.val->reg.deparser_id(); \ + JBAY_POV(GRESS, VAL, REG) \ + IFSHIFT(REG.shft = intrin.vals[0].val->lo;) + +#define JBAY_ARRAY_INTRINSIC(GRESS, VAL, ARRAY, REG, POV, IFSHIFT) \ + for (auto &r : ARRAY) { \ + r.REG.phv = VAL.val->reg.deparser_id(); \ + IFSHIFT(r.REG.shft = intrin.vals[0].val->lo;) \ + } \ + JBAY_POV(GRESS, VAL, POV) + +#define EI_INTRINSIC(NAME, IFSHIFT) \ + DEPARSER_INTRINSIC(JBay, EGRESS, NAME, 1) { \ + JBAY_SIMPLE_INTRINSIC(EGRESS, intrin.vals[0], regs.dprsrreg.inp.ipp.egr.m_##NAME, IFSHIFT) \ + } +#define 
HO_E_INTRINSIC(NAME, IFSHIFT) \ + DEPARSER_INTRINSIC(JBay, EGRESS, NAME, 1) { \ + JBAY_ARRAY_INTRINSIC(EGRESS, intrin.vals[0], regs.dprsrreg.ho_e, her.meta.m_##NAME, \ + regs.dprsrreg.inp.icr.egr_meta_pov.m_##NAME, IFSHIFT) \ + } +#define II_INTRINSIC(NAME, IFSHIFT) \ + DEPARSER_INTRINSIC(JBay, INGRESS, NAME, 1) { \ + JBAY_SIMPLE_INTRINSIC(INGRESS, intrin.vals[0], regs.dprsrreg.inp.ipp.ingr.m_##NAME, \ + IFSHIFT) \ + } +#define II_INTRINSIC_RENAME(NAME, REGNAME, IFSHIFT) \ + DEPARSER_INTRINSIC(JBay, INGRESS, NAME, 1) { \ + JBAY_SIMPLE_INTRINSIC(INGRESS, intrin.vals[0], regs.dprsrreg.inp.ipp.ingr.m_##REGNAME, \ + IFSHIFT) \ + } +#define HO_I_INTRINSIC(NAME, IFSHIFT) \ + DEPARSER_INTRINSIC(JBay, INGRESS, NAME, 1) { \ + JBAY_ARRAY_INTRINSIC(INGRESS, intrin.vals[0], regs.dprsrreg.ho_i, hir.meta.m_##NAME, \ + regs.dprsrreg.inp.icr.ingr_meta_pov.m_##NAME, IFSHIFT) \ + } +#define HO_I_INTRINSIC_RENAME(NAME, REGNAME, IFSHIFT) \ + DEPARSER_INTRINSIC(JBay, INGRESS, NAME, 1) { \ + JBAY_ARRAY_INTRINSIC(INGRESS, intrin.vals[0], regs.dprsrreg.ho_i, hir.meta.m_##REGNAME, \ + regs.dprsrreg.inp.icr.ingr_meta_pov.m_##REGNAME, IFSHIFT) \ + } + +EI_INTRINSIC(drop_ctl, YES) +EI_INTRINSIC(egress_unicast_port, NO) +HO_E_INTRINSIC(afc, YES) +HO_E_INTRINSIC(capture_tx_ts, YES) +HO_E_INTRINSIC(force_tx_err, YES) +HO_E_INTRINSIC(tx_pkt_has_offsets, YES) +HO_E_INTRINSIC(mirr_c2c_ctrl, YES) +HO_E_INTRINSIC(mirr_coal_smpl_len, YES) +HO_E_INTRINSIC(mirr_dond_ctrl, YES) +HO_E_INTRINSIC(mirr_epipe_port, YES) +HO_E_INTRINSIC(mirr_hash, YES) +HO_E_INTRINSIC(mirr_icos, YES) +HO_E_INTRINSIC(mirr_io_sel, YES) +HO_E_INTRINSIC(mirr_mc_ctrl, YES) +HO_E_INTRINSIC(mirr_qid, YES) +HO_E_INTRINSIC(mtu_trunc_err_f, YES) +HO_E_INTRINSIC(mtu_trunc_len, YES) + +II_INTRINSIC(copy_to_cpu, YES) +II_INTRINSIC(drop_ctl, YES) +II_INTRINSIC(egress_unicast_port, NO) +II_INTRINSIC_RENAME(egress_multicast_group_0, mgid1, NO) +II_INTRINSIC_RENAME(egress_multicast_group_1, mgid2, NO) +II_INTRINSIC(pgen, YES) 
+II_INTRINSIC(pgen_len, YES) +II_INTRINSIC(pgen_addr, YES) +HO_I_INTRINSIC(afc, YES) +HO_I_INTRINSIC(bypss_egr, YES) +HO_I_INTRINSIC(copy_to_cpu_cos, YES) +HO_I_INTRINSIC(ct_disable, YES) +HO_I_INTRINSIC(ct_mcast, YES) +HO_I_INTRINSIC(deflect_on_drop, YES) +HO_I_INTRINSIC(icos, YES) +HO_I_INTRINSIC(mirr_c2c_ctrl, YES) +HO_I_INTRINSIC(mirr_coal_smpl_len, YES) +HO_I_INTRINSIC(mirr_dond_ctrl, YES) +HO_I_INTRINSIC(mirr_epipe_port, YES) +HO_I_INTRINSIC(mirr_hash, YES) +HO_I_INTRINSIC(mirr_icos, YES) +HO_I_INTRINSIC(mirr_io_sel, YES) +HO_I_INTRINSIC(mirr_mc_ctrl, YES) +HO_I_INTRINSIC(mirr_qid, YES) +HO_I_INTRINSIC(mtu_trunc_err_f, YES) +HO_I_INTRINSIC(mtu_trunc_len, YES) +HO_I_INTRINSIC(qid, YES) +HO_I_INTRINSIC(rid, YES) +HO_I_INTRINSIC_RENAME(meter_color, pkt_color, YES) +HO_I_INTRINSIC_RENAME(xid, xid_l1, YES) +HO_I_INTRINSIC_RENAME(yid, xid_l2, YES) +HO_I_INTRINSIC_RENAME(hash_lag_ecmp_mcast_0, hash1, YES) +HO_I_INTRINSIC_RENAME(hash_lag_ecmp_mcast_1, hash2, YES) + +#undef EI_INTRINSIC +#undef HO_E_INTRINSIC +#undef II_INTRINSIC +#undef II_INTRINSIC_RENAME +#undef HO_I_INTRINSIC +#undef HO_I_INTRINSIC_RENAME + +/** Macros to build Digest::Type objects for JBay -- + * JBAY_SIMPLE_DIGEST: basic digest that appears one place in the config + * JBAY_ARRAY_DIGEST: config is replicated across Header+Output slices + * GRESS: INGRESS or EGRESS + * NAME: keyword use for this digest in the assembler + * ARRAY: Header+Ouput slice array (ho_i or ho_e, matching ingress or egress) + * TBL: config register containing the table config + * SEL: config register with the selection config + * IFID: YES or NO -- if this config needs to program id_phv + * CNT: how many patterns can be specified in the array + * REVERSE: YES or NO -- if the entries in the table are reverse (0 is last byte of header) + * IFIDX: YES or NO -- if CNT > 1 (if we index by id) + */ + +#define JBAY_SIMPLE_DIGEST(GRESS, NAME, TBL, SEL, IFID, CNT, REVERSE, IFIDX) \ + JBAY_COMMON_DIGEST(GRESS, NAME, TBL, SEL, IFID, 
CNT, REVERSE, IFIDX) \ + JBAY_DIGEST_TABLE(GRESS, NAME, TBL, IFID, YES, CNT, REVERSE, IFIDX) \ + } +#define JBAY_ARRAY_DIGEST(GRESS, NAME, ARRAY, TBL, SEL, IFID, CNT, REVERSE, IFIDX) \ + JBAY_COMMON_DIGEST(GRESS, NAME, TBL, SEL, IFID, CNT, REVERSE, IFIDX) \ + for (auto &r : ARRAY) { \ + JBAY_DIGEST_TABLE(GRESS, NAME, r.TBL, IFID, NO, CNT, REVERSE, IFIDX) \ + } \ + } + +#define JBAY_COMMON_DIGEST(GRESS, NAME, TBL, SEL, IFID, CNT, REVERSE, IFIDX) \ + DEPARSER_DIGEST(JBay, GRESS, NAME, CNT, can_shift = true;) { \ + SEL.phv = data.select.val->reg.deparser_id(); \ + JBAY_POV(GRESS, data.select, SEL) \ + SEL.shft = data.shift + data.select->lo; \ + SEL.disable_ = 0; + +#define JBAY_DIGEST_TABLE(GRESS, NAME, REG, IFID, IFVALID, CNT, REVERSE, IFIDX) \ + for (auto &set : data.layout) { \ + int id = set.first >> data.shift; \ + int idx = 0; \ + int maxidx = REG IFIDX([id]).phvs.size() - 1; \ + bool first = true; \ + int last = -1; \ + for (auto ® : set.second) { \ + if (first) { \ + first = false; \ + IFID(REG IFIDX([id]).id_phv = reg->reg.deparser_id(); continue;) \ + } \ + /* The same 16b/32b container cannot appear consecutively, but 8b can. 
*/ \ + if (last == reg->reg.deparser_id() && reg->reg.size != 8) { \ + error(data.lineno, "%s: %db container %s seen in consecutive locations", #NAME, \ + reg->reg.size, reg->reg.name); \ + continue; \ + } \ + for (int i = reg->reg.size / 8; i > 0; i--) { \ + if (idx > maxidx) { \ + error(data.lineno, "%s digest limited to %d bytes", #NAME, maxidx + 1); \ + break; \ + } \ + REG IFIDX([id]).phvs[REVERSE(maxidx -) idx++] = reg->reg.deparser_id(); \ + } \ + last = reg->reg.deparser_id(); \ + } \ + IFVALID(REG IFIDX([id]).valid = 1;) \ + REG IFIDX([id]).len = idx; \ + } + +JBAY_SIMPLE_DIGEST(INGRESS, learning, regs.dprsrreg.inp.ipp.ingr.learn_tbl, + regs.dprsrreg.inp.ipp.ingr.m_learn_sel, NO, 8, YES, YES) +JBAY_ARRAY_DIGEST(INGRESS, mirror, regs.dprsrreg.ho_i, him.mirr_hdr_tbl.entry, + regs.dprsrreg.inp.ipp.ingr.m_mirr_sel, YES, 16, NO, YES) +JBAY_ARRAY_DIGEST(EGRESS, mirror, regs.dprsrreg.ho_e, hem.mirr_hdr_tbl.entry, + regs.dprsrreg.inp.ipp.egr.m_mirr_sel, YES, 16, NO, YES) +JBAY_SIMPLE_DIGEST(INGRESS, resubmit, regs.dprsrreg.inp.ipp.ingr.resub_tbl, + regs.dprsrreg.inp.ipp.ingr.m_resub_sel, NO, 8, NO, YES) +JBAY_SIMPLE_DIGEST(INGRESS, pktgen, regs.dprsrreg.inp.ipp.ingr.pgen_tbl, + regs.dprsrreg.inp.ipp.ingr.m_pgen, NO, 1, NO, NO) + +// all the jbay deparser subtrees with a dis or disable_ bit +// FIXME -- should be a way of doing this with a smart template or other metaprogramming. 
+#define JBAY_DISABLE_REGBITS(M) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_afc, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_capture_tx_ts, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_force_tx_err, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_c2c_ctrl, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_coal_smpl_len, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_dond_ctrl, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_epipe_port, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_hash, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_icos, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_io_sel, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_mc_ctrl, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mirr_qid, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mtu_trunc_err_f, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_mtu_trunc_len, dis) \ + M(YES, regs.dprsrreg.ho_e, her.meta.m_tx_pkt_has_offsets, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_afc, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_bypss_egr, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_copy_to_cpu_cos, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_ct_disable, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_ct_mcast, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_deflect_on_drop, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_hash1, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_hash2, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_icos, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_c2c_ctrl, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_coal_smpl_len, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_dond_ctrl, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_epipe_port, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_hash, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_icos, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_io_sel, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mirr_mc_ctrl, dis) \ + M(YES, regs.dprsrreg.ho_i, 
hir.meta.m_mirr_qid, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mtu_trunc_err_f, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_mtu_trunc_len, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_pkt_color, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_qid, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_rid, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_xid_l1, dis) \ + M(YES, regs.dprsrreg.ho_i, hir.meta.m_xid_l2, dis) \ + M(NO, , regs.dprsrreg.inp.ipp.egr.m_drop_ctl, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.egr.m_egress_unicast_port, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.egr.m_mirr_sel, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_copy_to_cpu, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_drop_ctl, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_egress_unicast_port, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_learn_sel, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_mgid1, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_mgid2, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_mirr_sel, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_pgen, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_pgen_addr, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_pgen_len, disable_) \ + M(NO, , regs.dprsrreg.inp.ipp.ingr.m_resub_sel, disable_) + +// Compiler workaround for TOF2LAB-44, skip certain chunk indices +void tof2lab44_workaround(int lineno, unsigned &chunk_index) { + if (options.tof2lab44_workaround) { + static std::set skipped_chunks = {24, 32, 40, 48, 56, 64, 72, + 80, 88, 96, 104, 112, 120}; + while (skipped_chunks.count(chunk_index)) chunk_index++; + } +} + +// INVARIANT: check_chunk is idempotent. 
+bool check_chunk(int lineno, unsigned &chunk) { + tof2lab44_workaround(lineno, chunk); + + const unsigned TOTAL_CHUNKS = Target::JBay::DEPARSER_TOTAL_CHUNKS; + static bool suppress_repeated = false; + if (chunk >= TOTAL_CHUNKS) { + if (!suppress_repeated) + error(lineno, "Ran out of chunks in field dictionary (%d)", TOTAL_CHUNKS); + suppress_repeated = true; + return false; + } + return true; +} + +/// A callback to write a PHV, constant, or checksum chunk to the field dictionary. +using WriteChunk = std::function; + +/// A callback to finish writing a PHV, constant, or checksum chunk to the field dictionary. +using FinishChunk = + std::function; + +/// A callback for writing a CLOT to the field dictionary. This increments the chunk index if the +/// CLOT spans multiple chunks. +using WriteClot = std::function; + +/// Implements common control functionality for outputting field dictionaries and field dictionary +/// slices. +template +void output_jbay_field_dictionary_helper(int lineno, POV &pov, DICT &dict, WriteChunk write_chunk, + FinishChunk finish_chunk, WriteClot write_clot) { + const unsigned CHUNK_SIZE = Target::JBay::DEPARSER_CHUNK_SIZE; + const unsigned CHUNK_GROUPS = Target::JBay::DEPARSER_CHUNK_GROUPS; + const unsigned CHUNKS_PER_GROUP = Target::JBay::DEPARSER_CHUNKS_PER_GROUP; + const unsigned CLOTS_PER_GROUP = Target::JBay::DEPARSER_CLOTS_PER_GROUP; + unsigned ch = 0, entry_n = 0, byte = 0, group = 0, clots_in_group = 0; + Phv::Slice prev_pov; + int prev = -1; + + // INVARIANT: check_chunk should be called immediately before doing anything with a chunk. + // Because check_chunk is idempotent, it is fine to call it on a chunk that has previously been + // checked. + + for (auto &ent : dict) { + auto *clot = dynamic_cast(ent.what.get()); + // FIXME -- why does the following give an error from gcc? + // auto *clot = ent.what->to(); + unsigned size = ent.what->size(); + + // Finish the current chunk if needed. 
+ if (byte && + (clot || byte + size > CHUNK_SIZE || (prev_pov && *ent.pov.front() != prev_pov))) { + finish_chunk(ch++, entry_n++, prev_pov, byte); + byte = 0; + } + if (ch / CHUNKS_PER_GROUP != group) { + // into a new group + group = ch / CHUNKS_PER_GROUP; + clots_in_group = 0; + } + if (clot) { + // Start a new group if needed. Each group has a maximum number of CLOTs that can be + // deparsed, and CLOTs cannot span multiple groups. + bool out_of_clots_in_group = clots_in_group >= CLOTS_PER_GROUP; + auto chunks_in_clot = (size + CHUNK_SIZE - 1) / CHUNK_SIZE; + bool out_of_chunks_in_group = ch % CHUNKS_PER_GROUP + chunks_in_clot > CHUNKS_PER_GROUP; + if (out_of_clots_in_group || out_of_chunks_in_group) { + // go on to the next group + ch = (ch | (CHUNKS_PER_GROUP - 1)) + 1; + group = ch / CHUNKS_PER_GROUP; + clots_in_group = 0; + } + + // Write the CLOT to the next segment in the current group. + if (chunks_in_clot == CHUNKS_PER_GROUP && (ch % CHUNKS_PER_GROUP)) + error(clot->lineno, "--tof2lab44-workaround incompatible with clot >56 bytes"); + int clot_tag = Parser::clot_tag(clot->gress, clot->tag); + int seg_tag = clots_in_group++; + write_clot(ch, entry_n, seg_tag, clot_tag, ent.pov.front(), clot); + + prev = -1; + } else { + // Phv, Constant, or Checksum + write_chunk(ch, prev_pov, prev, ent.lineno, ent.pov.front(), ent.what.get(), byte, + size); + byte += size; + prev = ent.what->encode(); + } + prev_pov = *ent.pov.front(); + } + + if (byte > 0) { + finish_chunk(ch, entry_n, prev_pov, byte); + } +} + +template +void output_jbay_field_dictionary(int lineno, REGS ®s, POV_FMT &pov_layout, POV &pov, + DICT &dict) { + // Initialize pov_layout. 
+ unsigned byte = 0; + for (auto &r : pov) { + for (int bits = 0; bits < r.first->size; bits += 8) { + if (byte > pov_layout.size()) error(lineno, "Ran out of space in POV in deparser"); + pov_layout[byte++] = r.first->deparser_id(); + } + } + while (byte < pov_layout.size()) pov_layout[byte++] = 0xff; + LOG5("jbay field dictionary:"); + + // Declare some callback functions, and then delegate to helper. + auto write_chunk = [](unsigned ch, const Phv::Slice &prev_pov, int prev, int ent_lineno, + const Phv::Ref &ent_pov, Deparser::FDEntry::Base *ent_what, unsigned byte, + unsigned size) { + // Just do an error check here. Defer actual writing to finish_chunk. + LOG5(" chunk " << ch << ": " << *ent_what << " (pov " << ent_pov << ")"); + if (dynamic_cast(ent_what) && prev_pov == *ent_pov && + int(ent_what->encode()) == prev && (size & 6)) + error(ent_lineno, "16 and 32-bit container cannot be repeatedly deparsed"); + }; + + auto finish_chunk = [&](unsigned ch, unsigned entry_n, const Phv::Slice &pov_bit, + unsigned byte) { + if (check_chunk(lineno, ch)) { + regs.chunk_info[ch].chunk_vld = 1; + regs.chunk_info[ch].pov = pov.at(&pov_bit.reg) + pov_bit.lo; + regs.chunk_info[ch].seg_vld = 0; + regs.chunk_info[ch].seg_slice = byte & 7; + regs.chunk_info[ch].seg_sel = byte >> 3; + } + }; + + auto write_clot = [&](unsigned &ch, unsigned &entry_n, int seg_tag, int clot_tag, + const Phv::Ref &pov_bit, Deparser::FDEntry::Clot *clot) { + const unsigned CHUNKS_PER_GROUP = Target::JBay::DEPARSER_CHUNKS_PER_GROUP; + const int group = ch / CHUNKS_PER_GROUP; + if (group < regs.fd_tags.size()) regs.fd_tags[group].segment_tag[seg_tag] = clot_tag; + LOG5(" chunk " << ch << ": " << *clot << " (pov " << pov_bit << ")"); + for (int i = 0; i < clot->length; i += 8, ++ch) { + // CLOTs cannot span multiple groups. 
+ BUG_CHECK(ch / CHUNKS_PER_GROUP == group || error_count > 0, "CLOT spanning groups"); + if (check_chunk(lineno, ch)) { + regs.chunk_info[ch].chunk_vld = 1; + regs.chunk_info[ch].pov = pov.at(&pov_bit->reg) + pov_bit->lo; + regs.chunk_info[ch].seg_vld = 1; + regs.chunk_info[ch].seg_sel = seg_tag; + regs.chunk_info[ch].seg_slice = i / 8U; + } + } + }; + + output_jbay_field_dictionary_helper(lineno, pov, dict, write_chunk, finish_chunk, write_clot); +} + +template +void output_jbay_field_dictionary_slice(int lineno, CHUNKS &chunk, CLOTS &clots, POV &pov, + DICT &dict, json::vector &fd_gress, + json::vector &fd_entries, gress_t gress) { + json::map fd; + json::map fd_entry; + json::vector chunk_bytes; + json::vector fd_entry_chunk_bytes; + + auto write_chunk = [&](unsigned ch, const Phv::Slice &prev_pov, int prev, int ent_lineno, + const Phv::Ref &ent_pov, Deparser::FDEntry::Base *ent_what, + unsigned byte, unsigned size) { + while (size--) { + json::map chunk_byte; + json::map fd_entry_chunk_byte; + json::map fd_entry_chunk; + chunk_byte["Byte"] = byte; + fd_entry_chunk_byte["chunk_number"] = byte; + if (ent_what->encode() < CONSTANTS_PHVID_JBAY_LOW) { + auto *phv = dynamic_cast(ent_what); + auto phv_reg = phv->reg(); + write_field_name_in_json(phv_reg, &ent_pov->reg, ent_pov->lo, chunk_byte, + fd_entry_chunk, 19, gress); + } else { + write_csum_const_in_json(ent_what->encode(), chunk_byte, fd_entry_chunk, gress); + } + fd_entry_chunk_byte["chunk"] = std::move(fd_entry_chunk); + chunk_bytes.push_back(std::move(chunk_byte)); + fd_entry_chunk_bytes.push_back(std::move(fd_entry_chunk_byte)); + if (check_chunk(lineno, ch)) { + chunk[ch].is_phv |= 1 << byte; + chunk[ch].byte_off.phv_offset[byte++] = ent_what->encode(); + } + } + }; + + auto finish_chunk = [&](unsigned ch, unsigned entry_n, const Phv::Slice &pov_bit, + unsigned byte) { + fd["Field Dictionary Number"] = entry_n; + fd["Field Dictionary Chunk"] = ch; + fd_entry["entry"] = entry_n; + // fd_entry["fde_chunk"] 
= ch; -- requires compiler_interfaces change + Deparser::write_pov_in_json(fd, fd_entry, &pov_bit.reg, pov.at(&pov_bit.reg) + pov_bit.lo, + pov_bit.lo); + if (check_chunk(lineno, ch)) { + chunk[ch].cfg.seg_vld = 0; // no CLOTs yet + chunk[ch].cfg.seg_slice = byte & 7; + chunk[ch].cfg.seg_sel = byte >> 3; + } + + fd["Content"] = std::move(chunk_bytes); + fd_entry["chunks"] = std::move(fd_entry_chunk_bytes); + fd_entries.push_back(std::move(fd_entry)); + fd_gress.push_back(std::move(fd)); + }; + + auto write_clot = [&](unsigned &ch, unsigned &entry_n, int seg_tag, int clot_tag, + const Phv::Ref &pov_bit, Deparser::FDEntry::Clot *clot) { + const unsigned CHUNKS_PER_GROUP = Target::JBay::DEPARSER_CHUNKS_PER_GROUP; + const int group = ch / CHUNKS_PER_GROUP; + if (group < clots.size()) clots[group].segment_tag[seg_tag] = clot_tag; + auto phv_repl = clot->phv_replace.begin(); + auto csum_repl = clot->csum_replace.begin(); + for (int i = 0; i < clot->length; i += 8, ++ch, ++entry_n) { + // CLOTs cannot span multiple groups. 
+ BUG_CHECK(ch / CHUNKS_PER_GROUP == group || error_count > 0, "CLOT spanning groups"); + + fd["Field Dictionary Number"] = entry_n; + fd["Field Dictionary Chunk"] = ch; + fd_entry["entry"] = entry_n; + // fd_entry["fde_chunk"] = ch; -- requires compiler_interfaces change + Deparser::write_pov_in_json(fd, fd_entry, &pov_bit->reg, + pov.at(&pov_bit->reg) + pov_bit->lo, pov_bit->lo); + + if (check_chunk(lineno, ch)) { + chunk[ch].cfg.seg_vld = 1; + chunk[ch].cfg.seg_sel = seg_tag; + chunk[ch].cfg.seg_slice = i / 8U; + } + + for (int j = 0; j < 8 && i + j < clot->length; ++j) { + json::map chunk_byte; + json::map fd_entry_chunk_byte; + json::map fd_entry_chunk; + chunk_byte["Byte"] = j; + fd_entry_chunk_byte["chunk_number"] = j; + if (phv_repl != clot->phv_replace.end() && int(phv_repl->first) <= i + j) { + // This is PHV replaced, PHV is used + chunk[ch].is_phv |= 1 << j; + chunk[ch].byte_off.phv_offset[j] = phv_repl->second->reg.deparser_id(); + auto phv_reg = &phv_repl->second->reg; + write_field_name_in_json(phv_reg, &pov_bit->reg, pov_bit->lo, chunk_byte, + fd_entry_chunk, 19, gress); + if (int(phv_repl->first + phv_repl->second->size() / 8U) <= i + j + 1) + ++phv_repl; + } else if (csum_repl != clot->csum_replace.end() && + int(csum_repl->first) <= i + j) { + if (check_chunk(lineno, ch)) { + chunk[ch].is_phv |= 1 << j; + chunk[ch].byte_off.phv_offset[j] = csum_repl->second.encode(); + } + write_csum_const_in_json(csum_repl->second.encode(), chunk_byte, fd_entry_chunk, + gress); + if (int(csum_repl->first + 2) <= i + j + 1) ++csum_repl; + } else { + if (check_chunk(lineno, ch)) chunk[ch].byte_off.phv_offset[j] = i + j; + chunk_byte["CLOT"] = clot_tag; + chunk_byte["CLOT_OFFSET"] = i + j; + fd_entry_chunk["clot_tag"] = clot_tag; + // fd_entry_chunk["clot_offset"] = i + j; requires compiler_interfaces change + } + fd_entry_chunk_byte["chunk"] = std::move(fd_entry_chunk); + chunk_bytes.push_back(std::move(chunk_byte)); + 
fd_entry_chunk_bytes.push_back(std::move(fd_entry_chunk_byte)); + } + fd["Content"] = std::move(chunk_bytes); + fd_entry["chunks"] = std::move(fd_entry_chunk_bytes); + fd_entries.push_back(std::move(fd_entry)); + fd_gress.push_back(std::move(fd)); + } + }; + + output_jbay_field_dictionary_helper(lineno, pov, dict, write_chunk, finish_chunk, write_clot); +} + +static void check_jbay_ownership(bitvec phv_use[2]) { + unsigned mask = 0; + int group = -1; + for (auto i : phv_use[INGRESS]) { + if ((i | mask) == (group | mask)) continue; + switch (Phv::reg(i)->size) { + case 8: + case 16: + mask = 3; + break; + case 32: + mask = 1; + break; + default: + BUG(); + } + group = i & ~mask; + if (phv_use[EGRESS].getrange(group, mask + 1)) { + error(0, "%s..%s used by both ingress and egress deparser", Phv::reg(group)->name, + Phv::reg(group | mask)->name); + } + } +} + +static void setup_jbay_ownership(bitvec phv_use, ubits_base &phv8, ubits_base &phv16, + ubits_base &phv32) { + std::set phv8_grps, phv16_grps, phv32_grps; + + for (auto i : phv_use) { + auto *reg = Phv::reg(i); + switch (reg->size) { + case 8: + phv8_grps.insert(1U << ((reg->deparser_id() - 64) / 4U)); + break; + case 16: + phv16_grps.insert(1U << ((reg->deparser_id() - 128) / 4U)); + break; + case 32: + phv32_grps.insert(1U << (reg->deparser_id() / 2U)); + break; + default: + BUG(); + } + } + + for (auto v : phv8_grps) phv8 |= v; + for (auto v : phv16_grps) phv16 |= v; + for (auto v : phv32_grps) phv32 |= v; +} + +static short jbay_phv2cksum[224][2] = { + // Entries 0-127 are for 32 bit PHV + // Each 32 bit PHV uses two 16b adders + // The even addresses are for [31:16], the odd addresses are for [15:0] + // Note: The current CSR description of these entries for 32 bit containers is incorrect. 
+ // 128-191 are for 8 bit PHV + // 192-287 are for 16 bit PHV + {1, 0}, {3, 2}, {5, 4}, {7, 6}, {9, 8}, {11, 10}, {13, 12}, {15, 14}, + {17, 16}, {19, 18}, {21, 20}, {23, 22}, {25, 24}, {27, 26}, {29, 28}, {31, 30}, + {33, 32}, {35, 34}, {37, 36}, {39, 38}, {41, 40}, {43, 42}, {45, 44}, {47, 46}, + {49, 48}, {51, 50}, {53, 52}, {55, 54}, {57, 56}, {59, 58}, {61, 60}, {63, 62}, + {65, 64}, {67, 66}, {69, 68}, {71, 70}, {73, 72}, {75, 74}, {77, 76}, {79, 78}, + {81, 80}, {83, 82}, {85, 84}, {87, 86}, {89, 88}, {91, 90}, {93, 92}, {95, 94}, + {97, 96}, {99, 98}, {101, 100}, {103, 102}, {105, 104}, {107, 106}, {109, 108}, {111, 110}, + {113, 112}, {115, 114}, {117, 116}, {119, 118}, {121, 120}, {123, 122}, {125, 124}, {127, 126}, + {128, -1}, {129, -1}, {130, -1}, {131, -1}, {132, -1}, {133, -1}, {134, -1}, {135, -1}, + {136, -1}, {137, -1}, {138, -1}, {139, -1}, {140, -1}, {141, -1}, {142, -1}, {143, -1}, + {144, -1}, {145, -1}, {146, -1}, {147, -1}, {148, -1}, {149, -1}, {150, -1}, {151, -1}, + {152, -1}, {153, -1}, {154, -1}, {155, -1}, {156, -1}, {157, -1}, {158, -1}, {159, -1}, + {160, -1}, {161, -1}, {162, -1}, {163, -1}, {164, -1}, {165, -1}, {166, -1}, {167, -1}, + {168, -1}, {169, -1}, {170, -1}, {171, -1}, {172, -1}, {173, -1}, {174, -1}, {175, -1}, + {176, -1}, {177, -1}, {178, -1}, {179, -1}, {180, -1}, {181, -1}, {182, -1}, {183, -1}, + {184, -1}, {185, -1}, {186, -1}, {187, -1}, {188, -1}, {189, -1}, {190, -1}, {191, -1}, + {192, -1}, {193, -1}, {194, -1}, {195, -1}, {196, -1}, {197, -1}, {198, -1}, {199, -1}, + {200, -1}, {201, -1}, {202, -1}, {203, -1}, {204, -1}, {205, -1}, {206, -1}, {207, -1}, + {208, -1}, {209, -1}, {210, -1}, {211, -1}, {212, -1}, {213, -1}, {214, -1}, {215, -1}, + {216, -1}, {217, -1}, {218, -1}, {219, -1}, {220, -1}, {221, -1}, {222, -1}, {223, -1}, + {224, -1}, {225, -1}, {226, -1}, {227, -1}, {228, -1}, {229, -1}, {230, -1}, {231, -1}, + {232, -1}, {233, -1}, {234, -1}, {235, -1}, {236, -1}, {237, -1}, {238, -1}, {239, -1}, + 
{240, -1}, {241, -1}, {242, -1}, {243, -1}, {244, -1}, {245, -1}, {246, -1}, {247, -1}, + {248, -1}, {249, -1}, {250, -1}, {251, -1}, {252, -1}, {253, -1}, {254, -1}, {255, -1}, + {256, -1}, {257, -1}, {258, -1}, {259, -1}, {260, -1}, {261, -1}, {262, -1}, {263, -1}, + {264, -1}, {265, -1}, {266, -1}, {267, -1}, {268, -1}, {269, -1}, {270, -1}, {271, -1}, + {272, -1}, {273, -1}, {274, -1}, {275, -1}, {276, -1}, {277, -1}, {278, -1}, {279, -1}, + {280, -1}, {281, -1}, {282, -1}, {283, -1}, {284, -1}, {285, -1}, {286, -1}, {287, -1}, +}; + +template +static void write_jbay_checksum_entry(ENTRIES &entry, unsigned mask, int swap, int pov, int id, + const char *reg = nullptr) { + write_checksum_entry(entry, mask, swap, id, reg); + entry.pov = pov; +} + +// Populates pov_map which maps the bit in the main POV array [127:0] +// to bit in the checksum pov array [32:0] +// The checksum pov array is 32 bits / 4 bytes - pov_cfg.byte_set[4]. +// Each element of the pov_cfg.byte_sel array maps to the byte in the main POV array +template +void jbay_csum_pov_config(Phv::Ref povRef, POV &pov_cfg, + ordered_map &pov, + std::map &pov_map, unsigned *prev_byte, + int csum_unit) { + unsigned bit = pov.at(&povRef->reg) + povRef->lo; + if (pov_map.count(bit)) return; + for (unsigned i = 0; i < (*prev_byte); ++i) { + if (pov_cfg.byte_sel[i] == bit / 8U) { + pov_map[bit] = i * 8U + bit % 8U; + break; + } + } + if (pov_map.count(bit)) return; + if (*prev_byte >= (int)pov_cfg.byte_sel.size()) { + error(povRef.lineno, "Checksum unit %d exceeds %d bytes of POV", csum_unit, + (int)pov_cfg.byte_sel.size()); + return; + } + pov_map[bit] = (*prev_byte) * 8U + bit % 8U; + pov_cfg.byte_sel[(*prev_byte)++] = bit / 8U; + return; +} + +template +void set_jbay_pov_cfg(POV &pov_cfg, std::map &pov_map, + Deparser::FullChecksumUnit &full_csum, + ordered_map &pov, int csum_unit, + unsigned *prev_byte) { + for (auto &unit_entry : full_csum.entries) { + for (auto val : unit_entry.second) { + if 
(val.pov.size() != 1) { + error(val.val.lineno, "one POV bit required for Tofino2"); + continue; + } + jbay_csum_pov_config(val.pov.front(), pov_cfg, pov, pov_map, prev_byte, csum_unit); + } + } + for (auto &val : full_csum.clot_entries) { + if (val.pov.size() != 1) { + error(val.val.lineno, "one POV bit required for Tofino2"); + continue; + } + jbay_csum_pov_config(val.pov.front(), pov_cfg, pov, pov_map, prev_byte, csum_unit); + } + for (auto &checksum_pov : full_csum.pov) { + jbay_csum_pov_config(checksum_pov.second, pov_cfg, pov, pov_map, prev_byte, csum_unit); + } + return; +} + +template +void write_jbay_full_checksum_config( + CSUM &csum, ENTRIES &phv_entries, int unit, std::set &visited, + std::array, MAX_DEPARSER_CHECKSUM_UNITS> &pov_map, + Deparser::FullChecksumUnit &full_csum, ordered_map &pov) { + for (auto &unit_entry : full_csum.entries) { + // Same partial checksum unit can be used in multiple full checksum unit. + // No need to rewrite the checksum entries multiple times for the same unit + if (visited.count(unit_entry.first)) continue; + visited.insert(unit_entry.first); + for (auto val : unit_entry.second) { + if (val.pov.size() != 1) continue; + int povbit = + pov_map[unit_entry.first].at(pov.at(&val.pov.front()->reg) + val.pov.front()->lo); + int mask = val.mask; + int swap = val.swap; + auto &remap = jbay_phv2cksum[val->reg.deparser_id()]; + write_jbay_checksum_entry(phv_entries[unit_entry.first].entry[remap[0]], mask & 3, + swap & 1, povbit, unit_entry.first, val->reg.name); + if (remap[1] >= 0) + write_jbay_checksum_entry(phv_entries[unit_entry.first].entry[remap[1]], mask >> 2, + swap >> 1, povbit, unit_entry.first, val->reg.name); + else + BUG_CHECK((mask >> 2 == 0) && (swap >> 1 == 0)); + } + } + int tag_idx = 0; + for (auto &val : full_csum.clot_entries) { + if (val.pov.size() != 1) continue; + int povbit = pov_map[unit].at(pov.at(&val.pov.front()->reg) + val.pov.front()->lo); + if (tag_idx == 16) error(-1, "Ran out of clot entries in 
deparser checksum unit %d", unit); + csum.clot_entry[tag_idx].pov = povbit; + csum.clot_entry[tag_idx].vld = 1; + csum.tags[tag_idx].tag = val.tag; + tag_idx++; + } + for (auto &checksum_pov : full_csum.pov) { + csum.phv_entry[checksum_pov.first].pov = + pov_map[unit].at(pov.at(&checksum_pov.second->reg) + checksum_pov.second->lo); + csum.phv_entry[checksum_pov.first].vld = 1; + } + csum.zeros_as_ones.en = full_csum.zeros_as_ones_en; + + // FIXME -- use/set csum.csum_constant? +} +// Engine 0: scratch[23:0] +// Engine 1: { scratch2[15:0], scratch[31:24] } +// Engine 2: { scratch[7:0] , scratch2[31:16] } +// Engine 3: scratch[31:8] +// So each engine gets a cfg_vector[23:0] +// There are 16 CLOT csums and 8 PHV csums that can be inverted: +// CLOT csum [15:0] are controlled by cfg_vector [15:0] +// PHV csums [7:0] are controlled by cfg_vector [23:16] + +template +void write_jbay_full_checksum_invert_config(SCRATCH1 &scratch1, SCRATCH2 &scratch2, + SCRATCH3 &scratch3, int unit, + Deparser::FullChecksumUnit &full_csum) { + ubits<32> value1; + ubits<32> value2; + ubits<32> value3; + for (auto checksum_unit : full_csum.checksum_unit_invert) { + if (unit == 0) { + value1 |= (1 << (16 + checksum_unit)); + } else if (unit == 1) { + value1 |= (1 << (8 + checksum_unit)); + } else if (unit == 2) { + value3 |= (1 << checksum_unit); + } else if (unit == 3) { + value3 |= (1 << (24 + checksum_unit)); + } + } + for (auto clot_tag : full_csum.clot_tag_invert) { + if (unit == 0) { + value1 |= (1 << clot_tag); + } else if (unit == 1) { + if (clot_tag > 7) { + value1 |= (1 << (clot_tag - 8)); + } else { + value3 |= (1 << (16 + clot_tag)); + } + } else if (unit == 2) { + value2 |= (1 << (16 + clot_tag)); + } else if (unit == 3) { + value3 |= (1 << (8 + clot_tag)); + } + } + if (value1 || value2 || value3) { + scratch1.value |= value1; + scratch2.value |= value2; + scratch3.value |= value3; + } + return; +} + +template +void write_jbay_constant_config(CONS &cons, const std::set &vals) { 
+ unsigned idx = 0; + for (auto v : vals) { + cons[idx] = v; + idx++; + } +} + +template <> +void Deparser::write_config(Target::JBay::deparser_regs ®s) { + regs.dprsrreg.inp.icr.disable(); // disable this whole tree + regs.dprsrreg.inp.icr.disabled_ = false; // then enable just certain subtrees + regs.dprsrreg.inp.icr.csum_engine.enable(); + regs.dprsrreg.inp.icr.egr.enable(); + regs.dprsrreg.inp.icr.egr_meta_pov.enable(); + regs.dprsrreg.inp.icr.ingr.enable(); + regs.dprsrreg.inp.icr.ingr_meta_pov.enable(); + regs.dprsrreg.inp.icr.scratch.enable(); + regs.dprsrreg.inp.icr.scratch2.enable(); + regs.dprsrreg.inp.ipp.scratch.enable(); + regs.dprsrreg.inp.iim.disable(); + regs.dprsrreg.inpslice.disable(); + for (auto &r : regs.dprsrreg.ho_i) r.out_ingr.disable(); + for (auto &r : regs.dprsrreg.ho_e) r.out_egr.disable(); + + for (auto &r : regs.dprsrreg.ho_i) + write_jbay_constant_config(r.hir.h.hdr_xbar_const.value, constants[INGRESS]); + for (auto &r : regs.dprsrreg.ho_e) + write_jbay_constant_config(r.her.h.hdr_xbar_const.value, constants[EGRESS]); + std::set visited_i; + std::array, MAX_DEPARSER_CHECKSUM_UNITS> pov_map_i; + for (int csum_unit = 0; csum_unit < Target::JBay::DEPARSER_CHECKSUM_UNITS; csum_unit++) { + unsigned prev_byte = 0; + if (full_checksum_unit[INGRESS][csum_unit].clot_entries.empty() && + full_checksum_unit[INGRESS][csum_unit].entries.empty()) + continue; + set_jbay_pov_cfg(regs.dprsrreg.inp.ipp.phv_csum_pov_cfg.csum_pov_cfg[csum_unit], + pov_map_i[csum_unit], full_checksum_unit[INGRESS][csum_unit], pov[INGRESS], + csum_unit, &prev_byte); + if (error_count > 0) break; + } + for (int csum_unit = 0; csum_unit < Target::JBay::DEPARSER_CHECKSUM_UNITS && error_count == 0; + csum_unit++) { + if (full_checksum_unit[INGRESS][csum_unit].clot_entries.empty() && + full_checksum_unit[INGRESS][csum_unit].entries.empty()) + continue; + regs.dprsrreg.inp.ipp.phv_csum_pov_cfg.thread.thread[csum_unit] = INGRESS; + write_jbay_full_checksum_config( + 
regs.dprsrreg.inp.icr.csum_engine[csum_unit], regs.dprsrreg.inp.ipp_m.i_csum.engine, + csum_unit, visited_i, pov_map_i, full_checksum_unit[INGRESS][csum_unit], pov[INGRESS]); + write_jbay_full_checksum_invert_config( + regs.dprsrreg.inp.icr.scratch, regs.dprsrreg.inp.icr.scratch2, + regs.dprsrreg.inp.ipp.scratch, csum_unit, full_checksum_unit[INGRESS][csum_unit]); + } + std::set visited_e; + std::array, MAX_DEPARSER_CHECKSUM_UNITS> pov_map_e; + for (int csum_unit = 0; csum_unit < Target::JBay::DEPARSER_CHECKSUM_UNITS; csum_unit++) { + unsigned prev_byte = 0; + if (full_checksum_unit[EGRESS][csum_unit].clot_entries.empty() && + full_checksum_unit[EGRESS][csum_unit].entries.empty()) + continue; + set_jbay_pov_cfg(regs.dprsrreg.inp.ipp.phv_csum_pov_cfg.csum_pov_cfg[csum_unit], + pov_map_e[csum_unit], full_checksum_unit[EGRESS][csum_unit], pov[EGRESS], + csum_unit, &prev_byte); + if (error_count > 0) break; + } + for (int csum_unit = 0; csum_unit < Target::JBay::DEPARSER_CHECKSUM_UNITS && error_count == 0; + csum_unit++) { + if (full_checksum_unit[EGRESS][csum_unit].clot_entries.empty() && + full_checksum_unit[EGRESS][csum_unit].entries.empty()) + continue; + regs.dprsrreg.inp.ipp.phv_csum_pov_cfg.thread.thread[csum_unit] = EGRESS; + write_jbay_full_checksum_config( + regs.dprsrreg.inp.icr.csum_engine[csum_unit], regs.dprsrreg.inp.ipp_m.i_csum.engine, + csum_unit, visited_e, pov_map_e, full_checksum_unit[EGRESS][csum_unit], pov[EGRESS]); + write_jbay_full_checksum_invert_config( + regs.dprsrreg.inp.icr.scratch, regs.dprsrreg.inp.icr.scratch2, + regs.dprsrreg.inp.ipp.scratch, csum_unit, full_checksum_unit[EGRESS][csum_unit]); + } + + output_jbay_field_dictionary(lineno[INGRESS], regs.dprsrreg.inp.icr.ingr, + regs.dprsrreg.inp.ipp.main_i.pov.phvs, pov[INGRESS], + dictionary[INGRESS]); + json::map field_dictionary_alloc; + json::vector fde_entries_i; + json::vector fde_entries_e; + json::vector fde_entries; + json::vector fd_gress; + for (auto &rslice : 
regs.dprsrreg.ho_i) { + output_jbay_field_dictionary_slice(lineno[INGRESS], rslice.him.fd_compress.chunk, + rslice.hir.h.compress_clot_sel, pov[INGRESS], + dictionary[INGRESS], fd_gress, fde_entries, INGRESS); + field_dictionary_alloc["ingress"] = std::move(fd_gress); + fde_entries_i = std::move(fde_entries); + } + output_jbay_field_dictionary(lineno[EGRESS], regs.dprsrreg.inp.icr.egr, + regs.dprsrreg.inp.ipp.main_e.pov.phvs, pov[EGRESS], + dictionary[EGRESS]); + for (auto &rslice : regs.dprsrreg.ho_e) { + output_jbay_field_dictionary_slice(lineno[EGRESS], rslice.hem.fd_compress.chunk, + rslice.her.h.compress_clot_sel, pov[EGRESS], + dictionary[EGRESS], fd_gress, fde_entries, EGRESS); + field_dictionary_alloc["egress"] = std::move(fd_gress); + fde_entries_e = std::move(fde_entries); + } + if (Log::verbosity() > 0) { + auto json_dump = open_output("logs/field_dictionary.log"); + *json_dump << &field_dictionary_alloc; + } + // Output deparser resources + report_resources_deparser_json(fde_entries_i, fde_entries_e); + + if (Phv::use(INGRESS).intersects(Phv::use(EGRESS))) { + if (!options.match_compiler) { + error(lineno[INGRESS], "Registers used in both ingress and egress in pipeline: %s", + Phv::db_regset(Phv::use(INGRESS) & Phv::use(EGRESS)).c_str()); + } else { + warning(lineno[INGRESS], "Registers used in both ingress and egress in pipeline: %s", + Phv::db_regset(Phv::use(INGRESS) & Phv::use(EGRESS)).c_str()); + } + /* FIXME -- this only (sort-of) works because 'deparser' comes first in the alphabet, + * FIXME -- so is the first section to have its 'output' method run. 
Its a hack + * FIXME -- anyways to attempt to correct broken asm that should be an error */ + Phv::unsetuse(INGRESS, phv_use[EGRESS]); + Phv::unsetuse(EGRESS, phv_use[INGRESS]); + } + + check_jbay_ownership(phv_use); + regs.dprsrreg.inp.icr.i_phv8_grp.enable(); + regs.dprsrreg.inp.icr.i_phv16_grp.enable(); + regs.dprsrreg.inp.icr.i_phv32_grp.enable(); + // regs.dprsrreg.inp.icr.scratch.enable(); + regs.dprsrreg.inp.icr.i_phv8_grp.val = 0; + regs.dprsrreg.inp.icr.i_phv16_grp.val = 0; + regs.dprsrreg.inp.icr.i_phv32_grp.val = 0; + // regs.dprsrreg.inp.icr.scratch.value = 0; + setup_jbay_ownership(phv_use[INGRESS], regs.dprsrreg.inp.icr.i_phv8_grp.val, + regs.dprsrreg.inp.icr.i_phv16_grp.val, + regs.dprsrreg.inp.icr.i_phv32_grp.val); + regs.dprsrreg.inp.icr.e_phv8_grp.enable(); + regs.dprsrreg.inp.icr.e_phv16_grp.enable(); + regs.dprsrreg.inp.icr.e_phv32_grp.enable(); + setup_jbay_ownership(phv_use[EGRESS], regs.dprsrreg.inp.icr.e_phv8_grp.val, + regs.dprsrreg.inp.icr.e_phv16_grp.val, + regs.dprsrreg.inp.icr.e_phv32_grp.val); + + for (auto &intrin : intrinsics) intrin.type->setregs(regs, *this, intrin); + + /* resubmit_mode specifies whether this pipe can perform a resubmit operation on + a packet. i.e. tell the IPB to resubmit a packet to the MAU pipeline for a second + time. If the compiler determines that no resubmit is possible, then it can set this + bit, which should lower latency in some circumstances. + 0 = Resubmit is allowed. 
1 = Resubmit is not allowed */ + bool resubmit = false; + for (auto &digest : digests) { + if (digest.type->name == "resubmit" || + digest.type->name == "resubmit_preserving_field_list") { + resubmit = true; + break; + } + } + if (resubmit) + regs.dprsrreg.inp.ipp.ingr.resubmit_mode.mode = 0; + else + regs.dprsrreg.inp.ipp.ingr.resubmit_mode.mode = 1; + + for (auto &digest : digests) digest.type->setregs(regs, *this, digest); + + /* Set learning digest mask for JBay */ + for (auto &digest : digests) { + if (digest.type->name == "learning") { + regs.dprsrreg.inp.icr.lrnmask.enable(); + for (auto &set : digest.layout) { + int id = set.first; + int len = regs.dprsrreg.inp.ipp.ingr.learn_tbl[id].len; + if (len == 0) continue; // Allow empty param list + + // Fix for TF2LAB-37s: + // This fixes a hardware limitation where the container following + // the last PHV used cannot be the same non 8 bit container as the last entry. + // E.g. For len = 5, (active entries start at index 47) + // Used - PHV[47] ... PHV[43] = 0; + // Unused - PHV[42] ... PHV[0] = 0; // Defaults to 0 + // This causes issues in hardware as container 0 is used. + // We fix by setting the default as 64 an 8 - bit container. It can be any + // other 8 bit container value. + // The hardware does not cause any issues for 8 bit conatiners. + for (int i = 47 - len; i >= 0; i--) + regs.dprsrreg.inp.ipp.ingr.learn_tbl[id].phvs[i] = 64; + // Fix for TF2LAB-37 end + + // Create a bitvec of all phv masks stacked up next to each + // other in big-endian. 'setregs' above stacks the digest fields + // in a similar manner to setup the phvs per byte on learn_tbl + // regs. 
To illustrate with an example - tna_digest.p4 (since + // this is not clear based on reg descriptions); + // + // BFA Output: + // + // learning: + // select: { B1(0..2): B0(1) } # L[0..2]b: + // ingress::ig_intr_md_for_dprsr.digest_type 0: + // - B1(0..2) # L[0..2]b: ingress::ig_intr_md_for_dprsr.digest_type + // - MW0 # ingress::hdr.ethernet.dst_addr.16-47 + // - MH1 # ingress::hdr.ethernet.dst_addr.0-15 + // - MH0(0..8) # L[0..8]b: ingress::ig_md.port + // - MW1 # ingress::hdr.ethernet.src_addr.16-47 + // - MH2 # ingress::hdr.ethernet.src_addr.0-15 + // + // PHV packing for digest, + // + // B1(7..0) | MW0 (31..24) | MW0(23..16) | MW0(15..8) | + // MW0(7..0) | MH1 (15..8) | MH1(7..0) | MH0(16..8) | + // MH0(7..0) | MW1 (31..24) | MW1(23..16) | MW1(15..8) | + // MW1(7..0) | MH2 (15..8) | MH2(7..0) | ---------- | + // + // Learn Mask Regs for above digest + // deparser.regs.dprsrreg.inp.icr.lrnmask[0].mask[11] = 4294967047 (0x07ffffff) + // deparser.regs.dprsrreg.inp.icr.lrnmask[0].mask[10] = 4294967295 (0xffffff01) + // deparser.regs.dprsrreg.inp.icr.lrnmask[0].mask[9] = 4278321151 (0xffffffff) + // deparser.regs.dprsrreg.inp.icr.lrnmask[0].mask[8] = 4294967040 (0xffffff00) + + bitvec lrnmask; + int startBit = 0; + int size = 0; + for (auto p : set.second) { + if (size > 0) lrnmask <<= p->reg.size; + auto psliceSize = p.size(); + startBit = p.lobit(); + lrnmask.setrange(startBit, psliceSize); + size += p->reg.size; + } + // Pad to a 32 bit word + auto shift = (size % 32) ? (32 - (size % 32)) : 0; + lrnmask <<= shift; + int num_words = (size + 31) / 32; + int quanta_index = 11; + for (int index = num_words - 1; index >= 0; index--) { + BUG_CHECK(quanta_index >= 0); + unsigned word = lrnmask.getrange(index * 32, 32); + regs.dprsrreg.inp.icr.lrnmask[id].mask[quanta_index--] = word; + } + } + } + } + +#define DISBALE_IF_NOT_SET(ISARRAY, ARRAY, REGS, DISABLE) \ + ISARRAY(for (auto &r : ARRAY)) if (!ISARRAY(r.) REGS.modified()) ISARRAY(r.) 
REGS.DISABLE = 1; + JBAY_DISABLE_REGBITS(DISBALE_IF_NOT_SET) + + if (options.condense_json) regs.disable_if_reset_value(); + if (error_count == 0 && options.gen_json) + regs.emit_json(*open_output("regs.deparser.cfg.json")); + TopLevel::regs()->reg_pipe.pardereg.dprsrreg.set("regs.deparser", ®s); +} + +#if 0 +namespace { +static struct JbayChecksumReg : public Phv::Register { + JbayChecksumReg(int unit) : Phv::Register("", Phv::Register::CHECKSUM, unit, + unit+CONSTANTS_PHVID_JBAY_HIGH, 16) { + snprintf(name, "csum%d", unit); } + int deparser_id() const override { return uid; } +} jbay_checksum_units[8] = { {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7} }; +} + +template<> Phv::Slice Deparser::RefOrChksum::lookup() const { + if (lo != hi || lo < 0 || lo >= Target::JBay::DEPARSER_CHECKSUM_UNITS) { + error(lineno, "Invalid checksum unit number"); + return Phv::Slice(); } + return Phv::Slice(tofino_checksum_units[lo], 0, 15); +} +#endif + +template <> +unsigned Deparser::FDEntry::Checksum::encode() { + return CONSTANTS_PHVID_JBAY_HIGH + unit; +} + +template <> +unsigned Deparser::FDEntry::Constant::encode() { + return CONSTANTS_PHVID_JBAY_LOW + Deparser::constant_idx(gress, val); +} + +template <> +void Deparser::gen_learn_quanta(Target::JBay::parser_regs ®s, json::vector &learn_quanta) {} + +template <> +void Deparser::process(Target::JBay *) { + // Chip-specific code for process method + // None for JBay +} diff --git a/backends/tofino/bf-asm/jbay/gateway.cpp b/backends/tofino/bf-asm/jbay/gateway.cpp new file mode 100644 index 00000000000..9f7da083748 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/gateway.cpp @@ -0,0 +1,99 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/jbay/gateway.h" + +#include "backends/tofino/bf-asm/stage.h" + +void Target::Tofino::GatewayTable::write_next_table_regs(Target::JBay::mau_regs ®s) { + auto &merge = regs.rams.match.merge; + if (need_next_map_lut) merge.next_table_map_en_gateway |= 1U << logical_id; + int idx = 3; + for (auto &line : table) { + BUG_CHECK(idx >= 0); + if (!line.run_table) { + if (need_next_map_lut) + merge.gateway_next_table_lut[logical_id][idx] = line.next_map_lut; + else + merge.gateway_next_table_lut[logical_id][idx] = line.next.next_table_id(); + } + --idx; + } + if (!miss.run_table) { + if (need_next_map_lut) + merge.gateway_next_table_lut[logical_id][4] = miss.next_map_lut; + else + merge.gateway_next_table_lut[logical_id][4] = miss.next.next_table_id(); + } + if (!match_table && need_next_map_lut) { + // Factor with common code in jbay/match_table.cpp write_next_table_regs + merge.next_table_map_en |= 1U << logical_id; + int i = 0; + for (auto &n : extra_next_lut) { + merge.pred_map_loca[logical_id][i].pred_map_loca_next_table = n.next_table_id(); + merge.pred_map_loca[logical_id][i].pred_map_loca_exec = + n.next_in_stage(stage->stageno) >> 1; + merge.pred_map_glob[logical_id][i].pred_map_glob_exec = + n.next_in_stage(stage->stageno + 1); + merge.pred_map_glob[logical_id][i].pred_map_long_brch |= n.long_branch_tags(); + ++i; + } + // is this needed? 
The model complains if we leave the unused slots as 0 + while (i < Target::NEXT_TABLE_SUCCESSOR_TABLE_DEPTH()) + merge.pred_map_loca[logical_id][i++].pred_map_loca_next_table = 0x1ff; + } +} + +template <> +void GatewayTable::standalone_write_regs(Target::JBay::mau_regs ®s) { + // FIXME -- factor this with JBay MatchTable::write_regs + auto &merge = regs.rams.match.merge; + if (gress == GHOST) merge.pred_ghost_thread |= 1 << logical_id; + merge.pred_glob_exec_thread[gress] |= 1 << logical_id; + if (always_run || pred.empty()) merge.pred_always_run[gress] |= 1 << logical_id; + + if (long_branch_input >= 0) + setup_muxctl(merge.pred_long_brch_lt_src[logical_id], long_branch_input); + + bool is_branch = (miss.next.next_table() != nullptr); + if (!is_branch && !need_next_map_lut) { + for (auto &line : table) { + if (line.next.next_table() != nullptr) { + is_branch = true; + break; + } + } + } + if (!is_branch) + for (auto &n : hit_next) + if (n.next_table() != nullptr) { + is_branch = true; + break; + } + if (!is_branch) + for (auto &n : extra_next_lut) + if (n.next_table() != nullptr) { + is_branch = true; + break; + } + if (is_branch) merge.pred_is_a_brch |= 1 << logical_id; + + merge.mpr_glob_exec_thread |= merge.logical_table_thread[0].logical_table_thread_egress & + ~merge.logical_table_thread[0].logical_table_thread_ingress & + ~merge.pred_ghost_thread; +} +template void GatewayTable::standalone_write_regs(Target::JBay::mau_regs ®s); diff --git a/backends/tofino/bf-asm/jbay/gateway.h b/backends/tofino/bf-asm/jbay/gateway.h new file mode 100644 index 00000000000..3700f07f9dd --- /dev/null +++ b/backends/tofino/bf-asm/jbay/gateway.h @@ -0,0 +1,27 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JBAY_GATEWAY_H_ +#define BACKENDS_TOFINO_BF_ASM_JBAY_GATEWAY_H_ + +#include "backends/tofino/bf-asm/tables.h" +#include "backends/tofino/bf-asm/tofino/gateway.h" + +template <> +void GatewayTable::standalone_write_regs(Target::JBay::mau_regs ®s); + +#endif /* BACKENDS_TOFINO_BF_ASM_JBAY_GATEWAY_H_ */ diff --git a/backends/tofino/bf-asm/jbay/input_xbar.cpp b/backends/tofino/bf-asm/jbay/input_xbar.cpp new file mode 100644 index 00000000000..e0e7e7e61b7 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/input_xbar.cpp @@ -0,0 +1,70 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/jbay/input_xbar.h" + +template <> +void InputXbar::write_galois_matrix(Target::JBay::mau_regs ®s, HashTable id, + const std::map &mat) { + int parity_col = -1; + BUG_CHECK(id.type == HashTable::EXACT, "not an exact hash table %d", id.type); + if (hash_table_parity.count(id) && !options.disable_gfm_parity) { + parity_col = hash_table_parity[id]; + } + auto &hash = regs.dp.xbar_hash.hash; + std::set gfm_rows; + for (auto &col : mat) { + int c = col.first; + // Skip parity column encoding, if parity is set overall parity is + // computed later below + if (c == parity_col) continue; + const HashCol &h = col.second; + for (int word = 0; word < 4; word++) { + unsigned data = h.data.getrange(word * 16, 16); + if (data == 0) continue; + auto &w = hash.galois_field_matrix[id.index * 4 + word][c]; + w.byte0 = data & 0xff; + w.byte1 = (data >> 8) & 0xff; + gfm_rows.insert(id.index * 4 + word); + } + } + // A GFM row can be shared by multiple tables. In most cases the columns are + // non overlapping but if they are overlapping the GFM encodings must be the + // same (e.g. ATCAM tables). The input xbar has checks to determine which + // cases are valid. + // The parity must be computed for all columns within the row and set into + // the parity column. 
+ if (parity_col >= 0) { + for (auto r : gfm_rows) { + int hp_byte0 = 0, hp_byte1 = 0; + for (auto c = 0; c < 52; c++) { + if (c == parity_col) continue; + auto &w = hash.galois_field_matrix[r][c]; + hp_byte0 ^= w.byte0; + hp_byte1 ^= w.byte1; + } + auto &w_hp = hash.galois_field_matrix[r][parity_col]; + w_hp.byte0.rewrite(); + w_hp.byte1.rewrite(); + w_hp.byte0 = hp_byte0; + w_hp.byte1 = hp_byte1; + } + } +} + +template void InputXbar::write_galois_matrix(Target::JBay::mau_regs ®s, HashTable id, + const std::map &mat); diff --git a/backends/tofino/bf-asm/jbay/input_xbar.h b/backends/tofino/bf-asm/jbay/input_xbar.h new file mode 100644 index 00000000000..45b11f3c581 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/input_xbar.h @@ -0,0 +1,27 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JBAY_INPUT_XBAR_H_ +#define BACKENDS_TOFINO_BF_ASM_JBAY_INPUT_XBAR_H_ + +#include "backends/tofino/bf-asm/input_xbar.h" + +template <> +void InputXbar::write_galois_matrix(Target::JBay::mau_regs ®s, HashTable id, + const std::map &mat); + +#endif /* BACKENDS_TOFINO_BF_ASM_JBAY_INPUT_XBAR_H_ */ diff --git a/backends/tofino/bf-asm/jbay/instruction.cpp b/backends/tofino/bf-asm/jbay/instruction.cpp new file mode 100644 index 00000000000..1eee6043d2f --- /dev/null +++ b/backends/tofino/bf-asm/jbay/instruction.cpp @@ -0,0 +1,200 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* JBay overloads for instructions #included in instruction.cpp + * WARNING -- this is included in an anonymous namespace, as VLIWInstruction is + * in that anonymous namespace */ + +template +void VLIWInstruction::write_regs_2(REGS ®s, Table *tbl, Table::Actions::Action *act) { + if (act != tbl->stage->imem_addr_use[imem_thread(tbl->gress)][act->addr]) { + LOG3("skipping " << tbl->name() << '.' 
<< act->name << " as its imem is used by " + << tbl->stage->imem_addr_use[imem_thread(tbl->gress)][act->addr]->name); + return; + } + LOG2(this); + auto &imem = regs.dp.imem; + int iaddr = act->addr / ACTION_IMEM_COLORS; + int color = act->addr % ACTION_IMEM_COLORS; + unsigned bits = encode(); + BUG_CHECK(slot >= 0); + unsigned off = slot % Phv::mau_groupsize(); + unsigned side = 0, group = 0; + switch (slot / Phv::mau_groupsize()) { + case 0: + side = 0; + group = 0; + break; + case 1: + side = 0; + group = 1; + break; + case 2: + side = 1; + group = 0; + break; + case 3: + side = 1; + group = 1; + break; + case 4: + side = 0; + group = 0; + break; + case 5: + side = 0; + group = 1; + break; + case 6: + side = 1; + group = 0; + break; + case 7: + side = 1; + group = 1; + break; + case 8: + side = 0; + group = 0; + break; + case 9: + side = 0; + group = 1; + break; + case 10: + side = 0; + group = 2; + break; + case 11: + side = 1; + group = 0; + break; + case 12: + side = 1; + group = 1; + break; + case 13: + side = 1; + group = 2; + break; + default: + BUG(); + } + + switch (Phv::reg(slot)->type) { + case Phv::Register::NORMAL: + switch (Phv::reg(slot)->size) { + case 8: + BUG_CHECK(group == 0 || group == 1); + imem.imem_subword8[side][group][off][iaddr].imem_subword8_instr = bits; + imem.imem_subword8[side][group][off][iaddr].imem_subword8_color = color; + imem.imem_subword8[side][group][off][iaddr].imem_subword8_parity = + parity(bits) ^ color; + break; + case 16: + imem.imem_subword16[side][group][off][iaddr].imem_subword16_instr = bits; + imem.imem_subword16[side][group][off][iaddr].imem_subword16_color = color; + imem.imem_subword16[side][group][off][iaddr].imem_subword16_parity = + parity(bits) ^ color; + break; + case 32: + BUG_CHECK(group == 0 || group == 1); + imem.imem_subword32[side][group][off][iaddr].imem_subword32_instr = bits; + imem.imem_subword32[side][group][off][iaddr].imem_subword32_color = color; + 
imem.imem_subword32[side][group][off][iaddr].imem_subword32_parity = + parity(bits) ^ color; + break; + default: + BUG(); + } + break; + case Phv::Register::MOCHA: + switch (Phv::reg(slot)->size) { + case 8: + BUG_CHECK(group == 0 || group == 1); + imem.imem_mocha_subword8[side][group][off - 12][iaddr] + .imem_mocha_subword_instr = bits; + imem.imem_mocha_subword8[side][group][off - 12][iaddr] + .imem_mocha_subword_color = color; + imem.imem_mocha_subword8[side][group][off - 12][iaddr] + .imem_mocha_subword_parity = parity(bits) ^ color; + break; + case 16: + imem.imem_mocha_subword16[side][group][off - 12][iaddr] + .imem_mocha_subword_instr = bits; + imem.imem_mocha_subword16[side][group][off - 12][iaddr] + .imem_mocha_subword_color = color; + imem.imem_mocha_subword16[side][group][off - 12][iaddr] + .imem_mocha_subword_parity = parity(bits) ^ color; + break; + case 32: + BUG_CHECK(group == 0 || group == 1); + imem.imem_mocha_subword32[side][group][off - 12][iaddr] + .imem_mocha_subword_instr = bits; + imem.imem_mocha_subword32[side][group][off - 12][iaddr] + .imem_mocha_subword_color = color; + imem.imem_mocha_subword32[side][group][off - 12][iaddr] + .imem_mocha_subword_parity = parity(bits) ^ color; + break; + default: + BUG(); + } + break; + case Phv::Register::DARK: + switch (Phv::reg(slot)->size) { + case 8: + BUG_CHECK(group == 0 || group == 1); + imem.imem_dark_subword8[side][group][off - 16][iaddr].imem_dark_subword_instr = + bits; + imem.imem_dark_subword8[side][group][off - 16][iaddr].imem_dark_subword_color = + color; + imem.imem_dark_subword8[side][group][off - 16][iaddr].imem_dark_subword_parity = + parity(bits) ^ color; + break; + case 16: + imem.imem_dark_subword16[side][group][off - 16][iaddr].imem_dark_subword_instr = + bits; + imem.imem_dark_subword16[side][group][off - 16][iaddr].imem_dark_subword_color = + color; + imem.imem_dark_subword16[side][group][off - 16][iaddr] + .imem_dark_subword_parity = parity(bits) ^ color; + break; + case 32: + 
BUG_CHECK(group == 0 || group == 1); + imem.imem_dark_subword32[side][group][off - 16][iaddr].imem_dark_subword_instr = + bits; + imem.imem_dark_subword32[side][group][off - 16][iaddr].imem_dark_subword_color = + color; + imem.imem_dark_subword32[side][group][off - 16][iaddr] + .imem_dark_subword_parity = parity(bits) ^ color; + break; + default: + BUG(); + } + break; + default: + BUG(); + } + + auto &power_ctl = regs.dp.actionmux_din_power_ctl; + phvRead([&](const Phv::Slice &sl) { set_power_ctl_reg(power_ctl, sl.reg.mau_id()); }); +} + +void VLIWInstruction::write_regs(Target::JBay::mau_regs ®s, Table *tbl, + Table::Actions::Action *act) { + write_regs_2(regs, tbl, act); +} diff --git a/backends/tofino/bf-asm/jbay/match_table.cpp b/backends/tofino/bf-asm/jbay/match_table.cpp new file mode 100644 index 00000000000..cd177a86af8 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/match_table.cpp @@ -0,0 +1,130 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* mau table template specializations for jbay -- #included directly in match_tables.cpp */ + +/** Write next table setup, which is JBay-specific. 
/* (doc comment continues from the previous patch line)
 * `tbl` here is the ternary indirect
 * table if there is one, or the match table otherwise */
template <>
void MatchTable::write_next_table_regs(Target::JBay::mau_regs &regs, Table *tbl) {
    auto &merge = regs.rams.match.merge;
    // Program the hit-side next-table map: local-stage and global-exec entries plus
    // long-branch tags, for both the normal hit entries and any extra LUT entries.
    if (!hit_next.empty() || !extra_next_lut.empty()) {
        merge.next_table_map_en |= (1U << logical_id);
        int i = 0;
        for (auto &n : hit_next) {
            merge.pred_map_loca[logical_id][i].pred_map_loca_next_table = n.next_table_id();
            merge.pred_map_loca[logical_id][i].pred_map_loca_exec =
                n.next_in_stage(stage->stageno) >> 1;
            merge.pred_map_glob[logical_id][i].pred_map_glob_exec =
                n.next_in_stage(stage->stageno + 1);
            merge.pred_map_glob[logical_id][i].pred_map_long_brch |= n.long_branch_tags();
            ++i;
        }
        for (auto &n : extra_next_lut) {
            merge.pred_map_loca[logical_id][i].pred_map_loca_next_table = n.next_table_id();
            merge.pred_map_loca[logical_id][i].pred_map_loca_exec =
                n.next_in_stage(stage->stageno) >> 1;
            merge.pred_map_glob[logical_id][i].pred_map_glob_exec =
                n.next_in_stage(stage->stageno + 1);
            merge.pred_map_glob[logical_id][i].pred_map_long_brch |= n.long_branch_tags();
            ++i;
        }
        // is this needed?  The model complains if we leave the unused slots as 0
        while (i < Target::NEXT_TABLE_SUCCESSOR_TABLE_DEPTH())
            merge.pred_map_loca[logical_id][i++].pred_map_loca_next_table = 0x1ff;
    }

    // Miss-side next-table configuration: address mask, miss value, and the
    // local/global exec + long-branch setup for the miss successor.
    merge.next_table_format_data[logical_id].match_next_table_adr_mask = next_table_adr_mask;
    merge.next_table_format_data[logical_id].match_next_table_adr_miss_value =
        miss_next.next_table_id();
    merge.pred_miss_exec[logical_id].pred_miss_loca_exec =
        miss_next.next_in_stage(stage->stageno) >> 1;
    merge.pred_miss_exec[logical_id].pred_miss_glob_exec =
        miss_next.next_in_stage(stage->stageno + 1);
    merge.pred_miss_long_brch[logical_id] = miss_next.long_branch_tags();
}

// Write the JBay-specific MAU match-table registers (predication / thread / branch
// configuration).  `result` is the table holding the action formats (may be this table).
template <>
void MatchTable::write_regs(Target::JBay::mau_regs &regs, int type, Table *result) {
    write_common_regs<Target::JBay>(regs, type, result);
    // FIXME -- factor this with JBay GatewayTable::standalone_write_regs
    auto &merge = regs.rams.match.merge;
    if (gress == GHOST) merge.pred_ghost_thread |= 1 << logical_id;
    merge.pred_glob_exec_thread[gress] |= 1 << logical_id;
    if (always_run || pred.empty()) merge.pred_always_run[gress] |= 1 << logical_id;

    if (long_branch_input >= 0)
        setup_muxctl(merge.pred_long_brch_lt_src[logical_id], long_branch_input);

    if (result == nullptr) result = this;

    // A table "is a branch" if any successor (miss, gateway, hit, extra LUT) names
    // a real next table, or if the action format carries a wide "next" field.
    bool is_branch = (miss_next.next_table() != nullptr);
    if (!is_branch && gateway && gateway->is_branch()) is_branch = true;
    if (!is_branch)
        for (auto &n : hit_next)
            if (n.next_table() != nullptr) {
                is_branch = true;
                break;
            }
    if (!is_branch)
        for (auto &n : extra_next_lut)
            if (n.next_table() != nullptr) {
                is_branch = true;
                break;
            }

    if (!is_branch && result->get_format_field_size("next") > 3) is_branch = true;

    // Check if any table actions have a next table miss set up
    // if yes, the pred_is_a_brch register must be set on the table to override the next table
    // configuration with this value.
    //
    // E.g.
    //  switch (mc_filter.apply().action_run) {
    //    NoAction : { // Has @defaultonly
    //      ttl_thr_check.apply();
    //    }
    //  }
    //
    // Generated bfa
    //   ...
    //   hit: [  END  ]
    //   miss:  END
    //   ...
    //   NoAction(-1, 1):
    //   - hit_allowed: { allowed: false, reason: user_indicated_default_only  }
    //   - default_only_action: { allowed: true, is_constant: true  }
    //   - handle: 0x20000015
    //   - next_table_miss:  ttl_thr_check_0
    //
    // If merge.pred_is_a_brch is not set in this usecase, the default miss configuration of 'END'
    // or 'End of Pipe' is executed and next table ttl_thr_check_0 will not be executed.
    if (!is_branch) {
        for (auto &act : *result->actions) {
            if (act.next_table_miss_ref.next_table()) {
                is_branch = true;
                break;
            }
        }
    }

    if (is_branch) merge.pred_is_a_brch |= 1 << logical_id;

    // Egress-only logical tables (not ingress, not ghost) participate in MPR global exec.
    merge.mpr_glob_exec_thread |= merge.logical_table_thread[0].logical_table_thread_egress &
                                  ~merge.logical_table_thread[0].logical_table_thread_ingress &
                                  ~merge.pred_ghost_thread;
}
// ---- patch continues: backends/tofino/bf-asm/jbay/meter.h (Apache-2.0 license header) ----
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JBAY_METER_H_ +#define BACKENDS_TOFINO_BF_ASM_JBAY_METER_H_ + +template +void MeterTable::setup_teop_regs_2(REGS ®s, int meter_group_index) { + BUG_CHECK(teop >= 0 && teop < 4); + BUG_CHECK(gress == EGRESS); + + auto &adrdist = regs.rams.match.adrdist; + if (!teop_initialized) { + // assume this stage driving teop + auto delay = stage->pipelength(gress) - stage->pred_cycle(gress) - 7; + adrdist.teop_bus_ctl[teop].teop_bus_ctl_delay = delay; + adrdist.teop_bus_ctl[teop].teop_bus_ctl_delay_en = 1; + adrdist.teop_bus_ctl[teop].teop_bus_ctl_meter_en = 1; + + adrdist.meter_to_teop_adr_oxbar_ctl[teop].enabled_2bit_muxctl_select = meter_group_index; + adrdist.meter_to_teop_adr_oxbar_ctl[teop].enabled_2bit_muxctl_enable = 1; + teop_initialized = true; + } + + adrdist.teop_to_meter_adr_oxbar_ctl[meter_group_index].enabled_2bit_muxctl_select = teop; + adrdist.teop_to_meter_adr_oxbar_ctl[meter_group_index].enabled_2bit_muxctl_enable = 1; + + // count all tEOP events + adrdist.dp_teop_meter_ctl[meter_group_index].dp_teop_meter_ctl_err = 0; + // Refer to JBAY uArch Section 6.4.4.10.8 + // + // The user of the incoming tEOP address needs to consider the original + // driver. For instance, a meter address driver will be aliged with the LSB + // of the 18b incoming address, whereas a single-entry stats driver will be + // already padded with 2 zeros. + // + // For example, dp_teop_meter_ctl.dp_teop_meter_ctl_rx_shift must be + // programmed to 2 to compensate for the single-entry stats address driver: + // + // Meter (23b) = {4b CMD+Color, ((dp_teop{6b VPN, 10b addr, 2b subword + // zeros} >> 2) + 7b zero pad)} + // + // As per above, the dp_teop_meter_ctl_rx_shift is set based on the original + // driver. For a meter address driving there is no need for any shift, + // however if a stats address is driving then it needs to be shifted by 2. 
+ // Compiler currently does not use this mechanism where a stats address is + // driving the meter, this is scope for optimization in future. + adrdist.dp_teop_meter_ctl[meter_group_index].dp_teop_meter_ctl_rx_shift = 0; + adrdist.dp_teop_meter_ctl[meter_group_index].dp_teop_meter_ctl_rx_en = 1; + + auto &meter = regs.rams.map_alu.meter_group[meter_group_index].meter; + meter.meter_ctl_teop_en = 1; +} + +template +void MeterTable::write_alu_vpn_range_2(REGS ®s) { + auto &adrdist = regs.rams.match.adrdist; + int minvpn, sparevpn; + + // Used to validate the BFA VPN configuration + std::set vpn_processed; + bitvec vpn_range; + + // Get Spare VPN + layout_vpn_bounds(minvpn, sparevpn, false); + + for (int home_row : home_rows) { + bool block_start = false; + bool block_end = false; + int min = 1000000; + int max = -1; + for (Layout &logical_row : layout) { + // Block Start with the home row and End with the Spare VPN + if (logical_row.row == home_row) block_start = true; + + if (block_start) { + for (auto v : logical_row.vpns) { + if (v == sparevpn) { + block_end = true; + break; + } + if (vpn_processed.count(v)) + error(home_lineno, "Multiple instance of the VPN %d detected", v); + else + vpn_processed.insert(v); + + if (v < min) min = v; + if (v > max) max = v; + } + } + if (block_end) { + BUG_CHECK(min != 1000000 && max != -1); + + bitvec block_range(min, max - min + 1); + if (vpn_range.intersects(block_range)) + error(home_lineno, "Overlapping of VPN range detected"); + else + vpn_range |= block_range; + + adrdist.mau_meter_alu_vpn_range[home_row / 4].meter_vpn_base = min; + adrdist.mau_meter_alu_vpn_range[home_row / 4].meter_vpn_limit = max; + adrdist.mau_meter_alu_vpn_range[home_row / 4].meter_vpn_range_check_enable = 1; + for (MatchTable *m : match_tables) + adrdist.meter_alu_adr_range_check_icxbar_map[home_row / 4] |= 1U + << m->logical_id; + break; + } + } + BUG_CHECK(block_start && block_end); + } + + if (vpn_range != bitvec(minvpn, sparevpn - minvpn)) + 
error(home_lineno, "VPN range not entirely covered"); +} + +template <> +void MeterTable::setup_teop_regs(Target::JBay::mau_regs ®s, int meter_group_index) { + setup_teop_regs_2(regs, meter_group_index); +} + +template <> +void MeterTable::write_alu_vpn_range(Target::JBay::mau_regs ®s) { + write_alu_vpn_range_2(regs); +} + +#endif /* BACKENDS_TOFINO_BF_ASM_JBAY_METER_H_ */ diff --git a/backends/tofino/bf-asm/jbay/parser.cpp b/backends/tofino/bf-asm/jbay/parser.cpp new file mode 100644 index 00000000000..fb5b53caa17 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/parser.cpp @@ -0,0 +1,583 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/parser-tofino-jbay.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/top_level.h" + +template <> +void Parser::Checksum::write_config(Target::JBay::parser_regs ®s, Parser *parser) { + if (unit == 0) + write_row_config(regs.memory[gress].po_csum_ctrl_0_row[addr]); + else if (unit == 1) + write_row_config(regs.memory[gress].po_csum_ctrl_1_row[addr]); + else if (unit == 2) + write_row_config(regs.memory[gress].po_csum_ctrl_2_row[addr]); + else if (unit == 3) + write_row_config(regs.memory[gress].po_csum_ctrl_3_row[addr]); + else if (unit == 4) + write_row_config(regs.memory[gress].po_csum_ctrl_4_row[addr]); + else + error(lineno, "invalid unit for parser checksum"); +} + +template <> +void Parser::Checksum::write_output_config(Target::JBay::parser_regs ®s, Parser *pa, + State::Match * /*ma*/, void *_row, + unsigned &used) const { + if (type != 0 || !dest) return; + + Target::JBay::parser_regs::_memory::_po_action_row *row = + (Target::JBay::parser_regs::_memory::_po_action_row *)_row; + + // checksum verification outputs "steal" extractors, see parser uArch (6.3.6) + + for (int i = 0; i < 20; ++i) { + if (used & (1 << i)) continue; + used |= 1 << i; + row->phv_dst[i] = dest->reg.parser_id(); + row->extract_type[i] = 3; + return; + } + error(lineno, "Ran out of phv output extractor slots"); +} + +template <> +void Parser::CounterInit::write_config(Target::JBay::parser_regs ®s, gress_t gress, int idx) { + auto &ctr_init_ram = regs.memory[gress].ml_ctr_init_ram[idx]; + ctr_init_ram.add = add; + ctr_init_ram.mask_8 = mask; + ctr_init_ram.rotate = rot; + ctr_init_ram.max = max; + ctr_init_ram.src = src; +} + +template <> +void Parser::State::Match::write_lookup_config(Target::JBay::parser_regs ®s, State *state, + int r) const { + auto &row = regs.memory[state->gress].ml_tcam_row[r]; + match_t lookup = {0, 0}; + unsigned dont_care = 0; + for (int i = 0; i < 
4; i++) { + lookup.word0 <<= 8; + lookup.word1 <<= 8; + dont_care <<= 8; + if (state->key.data[i].bit >= 0) { + lookup.word0 |= ((match.word0 >> state->key.data[i].bit) & 0xff); + lookup.word1 |= ((match.word1 >> state->key.data[i].bit) & 0xff); + } else { + dont_care |= 0xff; + } + } + lookup.word0 |= dont_care; + lookup.word1 |= dont_care; + for (int i = 3; i >= 0; i--) { + row.w0_lookup_8[i] = lookup.word0 & 0xff; + row.w1_lookup_8[i] = lookup.word1 & 0xff; + lookup.word0 >>= 8; + lookup.word1 >>= 8; + } + row.w0_curr_state = state->stateno.word0; + row.w1_curr_state = state->stateno.word1; + if (state->key.ctr_zero >= 0) { + row.w0_ctr_zero = (match.word0 >> state->key.ctr_zero) & 1; + row.w1_ctr_zero = (match.word1 >> state->key.ctr_zero) & 1; + } else { + row.w0_ctr_zero = row.w1_ctr_zero = 1; + } + if (state->key.ctr_neg >= 0) { + row.w0_ctr_neg = (match.word0 >> state->key.ctr_neg) & 1; + row.w1_ctr_neg = (match.word1 >> state->key.ctr_neg) & 1; + } else { + row.w0_ctr_neg = row.w1_ctr_neg = 1; + } + row.w0_ver_0 = row.w1_ver_0 = 1; + row.w0_ver_1 = row.w1_ver_1 = 1; +} + +/* FIXME -- combine these next two methods into a single method on MatchKey */ +/* FIXME -- factor Tofino/JBay variation better (most is common) */ +template <> +int Parser::State::write_lookup_config(Target::JBay::parser_regs ®s, Parser *pa, State *state, + int row, const std::vector &prev) { + LOG2("-- checking match from state " << name << " (" << stateno << ')'); + auto &ea_row = regs.memory[gress].ml_ea_row[row]; + int max_off = -1; + for (int i = 0; i < 4; i++) { + if (key.data[i].bit < 0) continue; + bool set = true; + for (State *p : prev) { + if (p->key.data[i].bit >= 0) { + set = false; + if (p->key.data[i].byte != key.data[i].byte) + error(p->lineno, + "Incompatible match fields between states " + "%s and %s, triggered from state %s", + name.c_str(), p->name.c_str(), state->name.c_str()); + } + } + if (set && key.data[i].byte != MatchKey::USE_SAVED) { + int off = 
key.data[i].byte + ea_row.shift_amt; + // Valid offset ranges: + // 0..31 : Input packet + // 60..63 : Scratch registers + if ((off < 0) || ((off > 31) && (off < 60)) || (off > 63)) { + error(key.lineno, + "Match offset of %d in state %s out of range " + "for previous state %s", + key.data[i].byte, name.c_str(), state->name.c_str()); + } + ea_row.lookup_offset_8[i] = off; + ea_row.ld_lookup_8[i] = 1; + max_off = std::max(max_off, off); + } + } + return max_off; +} + +template <> +int Parser::State::Match::write_load_config(Target::JBay::parser_regs ®s, Parser *pa, + State *state, int row) const { + auto &ea_row = regs.memory[state->gress].ml_ea_row[row]; + int max_off = -1; + for (int i = 0; i < 4; i++) { + if (load.data[i].bit < 0) continue; + if (load.data[i].byte != MatchKey::USE_SAVED) { + int off = load.data[i].byte; + // Valid offset ranges: + // 0..31 : Input packet + // 60..63 : Scratch registers + if ((off < 0) || ((off > 31) && (off < 60)) || (off > 63)) { + error(load.lineno, "Load offset of %d in state %s out of range", load.data[i].byte, + state->name.c_str()); + } + ea_row.lookup_offset_8[i] = off; + ea_row.ld_lookup_8[i] = 1; + max_off = std::max(max_off, off); + } + ea_row.sv_lookup_8[i] = (load.save >> i) & 1; + } + + return max_off; +} + +static void write_output_slot(int lineno, Target::JBay::parser_regs::_memory::_po_action_row *row, + unsigned &used, int src, int dest, int bytemask, bool offset) { + BUG_CHECK(bytemask > 0 && bytemask < 4); + for (int i = 0; i < 20; ++i) { + if (used & (1 << i)) continue; + row->phv_dst[i] = dest; + row->phv_src[i] = src; + if (offset) row->phv_offset_add_dst[i] = 1; + row->extract_type[i] = bytemask; + used |= 1 << i; + return; + } + error(lineno, "Ran out of phv output slots"); +} + +template <> +void Parser::State::Match::write_row_config(Target::JBay::parser_regs ®s, Parser *pa, + State *state, int row, Match *def, + json::map &ctxt_json) { + write_common_row_config(regs, pa, state, row, def, ctxt_json); + 
auto &action_row = regs.memory[state->gress].po_action_row[row]; + + if (disable_partial_hdr_err > 0) action_row.disable_partial_hdr_err = 1; +} + +template <> +int Parser::State::Match::Save::write_output_config(Target::JBay::parser_regs ®s, void *_row, + unsigned &used, int, int) const { + Target::JBay::parser_regs::_memory::_po_action_row *row = + (Target::JBay::parser_regs::_memory::_po_action_row *)_row; + int dest = where->reg.parser_id(); + int mask = (1 << (1 + where->hi / 8U)) - (1 << (where->lo / 8U)); + int lo = this->lo; + // 8b containers are paired in 16b chunks in the parser + // If we're extracting to the upper half of a chunk (the odd 8b register) then + // adjust the extract type to be to the upper half + if (where->reg.size == 8 && mask == 1) { + if (where->reg.index & 1) { + mask <<= 1; + } + } + if (flags & ROTATE) error(where.lineno, "no rotate support in Tofino2"); + + // All containers are 16b in the parser. 32b container extracts are implemented as + // a pair of 16b extracts. + int bytemask = (mask >> 2) & 3; + if (bytemask) { + write_output_slot(where.lineno, row, used, lo, dest + 1, bytemask, flags & OFFSET); + lo += bitcount(mask & 0xc); + } + + bytemask = mask & 3; + if (bytemask) write_output_slot(where.lineno, row, used, lo, dest, bytemask, flags & OFFSET); + return hi; +} + +#define SAVE_ONLY_USED_SLOTS 0xffc00 +static void write_output_const_slot(int lineno, + Target::JBay::parser_regs::_memory::_po_action_row *row, + unsigned &used, unsigned src, int dest, int bytemask, + int flags) { + // use bits 24..27 of 'used' to track the two constant slots + BUG_CHECK(bytemask > 0 && bytemask < 4); + BUG_CHECK((src & ~((0xffff00ff >> (8 * (bytemask - 1))) & 0xffff)) == 0); + // FIXME -- should be able to treat this as 4x8-bit rather than 2x16-bit slots, as long + // as the ROTATE flag is consistent for each half. 
+ int cslot = -1; + // see if const already allocated and reuse + for (cslot = 0; cslot < 2; cslot++) + if (row->val_const[cslot] == src && (used & (bytemask << (2 * cslot + 24)))) break; + if (cslot >= 2) { + for (cslot = 0; cslot < 2; cslot++) + if (0 == (used & (bytemask << (2 * cslot + 24)))) break; + } + if (cslot >= 2) { + error(lineno, "Ran out of constant output slots"); + return; + } + row->val_const[cslot] |= src; + if (flags & 2 /*ROTATE*/) row->val_const_rot[cslot] = 1; + used |= bytemask << (2 * cslot + 24); + unsigned tmpused = used | SAVE_ONLY_USED_SLOTS; + write_output_slot(lineno, row, tmpused, 62 - 2 * cslot + (bytemask == 1), dest, bytemask, + flags & 1 /*OFFSET*/); + used |= tmpused & ~SAVE_ONLY_USED_SLOTS; +} + +template <> +void Parser::State::Match::Set::write_output_config(Target::JBay::parser_regs ®s, void *_row, + unsigned &used, int, int) const { + Target::JBay::parser_regs::_memory::_po_action_row *row = + (Target::JBay::parser_regs::_memory::_po_action_row *)_row; + int dest = where->reg.parser_id(); + int mask = (1 << (1 + where->hi / 8U)) - (1 << (where->lo / 8U)); + unsigned what = this->what << where->lo; + // Trim the bytes to be written, unless the value is being rotated + if (what && !(flags & ROTATE)) { + for (unsigned i = 0; i < 4; ++i) + if (((what >> (8 * i)) & 0xff) == 0) mask &= ~(1 << i); + } + if (where->reg.size == 8) { + BUG_CHECK((mask & ~1) == 0); + if (where->reg.index & 1) { + mask <<= 1; + what <<= 8; + } + } + if (mask & 3) + write_output_const_slot(where.lineno, row, used, what & 0xffff, dest, mask & 3, flags); + if (mask & 0xc) { + write_output_const_slot(where.lineno, row, used, (what >> 16) & 0xffff, dest + 1, + (mask >> 2) & 3, flags); + if ((mask & 3) && (flags & ROTATE)) row->val_const_32b_bond = 1; + } +} + +/* Tofino2 has a simple uniform array of 20 extractors, so doesn't really need an output + * map to track them. 
Constants 'sets' are handled by having 4 bytes of data that is set + * per row and extrated from the input buffer like a 'save', except only the first 10 + * extractors can access them. So `output_map` ends up being just a pointer to the + * register object for the row, and `used` is a 24-bit bitmap, tracking the 20 extractors + * and the 4 constant bytes. + */ +template <> +void *Parser::setup_phv_output_map(Target::JBay::parser_regs ®s, gress_t gress, int row) { + return ®s.memory[gress].po_action_row[row]; +} +template <> +void Parser::mark_unused_output_map(Target::JBay::parser_regs & /*regs*/, void * /*map*/, + unsigned /*used*/) { + // unneeded on jbay +} + +template <> +void Parser::State::Match::HdrLenIncStop::write_config( + JBay::memories_parser_::_po_action_row &po_row) const { + po_row.hdr_len_inc_stop = 1; + po_row.hdr_len_inc_final_amt = final_amt; +} + +template <> +void Parser::State::Match::Clot::write_config(JBay::memories_parser_::_po_action_row &po_row, + int idx, bool offset_add) const { + po_row.clot_tag[idx] = tag; + po_row.clot_offset[idx] = start; + if (load_length) { + po_row.clot_type[idx] = 1; + po_row.clot_len_src[idx] = length; + po_row.clot_en_len_shr[idx] = length_shift; + // po_row.clot_len_mask[idx] = length_mask; -- FIXME -- CSR reg commented out + } else { + po_row.clot_len_src[idx] = length - 1; + po_row.clot_type[idx] = 0; + po_row.clot_en_len_shr[idx] = 1; + } + po_row.clot_has_csum[idx] = csum_unit > 0; + po_row.clot_tag_offset_add[idx] = offset_add; +} + +template <> +void Parser::State::Match::write_counter_config( + Target::JBay::parser_regs::_memory::_ml_ea_row &ea_row) const { + if (ctr_load) { + switch (ctr_ld_src) { + case 0: + ea_row.ctr_op = 2; + break; + case 1: + ea_row.ctr_op = 3; + break; + default: + error(lineno, "Unsupported parser counter load instruction (JBay)"); + } + } else if (ctr_stack_pop) { + ea_row.ctr_op = 1; + } else { // add + ea_row.ctr_op = 0; + } + + ea_row.ctr_amt_idx = ctr_instr ? 
ctr_instr->addr : ctr_imm_amt; + ea_row.ctr_stack_push = ctr_stack_push; + ea_row.ctr_stack_upd_w_top = ctr_stack_upd_w_top; +} + +// Workaround for JBAY-2717: parser counter adds RAM index or immediate value +// to the pushed value when doing push + update_w_top. To cancel this offset, +// we subtract the amount on pop. +void jbay2717_workaround(Parser *parser, Target::JBay::parser_regs ®s) { + for (auto &kv : parser->match_to_row) { + if (kv.first->ctr_stack_pop) { + for (auto p : kv.first->get_all_preds()) { + if (p->ctr_stack_push && p->ctr_stack_upd_w_top) { + auto &ea_row = regs.memory[parser->gress].ml_ea_row[kv.second]; + auto adjust = p->ctr_instr ? p->ctr_instr->addr : p->ctr_imm_amt; + ea_row.ctr_amt_idx = ea_row.ctr_amt_idx.value - adjust; + break; + } + } + } + } +} + +template <> +void Parser::write_config(Target::JBay::parser_regs ®s, json::map &ctxt_json, + bool single_parser) { + if (single_parser) { + for (auto st : all) + st->write_config(regs, this, ctxt_json[st->gress == EGRESS ? 
"egress" : "ingress"]); + } else { + ctxt_json["states"] = json::vector(); + for (auto st : all) st->write_config(regs, this, ctxt_json["states"]); + } + if (error_count > 0) return; + + jbay2717_workaround(this, regs); + + int i = 0; + for (auto ctr : counter_init) { + if (ctr) ctr->write_config(regs, gress, i); + ++i; + } + + for (i = 0; i < checksum_use.size(); i++) { + for (auto csum : checksum_use[i]) { + if (csum) { + csum->write_config(regs, this); + if (csum->dest) phv_use[csum->gress].setbit(csum->dest->reg.uid); + } + } + } + + if (gress == EGRESS) { + regs.egress.epbreg.chan0_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan1_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan2_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan3_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan4_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan5_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan6_group.chnl_ctrl.meta_opt = meta_opt; + regs.egress.epbreg.chan7_group.chnl_ctrl.meta_opt = meta_opt; + } + + // All phvs used globaly by egress and not by ingress parser should be owned by + // egress parser so they get zeroed properly in the parser + phv_use[EGRESS] |= remove_nonparser(Phv::use(EGRESS)) - expand_parser_groups(phv_use[INGRESS]); + + setup_jbay_ownership(phv_use, regs.merge.ul.phv_owner_127_0.owner, + regs.merge.ur.phv_owner_255_128.owner, regs.main[INGRESS].phv_owner.owner, + regs.main[EGRESS].phv_owner.owner); + + setup_jbay_clear_on_write(phv_allow_clear_on_write, regs.merge.ul.phv_clr_on_wr_127_0.clr, + regs.merge.ur.phv_clr_on_wr_255_128.clr, + regs.main[INGRESS].phv_clr_on_wr.clr, + regs.main[EGRESS].phv_clr_on_wr.clr); + + setup_jbay_no_multi_write(phv_allow_bitwise_or, phv_allow_clear_on_write, + regs.main[INGRESS].no_multi_wr.nmw, + regs.main[EGRESS].no_multi_wr.nmw); + + regs.main[gress].hdr_len_adj.amt = hdr_len_adj; + + if (parser_error.lineno >= 0) { + for (auto i : {0, 1}) { + 
regs.main[gress].err_phv_cfg[i].en = 1; + regs.main[gress].err_phv_cfg[i].dst = parser_error->reg.parser_id(); + regs.main[gress].err_phv_cfg[i].no_tcam_match_err_en = 1; + regs.main[gress].err_phv_cfg[i].partial_hdr_err_en = 1; + regs.main[gress].err_phv_cfg[i].ctr_range_err_en = 1; + regs.main[gress].err_phv_cfg[i].timeout_iter_err_en = 1; + regs.main[gress].err_phv_cfg[i].timeout_cycle_err_en = 1; + regs.main[gress].err_phv_cfg[i].src_ext_err_en = 1; + regs.main[gress].err_phv_cfg[i].phv_owner_err_en = 1; + regs.main[gress].err_phv_cfg[i].multi_wr_err_en = 1; + regs.main[gress].err_phv_cfg[i].aram_mbe_en = 1; + regs.main[gress].err_phv_cfg[i].fcs_err_en = 1; + regs.main[gress].err_phv_cfg[i].csum_mbe_en = 1; + } + } else { + // en has a reset value of 1 and that is why we have to explicitly disable it + // otherwise dst will assume default value of 0 + for (auto i : {0, 1}) regs.main[gress].err_phv_cfg[i].en = 0; + } + + int i_start = Stage::first_table(INGRESS) & 0x1ff; + for (auto ® : regs.merge.ll1.i_start_table) reg.table = i_start; + + int e_start = Stage::first_table(EGRESS) & 0x1ff; + for (auto ® : regs.merge.lr1.e_start_table) reg.table = e_start; + + regs.merge.lr1.g_start_table.table = Stage::first_table(GHOST) & 0x1ff; + if (ghost_parser.size()) { + // tm_status_phv sets the location for ghost intrinsic metadata in the + // parser + // The parser carves up the 4k of PHV it sees into 256 x 16b containers. + // (32b MAU containers map to a pair of parser 16b containers, and 2 x + // 8b MAU containers map to a single parser 16b container.) PHV + // specified here will take the two containers at address + // { PHV & 0xfe, PHV & 0xfe + 1 }. + // Hence, ghost intrinsic metadata is assumed to be allocated in a + // contiguous 32b location. 
+ regs.merge.lr1.tm_status_phv.en = 1; + regs.merge.lr1.tm_status_phv.phv = ghost_parser[0]->reg.parser_id(); + if (ghost_pipe_mask != 0xf) // if not default set given value + regs.merge.lr1.tm_status_phv.pipe_mask = ghost_pipe_mask; + } + + if (gress == INGRESS) { + for (auto &ref : regs.ingress.prsr) + ref.set("regs.parser.main.ingress", ®s.main[INGRESS]); + } + if (gress == EGRESS) { + for (auto &ref : regs.egress.prsr) ref.set("regs.parser.main.egress", ®s.main[EGRESS]); + } + if (error_count == 0) { + if (options.condense_json) { + // FIXME -- removing the uninitialized memory causes problems? + // FIXME -- walle gets the addresses wrong. Might also require explicit + // FIXME -- zeroing in the driver on real hardware + // regs.memory[INGRESS].disable_if_reset_value(); + // regs.memory[EGRESS].disable_if_reset_value(); + // regs.ingress.disable_if_reset_value(); + // regs.egress.disable_if_reset_value(); + regs.main[INGRESS].disable_if_reset_value(); + regs.main[EGRESS].disable_if_reset_value(); + regs.merge.disable_if_reset_value(); + } + if (options.gen_json) { + if (single_parser) { + regs.memory[INGRESS].emit_json(*open_output("memories.parser.ingress.cfg.json"), + "ingress"); + regs.memory[EGRESS].emit_json(*open_output("memories.parser.egress.cfg.json"), + "egress"); + regs.ingress.emit_json(*open_output("regs.parser.ingress.cfg.json")); + regs.egress.emit_json(*open_output("regs.parser.egress.cfg.json")); + regs.main[INGRESS].emit_json(*open_output("regs.parser.main.ingress.cfg.json"), + "ingress"); + regs.main[EGRESS].emit_json(*open_output("regs.parser.main.egress.cfg.json"), + "egress"); + regs.merge.emit_json(*open_output("regs.parse_merge.cfg.json")); + } else { + regs.memory[INGRESS].emit_json( + *open_output("memories.parser.ingress.%02x.cfg.json", parser_no), "ingress"); + regs.memory[EGRESS].emit_json( + *open_output("memories.parser.egress.%02x.cfg.json", parser_no), "egress"); + regs.ingress.emit_json( + 
*open_output("regs.parser.ingress.%02x.cfg.json", parser_no)); + regs.egress.emit_json(*open_output("regs.parser.egress.%02x.cfg.json", parser_no)); + regs.main[INGRESS].emit_json(*open_output("regs.parser.main.ingress.cfg.json"), + "ingress"); + regs.main[EGRESS].emit_json(*open_output("regs.parser.main.egress.cfg.json"), + "egress"); + regs.merge.emit_json(*open_output("regs.parse_merge.cfg.json")); + } + } + } + + /* multiple JBay parser mem blocks can respond to same address range to allow programming + * the device with a single write operation. See: pardereg.pgstnreg.ibprsr4reg.prsr.mem_ctrl */ + if (gress == INGRESS) { + for (unsigned i = 0; i < TopLevel::regs()->mem_pipe.parde.i_prsr_mem.size(); + options.singlewrite ? i += 4 : i += 1) { + TopLevel::regs()->mem_pipe.parde.i_prsr_mem[i].set( + "memories.parser.ingress", ®s.memory[INGRESS]); + } + } + + if (gress == EGRESS) { + for (unsigned i = 0; i < TopLevel::regs()->mem_pipe.parde.e_prsr_mem.size(); + options.singlewrite ? i += 4 : i += 1) { + TopLevel::regs()->mem_pipe.parde.e_prsr_mem[i].set( + "memories.parser.egress", ®s.memory[EGRESS]); + } + } + + if (gress == INGRESS) { + for (auto &ref : TopLevel::regs()->reg_pipe.pardereg.pgstnreg.ipbprsr4reg) + ref.set("regs.parser.ingress", ®s.ingress); + } + + if (gress == EGRESS) { + for (auto &ref : TopLevel::regs()->reg_pipe.pardereg.pgstnreg.epbprsr4reg) + ref.set("regs.parser.egress", ®s.egress); + } + TopLevel::regs()->reg_pipe.pardereg.pgstnreg.pmergereg.set("regs.parse_merge", + ®s.merge); +} + +template <> +void Parser::gen_configuration_cache(Target::JBay::parser_regs ®s, json::vector &cfg_cache) { + std::string reg_fqname; + std::string reg_name; + std::string reg_value_str; + unsigned reg_width = 13; + + /* Publishing meta_opt field for chnl_ctrl register */ + /* Are ovr_pipeid, chnl_clean, init_dprsr_credit, init_ebuf_credit always handled by the + * driver? 
+ */ + for (int i = 0; i < 9; i++) { + reg_fqname = "pardereg.pgstnreg.epbprsr4reg[" + std::to_string(i) + + "].epbreg.chan0_group.chnl_ctrl.meta_opt"; + reg_name = "epb" + std::to_string(i) + "parser0_chnl_ctrl_0"; + reg_value_str = int_to_hex_string(meta_opt, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } +} diff --git a/backends/tofino/bf-asm/jbay/phv.cpp b/backends/tofino/bf-asm/jbay/phv.cpp new file mode 100644 index 00000000000..ba01b84ae00 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/phv.cpp @@ -0,0 +1,66 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/jbay/phv.h" + +void Target::JBay::Phv::init_regs(::Phv &phv) { + // Allocating JBay regs so the uids map to mau register encodings + static const struct { + char code[2]; + unsigned size, count; + } groups[] = {{"W", 32, 4}, {"B", 8, 4}, {"H", 16, 6}}; + static const struct { + char code[2]; + Register::type_t type; + unsigned count; + } types[] = {{"", Register::NORMAL, 12}, {"M", Register::MOCHA, 4}, {"D", Register::DARK, 4}}; + unsigned uid = 0; + unsigned byte = 0; + unsigned deparser_id = 0; + phv.regs.resize(280); + for (unsigned i = 0; i < sizeof groups / sizeof *groups; i++) { + unsigned idx[sizeof types / sizeof *types] = {0}; + for (unsigned j = 0; j < groups[i].count; j++) { + for (unsigned k = 0; k < sizeof types / sizeof *types; k++) { + for (unsigned l = 0; l < types[k].count; l++, idx[k]++, uid++) { + auto reg = new Register; + phv.regs[uid] = reg; + memset(reg->name, 0, sizeof(reg->name)); + snprintf(reg->name, sizeof(reg->name), "%.2s%.2s%d", types[k].code, + groups[i].code, idx[k]); + reg->type = types[k].type; + reg->index = idx[k]; + reg->uid = uid; + reg->size = groups[i].size; + if (reg->type == Register::DARK) { + reg->parser_id_ = reg->deparser_id_ = -1; + } else { + reg->parser_id_ = byte / 2U; + reg->deparser_id_ = deparser_id++; + byte += reg->size / 8U; + } + phv.names[INGRESS][reg->name][0].slice = ::Phv::Slice(*reg, 0, reg->size - 1); + phv.names[EGRESS][reg->name][0].slice = ::Phv::Slice(*reg, 0, reg->size - 1); + phv.names[GHOST][reg->name][0].slice = ::Phv::Slice(*reg, 0, reg->size - 1); + } + } + } + } + BUG_CHECK(uid == phv.regs.size()); + BUG_CHECK(deparser_id == 224); + BUG_CHECK(byte == 512); +} diff --git a/backends/tofino/bf-asm/jbay/phv.h b/backends/tofino/bf-asm/jbay/phv.h new file mode 100644 index 00000000000..92b7af5532a --- /dev/null +++ b/backends/tofino/bf-asm/jbay/phv.h @@ -0,0 +1,56 @@ +/** + * Copyright (C) 2024 Intel 
Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JBAY_PHV_H_ +#define BACKENDS_TOFINO_BF_ASM_JBAY_PHV_H_ + +#include "backends/tofino/bf-asm/phv.h" + +class Target::JBay::Phv : public Target::Phv { + friend class ::Phv; + struct Register : public ::Phv::Register { + short parser_id_, deparser_id_; + int parser_id() const override { return parser_id_; } + int mau_id() const override { return uid < 280 ? 
uid : -1; } + int ixbar_id() const override { + static const int ixbar_permute[16] = {0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, -6, -6, 0, 0}; + return deparser_id_ + ixbar_permute[deparser_id_ & 0xf]; + } + int deparser_id() const override { return deparser_id_; } + }; + void init_regs(::Phv &phv) override; + target_t type() const override { return JBAY; } + unsigned mau_groupsize() const override { return 20; } +}; + +class Target::Tofino2H::Phv : public Target::JBay::Phv { + target_t type() const override { return TOFINO2H; } +}; + +class Target::Tofino2M::Phv : public Target::JBay::Phv { + target_t type() const override { return TOFINO2M; } +}; + +class Target::Tofino2U::Phv : public Target::JBay::Phv { + target_t type() const override { return TOFINO2U; } +}; + +class Target::Tofino2A0::Phv : public Target::JBay::Phv { + target_t type() const override { return TOFINO2A0; } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_JBAY_PHV_H_ */ diff --git a/backends/tofino/bf-asm/jbay/salu_inst.cpp b/backends/tofino/bf-asm/jbay/salu_inst.cpp new file mode 100644 index 00000000000..f6d0ce90425 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/salu_inst.cpp @@ -0,0 +1,493 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* JBay template specializations for instructions #included in salu_inst.cpp + * WARNING -- this is included in an anonymous namespace, as these SaluInstruction + * subclasses are all defined in that anonymous namespace */ + +struct DivMod : public AluOP { + struct Decode : public AluOP::Decode { + Decode(const char *name, target_t targ, int opc) : AluOP::Decode(name, targ, opc) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override { + auto *rv = new DivMod(this, op[0].lineno); + if (op.size != 3) error(op[0].lineno, "divmod must have exactly 2 operands"); + if (op.size > 1) rv->srca = operand(tbl, act, op[1]); + if (op.size > 2) rv->srcb = operand(tbl, act, op[2]); + rv->dest = AluOP::HI; + rv->slot = ALU2HI; + return rv; + } + }; + DivMod(const Decode *op, int l) : AluOP(op, l) {} + + Instruction *pass1(Table *tbl, Table::Actions::Action *act) override { + tbl->stage->table_use[timing_thread(tbl->gress)] |= Stage::USE_STATEFUL_DIVIDE; + BUG_CHECK(tbl->to(), "stateful instruction on non-stateful table?"); + tbl->to()->divmod_used = true; + return AluOP::pass1(tbl, act); + } + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +// setz op, so can OR with alu1hi to get that result +DivMod::Decode opDIVMOD("divmod", JBAY, 0x00); + +void DivMod::write_regs(Target::Tofino::mau_regs &, Table *, Table::Actions::Action *) { BUG(); } +void DivMod::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + AluOP::write_regs(regs, tbl, act); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu_instr_common = meter_group.stateful.salu_instr_common[act->code]; + salu_instr_common.salu_divide_enable |= 1; +} + +struct MinMax : public SaluInstruction { + const struct Decode : public Instruction::Decode { + std::string name; + unsigned 
opcode; + Decode(const char *name, target_t targ, int op) + : Instruction::Decode(name, targ, STATEFUL_ALU), name(name), opcode(op) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + bool phv = false; // source is mem or phv + operand mask, postmod; + // constants for mask and postmod packed together + boost::optional constval = boost::none; + MinMax(const Decode *op, int l) : SaluInstruction(l), opc(op) {} + std::string name() override { return opc->name; }; + Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *tbl, Table::Actions::Action *) override; + bool salu_alu() const override { return true; } + bool equiv(Instruction *a_) override { + if (auto *a = dynamic_cast(a_)) + return opc == a->opc && phv == a->phv && mask == a->mask && postmod == a->postmod; + return false; + } + bool phvRead(std::function) override { return phv; } + void dbprint(std::ostream &out) const override { + out << "INSTR: " << opc->name << (phv ? 
"phv, " : "mem, ") << mask; + if (postmod) out << ", " << postmod; + } + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +MinMax::Decode opMIN8("min8", JBAY, 0), opMAX8("max8", JBAY, 1), opMIN16("min16", JBAY, 2), + opMAX16("max16", JBAY, 3); + +Instruction *MinMax::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + auto *rv = new MinMax(this, op[0].lineno); + if (op.size > 2) { + if (op[1] == "phv") + rv->phv = true; + else if (op[1] != "mem") + error(op[1].lineno, "%s source must be 'mem' or 'phv'", op[0].s); + rv->mask = operand(tbl, act, op[2]); + if (!rv->mask.to() && !rv->mask.to()) + error(op[1].lineno, "%s mask must be constant or from phv or hash_dist", op[0].s); + } else { + error(op[0].lineno, "%s must have a single mask operand", op[0].s); + } + if (op.size == 4) { + rv->postmod = operand(tbl, act, op[3]); + } else if (op.size > 4) { + error(op[0].lineno, "too many operands for %s", op[0].s); + } + rv->slot = MINMAX; + return rv; +} +Instruction *MinMax::pass1(Table *tbl_, Table::Actions::Action *act) { + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int mask_size = (opc->opcode & 2) ? 
8 : 16; + constval = boost::none; + mask->pass1(tbl); + act->minmax_use = true; + if (auto k = mask.to()) { + if (k->value < 0 || k->value >= (1U << mask_size) || mask.neg) + error(k->lineno, "%s mask value out of range", name().c_str()); + constval = k->value & ((1U << mask_size) - 1); + } else if (auto p = mask.to()) { + if (p->phv_index(tbl)) + error(lineno, "%s phv mask must come from the lower half input", name().c_str()); + } else { + error(mask->lineno, "%s invalid mask", name().c_str()); + } + if (postmod) { + if (auto k = postmod.to()) { + if (k->value < 0) { + k->value = -k->value; + postmod.neg = !postmod.neg; + } + if (k->value > 255) error(lineno, "%s post mod too large", name().c_str()); + constval = constval.get_value_or(0) | (k->value & 0xff) << mask_size; + } else if (auto p = postmod.to()) { + if (!p->phv_index(tbl)) + error(lineno, "%s phv post mod must come from the upper half input", + name().c_str()); + } else { + error(postmod->lineno, "%s invalid post mod", name().c_str()); + } + } + // We allocate the value here in order to report an error in pass1 if the capacity + // of the register file is exceeded. The next call in write_regs with the same value + // will return already allocated register file row index. 
+ if (constval) tbl->get_const(lineno, *constval); + return this; +} +void MinMax::pass2(Table *tbl, Table::Actions::Action *act) { + if (act->slot_use.intersects(bitvec(ALU2LO, 4))) + error(lineno, "min/max requires all 4 stateful alu slots be unused"); +} +void MinMax::write_regs(Target::JBay::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu_instr_common = meter_group.stateful.salu_instr_common[act->code]; + if (auto k = mask.to()) { + salu_instr_common.salu_minmax_mask_ctl = 1; + } else { + salu_instr_common.salu_minmax_mask_ctl = 0; + } + salu_instr_common.salu_minmax_ctl = opc->opcode; + salu_instr_common.salu_minmax_enable = 1; + if (postmod) { + if (auto k = postmod.to()) { + salu_instr_common.salu_minmax_postmod_value_ctl = 0; + } else { + salu_instr_common.salu_minmax_postmod_value_ctl = 1; + } + if (postmod.neg) + salu_instr_common.salu_minmax_postdec_enable = 1; + else + salu_instr_common.salu_minmax_postinc_enable = 1; + } + if (constval) { + auto &salu_instr_cmp = meter_group.stateful.salu_instr_cmp_alu[act->code][3]; + salu_instr_cmp.salu_cmp_regfile_adr = tbl->get_const(lineno, *constval); + } + // salu_instr_common.salu_minmax_src_sel = phv; -- FIXME -- specify PHV source? 
+ for (auto &salu : meter_group.stateful.salu_instr_state_alu[act->code]) { + salu.salu_op = 0xd; + salu.salu_arith = 1; + salu.salu_pred = 0xffff; + } +} +void MinMax::write_regs(Target::Tofino::mau_regs &, Table *, Table::Actions::Action *) { BUG(); } + +template <> +void AluOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_state_alu[act->code][slot - ALU2LO]; + auto &salu_ext = meter_group.stateful.salu_instr2_state_alu[act->code][slot - ALU2LO]; + auto &salu_instr_common = meter_group.stateful.salu_instr_common[act->code]; + auto &salu_instr_output_alu = meter_group.stateful.salu_instr_output_alu[act->code]; + salu.salu_op = opc->opcode & 0xf; + salu.salu_arith = opc->opcode >> 4; + salu.salu_pred = predication_encode; + bool need_flyover = (tbl->format->size >> tbl->is_dual_mode()) > 32; + const int alu_const_min = Target::STATEFUL_ALU_CONST_MIN(); + const int alu_const_max = Target::STATEFUL_ALU_CONST_MAX(); + if (srca) { + if (auto m = srca.to()) { + salu.salu_asrc_input = m->field->bit(0) > 0 ? 1 : 0; + if (need_flyover) { + salu_ext.salu_flyover_src_sel = 1; + need_flyover = false; + } + } else if (auto f = srca.to()) { + salu.salu_asrc_input = f->phv_index(tbl) ? 
3 : 2; + if (need_flyover) { + salu_ext.salu_flyover_src_sel = 1; + need_flyover = false; + } + } else if (auto k = srca.to()) { + salu.salu_asrc_input = 4; + if (k->value >= alu_const_min && k->value <= alu_const_max) { + salu.salu_const_src = k->value & Target::STATEFUL_ALU_CONST_MASK(); + salu.salu_regfile_const = 0; + } else { + salu.salu_const_src = tbl->get_const(k->lineno, k->value); + salu.salu_regfile_const = 1; + } + } else if (auto r = srca.to()) { + salu.salu_asrc_input = 4; + salu.salu_const_src = r->index; + salu.salu_regfile_const = 1; + } else { + BUG(); + } + } + if (srcb) { + if (auto m = srcb.to()) { + salu.salu_bsrc_input = m->field->bit(0) > 0 ? 3 : 2; + if (need_flyover) { + salu_ext.salu_flyover_src_sel = 0; + need_flyover = false; + } + } else if (auto f = srcb.to()) { + salu.salu_bsrc_input = f->phv_index(tbl) ? 1 : 0; + if (need_flyover) { + salu_ext.salu_flyover_src_sel = 0; + need_flyover = false; + } + } else if (auto m = srcb.to()) { + salu_instr_common.salu_alu2_lo_bsrc_math = 1; + if (auto b = m->of.to()) { + salu_instr_common.salu_alu2_lo_math_src = b->phv_index(tbl); + } else if (auto b = m->of.to()) { + salu_instr_common.salu_alu2_lo_math_src = b->field->bit(0) > 0 ? 
3 : 2; + } else { + BUG(); + } + } else if (auto k = srcb.to()) { + salu.salu_bsrc_input = 4; + if (k->value >= alu_const_min && k->value <= alu_const_max) { + salu.salu_const_src = k->value & Target::STATEFUL_ALU_CONST_MASK(); + salu.salu_regfile_const = 0; + } else { + salu.salu_const_src = tbl->get_const(k->lineno, k->value); + salu.salu_regfile_const = 1; + } + } else if (auto r = srcb.to()) { + salu.salu_bsrc_input = 4; + salu.salu_const_src = r->index; + salu.salu_regfile_const = 1; + } else { + BUG(); + } + } +} +void AluOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +template <> +void BitOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + LOG2(this); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_state_alu[act->code][slot - ALU2LO]; + salu.salu_op = opc->opcode & 0xf; + salu.salu_pred = predication_encode; + // 1b instructions are from mem-lo to alu1-lo + salu.salu_asrc_input = 0; +} +void BitOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +static int sbus_mask(int alu, const std::vector &tbls) { + int rv = 0; + for (auto &tbl : tbls) { + int bit = tbl->layout[0].row / 4U; + if (bit > alu) --bit; + rv |= 1 << bit; + } + return rv; +} + +template <> +void CmpOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_cmp_alu[act->code][slot]; + auto &salu_instr_common = meter_group.stateful.salu_instr_common[act->code]; + if (srca) { + salu.salu_cmp_asrc_input = srca->field->bit(0) > 0; + 
salu.salu_cmp_asrc_sign = srca_neg; + salu.salu_cmp_asrc_enable = 1; + if (maska != uint32_t(-1)) { + salu.salu_cmp_asrc_mask_enable = 1; + auto cval = 0; + if (auto k = dynamic_cast(srcc)) + cval = k->value; + else if (auto r = dynamic_cast(srcc)) + cval = tbl->get_const_val(r->index); + int64_t min = Target::STATEFUL_CMP_CONST_MIN(); + int64_t max = Target::STATEFUL_CMP_CONST_MAX(); + bool maska_outside = (maska < uint32_t(min) && maska > max); + bool maska_equal_inside = (uint32_t(cval) != maska && cval >= min && cval <= max); + if (!maska_outside && !maska_equal_inside) { + salu.salu_cmp_const_src = maska & Target::STATEFUL_CMP_CONST_MASK(); + salu.salu_cmp_mask_input = 0; + } else { + salu.salu_cmp_regfile_adr = tbl->get_const(srca->lineno, maska); + salu.salu_cmp_mask_input = 1; + } + } + } + if (srcb) { + salu.salu_cmp_bsrc_input = srcb->phv_index(tbl); + salu.salu_cmp_bsrc_sign = srcb_neg; + salu.salu_cmp_bsrc_enable = 1; + if (maskb != uint32_t(-1)) { + // uarch 6.2.12.6.1, masks for operand a/b are sourced from the + // same regfile slot. + if (salu.salu_cmp_asrc_mask_enable && salu.salu_cmp_mask_input && maskb != maska) + error(lineno, "inconsistent operand mask %d and %d in salu compare operation", + maska, maskb); + salu.salu_cmp_bsrc_mask_enable = 1; + salu.salu_cmp_regfile_adr = tbl->get_const(srcb->lineno, maskb); + } + } + if (srcc) { + if (auto k = dynamic_cast(srcc)) { + const int cmp_const_min = Target::STATEFUL_CMP_CONST_MIN(); + const int cmp_const_max = Target::STATEFUL_CMP_CONST_MAX(); + if (k->value >= cmp_const_min && k->value <= cmp_const_max) { + salu.salu_cmp_const_src = k->value & Target::STATEFUL_CMP_CONST_MASK(); + salu.salu_cmp_regfile_const = 0; + } else { + // uarch 6.2.12.6.1, masks for operand a/b are sourced from the + // same regfile slot as operand c if c is a constant. 
+ if (salu.salu_cmp_asrc_mask_enable && salu.salu_cmp_mask_input && + maska != uint32_t(k->value)) + error(lineno, "inconsistent operand mask %d and %d in salu compare operation", + maska, uint32_t(k->value)); + if (salu.salu_cmp_bsrc_mask_enable && salu.salu_cmp_mask_input && + maskb != uint32_t(k->value)) + error(lineno, "inconsistent operand mask %d and %d in salu compare operation", + maskb, uint32_t(k->value)); + salu.salu_cmp_regfile_adr = tbl->get_const(srcc->lineno, k->value); + salu.salu_cmp_regfile_const = 1; + } + } else if (auto r = dynamic_cast(srcc)) { + salu.salu_cmp_regfile_adr = r->index; + salu.salu_cmp_regfile_const = 1; + } + } else { + salu.salu_cmp_const_src = 0; + salu.salu_cmp_regfile_const = 0; + } + salu.salu_cmp_opcode = opc->opcode | (type << 2); + auto lmask = sbus_mask(logical_home_row / 4U, tbl->sbus_learn); + auto mmask = sbus_mask(logical_home_row / 4U, tbl->sbus_match); + salu_instr_common.salu_lmatch_sbus_listen = lmask; + salu_instr_common.salu_match_sbus_listen = mmask; + salu_instr_common.salu_sbus_in_comb = tbl->sbus_comb; + if (lmask || mmask) { + // if lmask and mmask are both zero, these registers don't matter, but the model + // will assert if they are non-zero) + salu.salu_cmp_sbus_or = 0; + salu.salu_cmp_sbus_and = learn ? 1 : 0; + salu.salu_cmp_sbus_invert = learn_not ? 
1 : 0; + } +} +void CmpOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +template <> +void TMatchOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_cmp_alu[act->code][slot]; + auto &salu_tmatch = meter_group.stateful.salu_instr_tmatch_alu[act->code][slot]; + auto &salu_instr_common = meter_group.stateful.salu_instr_common[act->code]; + salu.salu_cmp_tmatch_enable = 1; + salu.salu_cmp_asrc_enable = 1; + salu.salu_cmp_bsrc_enable = 1; + meter_group.stateful.tmatch_mask[slot][0] = ~mask & 0xffffffffU; + meter_group.stateful.tmatch_mask[slot][1] = ~mask >> 32; + salu.salu_cmp_opcode = 2; + salu.salu_cmp_asrc_input = srca->field->bit(0) > 0; + salu.salu_cmp_bsrc_input = srcb->phv_index(tbl); + if (auto lmask = sbus_mask(logical_home_row / 4U, tbl->sbus_learn)) + salu_instr_common.salu_lmatch_sbus_listen = lmask; + if (auto mmask = sbus_mask(logical_home_row / 4U, tbl->sbus_match)) + salu_instr_common.salu_match_sbus_listen = mmask; + salu.salu_cmp_sbus_or = 0; + salu.salu_cmp_sbus_and = learn ? 1 : 0; + salu.salu_cmp_sbus_invert = learn_not ? 1 : 0; + // we set the learn output unconditionally if there's a tmatch -- should it be controllable? + salu_tmatch.salu_tmatch_vld_ctl = 1; + // salu_tmatch.salu_tmatch_invert = 0; -- when can this be useful? 
+} + +void TMatchOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +void OutOP::decode_output_mux(Target::JBay, Table *tbl, value_t &op) { + static const std::map ops_mux_lookup = { + {"mem_hi", 1}, {"mem_lo", 0}, {"memory_hi", 1}, {"memory_lo", 0}, {"phv_hi", 3}, + {"phv_lo", 2}, {"alu_hi", 5}, {"alu_lo", 4}, {"minmax_index", 5}, {"minmax_post", 4}, + {"predicate", 6}, {"address", 7}, {"div", 8}, {"mod", 9}, {"minmax", 10}}; + if (op.type == tCMD && ops_mux_lookup.count(op[0].s)) + output_mux = ops_mux_lookup.at(op[0].s); + else if (op.type == tSTR && ops_mux_lookup.count(op.s)) + output_mux = ops_mux_lookup.at(op.s); + else + output_mux = -1; + if (src) { + int tmp = output_mux; + if (auto *phv = src.to()) + output_mux = 2 + phv->phv_index(tbl->to()); + else if (auto *mem = src.to()) + output_mux = mem->field->bit(0) > 0 ? 1 : 0; + BUG_CHECK(tmp < 0 || tmp == output_mux, "inconsistent output mux decode"); + } +} +int OutOP::decode_output_option(Target::JBay, value_t &op) { + if (op == "lmatch") { + lmatch = true; + if (op.type == tCMD) + lmatch_pred = decode_predicate(op[1]); + else + lmatch_pred = STATEFUL_PREDICATION_ENCODE_UNCOND; + } else { + return -1; + } + return 0; +} + +template <> +void OutOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_output_alu[act->code][slot - ALUOUT0]; + if (predication_encode) { + salu.salu_output_cmpfn = predication_encode; + } else { + salu.salu_output_cmpfn = STATEFUL_PREDICATION_ENCODE_UNCOND; + } + salu.salu_output_asrc = output_mux; + if ((salu.salu_lmatch_adr_bit_enable = lmatch)) + meter_group.stateful.salu_mathtable[0] = lmatch_pred; + if (output_mux == STATEFUL_PREDICATION_OUTPUT) + 
meter_group.stateful.stateful_ctl.salu_output_pred_sel = slot - ALUOUT0; +} +void OutOP::write_regs(Target::JBay::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} diff --git a/backends/tofino/bf-asm/jbay/stage.cpp b/backends/tofino/bf-asm/jbay/stage.cpp new file mode 100644 index 00000000000..38f37c71b5e --- /dev/null +++ b/backends/tofino/bf-asm/jbay/stage.cpp @@ -0,0 +1,316 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* mau stage template specializations for jbay -- #included directly in top-level stage.cpp */ + +template <> +void Stage::gen_configuration_cache(Target::JBay::mau_regs ®s, json::vector &cfg_cache) { + Stage::gen_configuration_cache_common(regs, cfg_cache); + + static unsigned i_pdddelay; + static unsigned e_pdddelay; + unsigned reg_width = 8; // this means number of hex characters + std::string i_reg_value_str; + std::string e_reg_value_str; + std::string reg_fqname; + std::string reg_name; + unsigned reg_value; + std::string reg_value_str; + + if (stageno != 0) { + if (i_pdddelay > regs.cfg_regs.amod_pre_drain_delay[INGRESS]) + i_pdddelay = regs.cfg_regs.amod_pre_drain_delay[INGRESS]; + if (e_pdddelay > regs.cfg_regs.amod_pre_drain_delay[EGRESS]) + e_pdddelay = regs.cfg_regs.amod_pre_drain_delay[EGRESS]; + + if (stageno == AsmStage::numstages() - 1) { + // 64 is due to number of CSR's + i_pdddelay += (7 + 64); + i_reg_value_str = int_to_hex_string(i_pdddelay, reg_width); + e_pdddelay += (7 + 64); + e_reg_value_str = int_to_hex_string(e_pdddelay, reg_width); + + add_cfg_reg(cfg_cache, "pardereg.pgstnreg.parbreg.left.i_wb_ctrl", "left_i_wb_ctrl", + i_reg_value_str); + add_cfg_reg(cfg_cache, "pardereg.pgstnreg.parbreg.right.e_wb_ctrl", "right_e_wb_ctrl", + e_reg_value_str); + } + } + + // meter_ctl + auto &meter_ctl = regs.rams.map_alu.meter_group; + for (int i = 0; i < 4; i++) { + reg_fqname = "mau[" + std::to_string(stageno) + "].rams.map_alu.meter_group[" + + std::to_string(i) + "]" + ".meter.meter_ctl"; + reg_name = "stage_" + std::to_string(stageno) + "_meter_ctl_" + std::to_string(i); + reg_value = meter_ctl[i].meter.meter_ctl; + if ((reg_value != 0) || (options.match_compiler)) { + reg_value_str = int_to_hex_string(reg_value, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + } +} + +static void addvec(json::vector &vec, ubits_base &val, uint32_t extra = 0) { + 
vec.push_back(val | extra); +} +static void addvec(json::vector &vec, uint32_t val, uint32_t extra = 0) { + vec.push_back(val | extra); +} + +template +static void addvec(json::vector &vec, checked_array_base &array, uint32_t extra = 0) { + for (auto &el : array) addvec(vec, el, extra); +} + +template +static json::map make_reg_vec(REGS ®s, REG ®, const char *name, uint32_t mask0, + uint32_t mask1, uint32_t mask2, uint32_t extra = 0) { + json::map rv; + rv["name"] = name; + rv["offset"] = regs.binary_offset(®); + addvec(rv["value"], reg, extra); + rv["mask"] = json::vector{json::number(mask0), json::number(mask1), json::number(mask2)}; + return rv; +} + +template +void Stage::gen_mau_stage_extension(REGS ®s, json::map &extend) { + extend["last_programmed_stage"] = Target::NUM_MAU_STAGES() - 1; + json::vector ®isters = extend["registers"] = json::vector(); + registers.push_back(make_reg_vec(regs, regs.dp.phv_ingress_thread, "regs.dp.phv_ingress_thread", + 0, 0x3ff, 0x3ff)); + registers.push_back(make_reg_vec(regs, regs.dp.phv_ingress_thread_imem, + "regs.dp.phv_ingress_thread_imem", 0, 0x3ff, 0x3ff)); + registers.push_back(make_reg_vec(regs, regs.dp.phv_egress_thread, "regs.dp.phv_egress_thread", + 0, 0x3ff, 0x3ff)); + registers.push_back(make_reg_vec(regs, regs.dp.phv_egress_thread_imem, + "regs.dp.phv_egress_thread_imem", 0, 0x3ff, 0x3ff)); + registers.push_back(make_reg_vec(regs, regs.rams.match.adrdist.adr_dist_pipe_delay, + "regs.rams.match.adrdist.adr_dist_pipe_delay", 0, 0xf, 0xf)); + typename std::remove_reference< + decltype(regs.rams.match.adrdist.deferred_eop_bus_delay[0])>::type mask0, + mask1; + mask0.eop_delay_fifo_en = mask1.eop_delay_fifo_en = 1; + mask0.eop_internal_delay_fifo = mask1.eop_internal_delay_fifo = 0x1f; + mask0.eop_output_delay_fifo = 0x1; + mask1.eop_output_delay_fifo = 0x1f; + BUG_CHECK(regs.rams.match.adrdist.deferred_eop_bus_delay[0].eop_output_delay_fifo & + regs.rams.match.adrdist.deferred_eop_bus_delay[1].eop_output_delay_fifo & 
1); + registers.push_back(make_reg_vec(regs, regs.rams.match.adrdist.deferred_eop_bus_delay, + "regs.rams.match.adrdist.deferred_eop_bus_delay", mask0, mask0, + mask1)); + registers.push_back(make_reg_vec(regs, regs.dp.cur_stage_dependency_on_prev, + "regs.dp.cur_stage_dependency_on_prev", 0, 0x3, 0x3, 0x1)); + registers.push_back(make_reg_vec(regs, regs.dp.next_stage_dependency_on_cur, + "regs.dp.next_stage_dependency_on_cur", 0x3, 0x3, 0, 0x1)); + registers.push_back(make_reg_vec(regs, regs.rams.match.merge.mpr_bus_dep, + "regs.rams.match.merge.mpr_bus_dep", 0x3, 0x3, 0, 0x3)); + registers.push_back(make_reg_vec(regs, regs.dp.pipelength_added_stages, + "regs.dp.pipelength_added_stages", 0, 0xf, 0xf)); + registers.push_back(make_reg_vec(regs, regs.rams.match.merge.exact_match_delay_thread, + "regs.rams.match.merge.exact_match_delay_thread", 0, 0x3, + 0x3)); + BUG_CHECK((regs.rams.match.merge.mpr_thread_delay[0] & 1) == 0); + BUG_CHECK((regs.rams.match.merge.mpr_thread_delay[1] & 1) == 0); + registers.push_back(make_reg_vec(regs, regs.rams.match.merge.mpr_thread_delay, + "regs.rams.match.merge.mpr_thread_delay", 1, 1, 0x1f)); +} + +/* disable power gating configuration for specific MAU regs to weedout delay programming + * issues. 
We dont expect to call this function in the normal usage of JBay - this is + * only for emulator debug/bringup + */ +template +static void disable_jbay_power_gating(REGS ®s) { + for (gress_t gress : Range(INGRESS, EGRESS)) { + regs.dp.mau_match_input_xbar_exact_match_enable[gress] |= 0x1; + regs.dp.xbar_hash.xbar.mau_match_input_xbar_ternary_match_enable[gress] |= 0x1; + } + + auto &xbar_power_ctl = regs.dp.match_input_xbar_din_power_ctl; + auto &actionmux_power_ctl = regs.dp.actionmux_din_power_ctl; + for (int side = 0; side < 2; side++) { + for (int reg = 0; reg < 16; reg++) { + xbar_power_ctl[side][reg] |= 0x3FF; + actionmux_power_ctl[side][reg] |= 0x3FF; + } + } +} + +template <> +void Stage::write_regs(Target::JBay::mau_regs ®s, bool) { + write_common_regs(regs); + auto &merge = regs.rams.match.merge; + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (stageno == 0) { + merge.predication_ctl[gress].start_table_fifo_delay0 = pred_cycle(gress) - 2; + merge.predication_ctl[gress].start_table_fifo_enable = 1; + } else if (stage_dep[gress] == MATCH_DEP) { + merge.predication_ctl[gress].start_table_fifo_delay0 = + this[-1].pipelength(gress) - this[-1].pred_cycle(gress) + pred_cycle(gress) - 3; + merge.predication_ctl[gress].start_table_fifo_enable = 1; + } else { + BUG_CHECK(stage_dep[gress] == ACTION_DEP); + merge.predication_ctl[gress].start_table_fifo_delay0 = 0; + merge.predication_ctl[gress].start_table_fifo_enable = 0; + } + + if (stageno != 0) + regs.dp.cur_stage_dependency_on_prev[gress] = stage_dep[gress] != MATCH_DEP; + + /* set stage0 dependency if explicitly set by the commandline option */ + if (stageno == 0 && !options.stage_dependency_pattern.empty()) + regs.dp.cur_stage_dependency_on_prev[gress] = stage_dep[gress] != MATCH_DEP; + + if (stageno != AsmStage::numstages() - 1) + regs.dp.next_stage_dependency_on_cur[gress] = this[1].stage_dep[gress] != MATCH_DEP; + else if (AsmStage::numstages() < Target::NUM_MAU_STAGES()) + 
regs.dp.next_stage_dependency_on_cur[gress] = 1; + auto &deferred_eop_bus_delay = regs.rams.match.adrdist.deferred_eop_bus_delay[gress]; + deferred_eop_bus_delay.eop_internal_delay_fifo = pred_cycle(gress) + 2; + /* FIXME -- making this depend on the dependency of the next stage seems wrong */ + if (stageno == AsmStage::numstages() - 1) { + if (AsmStage::numstages() < Target::NUM_MAU_STAGES()) + deferred_eop_bus_delay.eop_output_delay_fifo = 1; + else + deferred_eop_bus_delay.eop_output_delay_fifo = pipelength(gress) - 2; + } else if (this[1].stage_dep[gress] == MATCH_DEP) { + deferred_eop_bus_delay.eop_output_delay_fifo = pipelength(gress) - 2; + } else { + deferred_eop_bus_delay.eop_output_delay_fifo = 1; + } + deferred_eop_bus_delay.eop_delay_fifo_en = 1; + if (stageno != AsmStage::numstages() - 1 && this[1].stage_dep[gress] == MATCH_DEP) { + merge.mpr_thread_delay[gress] = pipelength(gress) - pred_cycle(gress) - 4; + } else { + /* last stage in JBay must be always set as match-dependent on deparser */ + if (stageno == AsmStage::numstages() - 1) { + merge.mpr_thread_delay[gress] = pipelength(gress) - pred_cycle(gress) - 4; + } else { + merge.mpr_thread_delay[gress] = 0; + } + } + } + + for (gress_t gress : Range(INGRESS, EGRESS)) + if (table_use[gress] & USE_TCAM) + regs.tcams.tcam_piped |= options.match_compiler ? 3 : 1 << gress; + + for (gress_t gress : Range(INGRESS, EGRESS)) { + regs.cfg_regs.amod_pre_drain_delay[gress] = pipelength(gress) - 9; + if (this[1].stage_dep[gress] == MATCH_DEP) + regs.cfg_regs.amod_wide_bubble_rsp_delay[gress] = pipelength(gress) - 3; + else + regs.cfg_regs.amod_wide_bubble_rsp_delay[gress] = 0; + } + /* Max re-request limit with a long interval. Parb is going to have a large + * gap configured to minimize traffic hits during configuration this means + * that individual stages may not get their bubbles and will need to retry. 
*/ + regs.cfg_regs.amod_req_interval = 6732; + regs.cfg_regs.amod_req_limit = 15; + + if (stageno == 0) { + /* MFerrera: "After some debug on the emulator, we've found a programming issue due to + * incorrect documentation and CSR description of match_ie_input_mux_sel in JBAY" + * MAU Stage 0 must always be configured to source iPHV from Parser-Arbiter + * Otherwise, MAU stage 0 is configured as match-dependent on Parser-Arbiter */ + regs.dp.match_ie_input_mux_sel |= 3; + } + + merge.pred_stage_id = stageno; + if (long_branch_terminate) merge.pred_long_brch_terminate = long_branch_terminate; + for (gress_t gress : Range(INGRESS, GHOST)) { + if (long_branch_thread[gress]) + merge.pred_long_brch_thread[gress] = long_branch_thread[gress]; + } + + for (gress_t gress : Range(INGRESS, GHOST)) { + merge.mpr_stage_id[gress] = mpr_stage_id[gress]; + for (int id = 0; id < LOGICAL_TABLES_PER_STAGE; ++id) { + merge.mpr_next_table_lut[gress][id] = mpr_next_table_lut[gress][id]; + } + } + for (int id = 0; id < LOGICAL_TABLES_PER_STAGE; ++id) { + merge.mpr_glob_exec_lut[id] = mpr_glob_exec_lut[id]; + } + for (int id = 0; id < MAX_LONGBRANCH_TAGS; ++id) { + merge.mpr_long_brch_lut[id] = mpr_long_brch_lut[id]; + } + merge.mpr_always_run = mpr_always_run; + + if (stageno != AsmStage::numstages() - 1) { + merge.mpr_bus_dep.mpr_bus_dep_ingress = this[1].stage_dep[INGRESS] != MATCH_DEP; + merge.mpr_bus_dep.mpr_bus_dep_egress = this[1].stage_dep[EGRESS] != MATCH_DEP; + } + + merge.mpr_bus_dep.mpr_bus_dep_glob_exec = mpr_bus_dep_glob_exec[INGRESS] | + mpr_bus_dep_glob_exec[EGRESS] | + mpr_bus_dep_glob_exec[GHOST]; + merge.mpr_bus_dep.mpr_bus_dep_long_brch = mpr_bus_dep_long_branch[INGRESS] | + mpr_bus_dep_long_branch[EGRESS] | + mpr_bus_dep_long_branch[GHOST]; + + merge.mpr_long_brch_thread = long_branch_thread[EGRESS]; + if (auto conflict = (long_branch_thread[INGRESS] | long_branch_thread[GHOST]) & + long_branch_thread[EGRESS]) { + // Should probably check this earlier, but there's not 
a good place to do it. + for (auto tag : bitvec(conflict)) { + error(long_branch_use[tag]->lineno, + "Need one-stage turnaround before reusing " + "long_branch tag %d in a different thread", + tag); + } + } + + bitvec in_use = match_use[INGRESS] | action_use[INGRESS] | action_set[INGRESS]; + bitvec eg_use = match_use[EGRESS] | action_use[EGRESS] | action_set[EGRESS]; + /* FIXME -- if the regs are live across a stage (even if not used in that stage) they + * need to be set in the thread registers. For now we just assume if they are used + * anywhere, they need to be marked as live */ + in_use |= Phv::use(INGRESS); + eg_use |= Phv::use(EGRESS); + static const int phv_use_transpose[2][14] = { + {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21}, + {4, 5, 6, 7, 12, 13, 14, 15, 22, 23, 24, 25, 26, 27}}; + // FIXME -- this code depends on the Phv::Register uids matching the + // FIXME -- mau encoding of phv containers. (FIXME-PHV) + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 14; j++) { + regs.dp.phv_ingress_thread[i][j] = regs.dp.phv_ingress_thread_imem[i][j] = + in_use.getrange(10 * phv_use_transpose[i][j], 10); + regs.dp.phv_egress_thread[i][j] = regs.dp.phv_egress_thread_imem[i][j] = + eg_use.getrange(10 * phv_use_transpose[i][j], 10); + } + } + + /* Things following are for debug/bringup only : not for normal flows */ + + if (options.disable_power_gating) { + disable_jbay_power_gating(regs); + } + + write_teop_regs(regs); +} + +void AlwaysRunTable::write_regs(Target::JBay::mau_regs ®s) { + if (gress == EGRESS) + regs.dp.imem_word_read_override.imem_word_read_override_egress = 1; + else + regs.dp.imem_word_read_override.imem_word_read_override_ingress = 1; + actions->write_regs(regs, this); +} diff --git a/backends/tofino/bf-asm/jbay/stateful.cpp b/backends/tofino/bf-asm/jbay/stateful.cpp new file mode 100644 index 00000000000..a429828b723 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/stateful.cpp @@ -0,0 +1,393 @@ +/** + * Copyright (C) 2024 Intel 
Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/jbay/stateful.h" + +static const char *function_names[] = {"none", "log", "fifo", "stack", "clear"}; + +static int decode_push_pop(const value_t &v) { + static const std::map modes = { + {"hit", PUSH_HIT}, {"miss", PUSH_MISS}, {"gateway", PUSH_GATEWAY}, {"active", PUSH_ALL}}; + if (!CHECKTYPE(v, tSTR)) return 0; + if (!modes.count(v.s)) { + error(v.lineno, "Unknown push/pop mode %s", v.s); + return 0; + } + return modes.at(v.s); +} + +bool StatefulTable::setup_jbay(const pair_t &kv) { + if (kv.key == "sbus") { + // FIXME -- this should be in the stateful action setup as it is per action? 
+ if (!CHECKTYPE(kv.value, tMAP)) return true; + for (auto &el : kv.value.map) { + if (el.key == "match") { + parse_vector(sbus_match, el.value); + } else if (el.key == "learn") { + parse_vector(sbus_learn, el.value); + } else if (el.key == "operation" || el.key == "combine") { + if (el.value == "and") + sbus_comb = SBUS_AND; + else if (el.value == "or") + sbus_comb = SBUS_OR; + else + error(el.value.lineno, "Invalid sbus %s %s, must be 'and' or 'or'", + value_desc(el.key), value_desc(el.value)); + } else { + warning(el.key.lineno, "ignoring unknown item %s in sbus of table %s", + value_desc(el.key), name()); + } + } + } else if (kv.key == "fifo" || kv.key == "stack") { + if (stateful_counter_mode) { + error(kv.key.lineno, "Conflicting log counter functions in %s", name()); + return true; + } + if (kv.key == "fifo") + stateful_counter_mode = FUNCTION_FIFO; + else if (kv.key == "stack") + stateful_counter_mode = FUNCTION_STACK; + if (!CHECKTYPE(kv.value, tMAP)) return true; + for (auto &el : MapIterChecked(kv.value.map)) { + if (el.key == "push") + stateful_counter_mode |= decode_push_pop(el.value); + else if (el.key == "pop") + stateful_counter_mode |= decode_push_pop(el.value) << PUSHPOP_BITS; + else + error(el.key.lineno, "Syntax error, expecting push or pop"); + } + } else if (kv.key == "clear") { + if (stateful_counter_mode) { + error(kv.key.lineno, "Conflicting log counter functions in %s", name()); + return true; + } + stateful_counter_mode = FUNCTION_FAST_CLEAR; + stateful_counter_mode |= decode_push_pop(kv.value); + } else if (kv.key == "watermark") { + if (kv.value == "pop") + watermark_pop_not_push = 1; + else if (kv.value != "push") + error(kv.value.lineno, "Syntax error, expecting push or pop"); + if (kv.value.type == tSTR) + watermark_level = 1; + else if (CHECKTYPE(kv.value[1], tINT)) + watermark_level = kv.value[1].i / 128; + if (kv.value[1].i % 128 != 0) + error(kv.value[1].lineno, "watermark level must be a mulitple of 128"); + } else if (kv.key == 
"underflow") { + if (CHECKTYPE(kv.value, tSTR)) underflow_action = kv.value; + } else if (kv.key == "overflow") { + if (CHECKTYPE(kv.value, tSTR)) overflow_action = kv.value; + } else if (kv.key == "offset_vpn") { + offset_vpn = get_bool(kv.value); + } else if (kv.key == "address_shift") { + if (CHECKTYPE(kv.value, tINT)) meter_adr_shift = kv.value.i; + } else if (kv.key == "phv_hash_shift") { + if (CHECKTYPE(kv.value, tINT)) { + phv_hash_shift = kv.value.i / 8U; + if (kv.value.i % 8U != 0) + error(kv.value.lineno, "phv_hash_shift must be a mulitple of 8"); + else if (phv_hash_shift < 0 || phv_hash_shift > 15) + error(kv.value.lineno, "phv_hash_shift %" PRId64 " out of range", kv.value.i); + } + } else if (kv.key == "phv_hash_mask") { + if (CHECKTYPE2(kv.value, tINT, tBIGINT)) phv_hash_mask = get_bitvec(kv.value); + } else if (kv.key == "stage_alu_id") { + if (CHECKTYPE(kv.value, tINT)) { + if (kv.value.i < 0 || kv.value.i >= 128) + error(kv.value.lineno, "invalid stage_alu_id %" PRIi64, kv.value.i); + stage_alu_id = kv.value.i; + } + } else { + return false; + } + return true; +} + +int parse_jbay_counter_mode(const value_t &v) { + int rv = 0; + if (v == "counter") + rv = FUNCTION_LOG; + else if (v == "fifo") + rv = FUNCTION_FIFO; + else if (v == "stack") + rv = FUNCTION_STACK; + else if (v == "clear") + rv = FUNCTION_FAST_CLEAR; + else + return -1; + if (v.type == tSTR) return rv | PUSH_ALL; + if (v.type != tCMD) return -1; + int flag = 0; + for (int i = 1; i < v.vec.size; ++i) { + if (v[i] == "hit") { + flag |= PUSH_HIT; + } else if (v[i] == "miss") { + flag |= PUSH_MISS; + } else if (v[i] == "gateway") { + flag |= PUSH_GATEWAY; + } else if (v[i] == "gw0") { + flag |= PUSH_GW_ENTRY; + } else if (v[i] == "gw1") { + flag |= (PUSH_GW_ENTRY << 1); + } else if (v[i] == "gw2") { + flag |= (PUSH_GW_ENTRY << 2); + } else if (v[i] == "gw3") { + flag |= (PUSH_GW_ENTRY << 3); + } else if (v[i] == "push" && (rv & FUNCTION_MASK) != FUNCTION_LOG) { + rv |= flag ? 
flag : PUSH_ALL; + flag = 0; + } else if (v[i] == "pop" && (rv & FUNCTION_MASK) != FUNCTION_LOG) { + rv |= (flag ? flag : PUSH_ALL) << PUSHPOP_BITS; + flag = 0; + } else { + return -1; + } + } + return rv | flag; +} +int StatefulTable::parse_counter_mode(Target::JBay target, const value_t &v) { + return parse_jbay_counter_mode(v); +} + +void StatefulTable::set_counter_mode(Target::JBay target, int mode) { + int fnmode = mode & FUNCTION_MASK; + BUG_CHECK(fnmode > 0 && (fnmode >> FUNCTION_SHIFT) <= FUNCTION_FAST_CLEAR); + if (stateful_counter_mode && (stateful_counter_mode & FUNCTION_MASK) != fnmode) + error(lineno, "Incompatible uses (%s and %s) of stateful alu counters", + function_names[stateful_counter_mode >> FUNCTION_SHIFT], + function_names[mode >> FUNCTION_SHIFT]); + else + stateful_counter_mode |= fnmode; + if (mode & PUSH_MASK) stateful_counter_mode |= PUSH_ANY; + if (mode & POP_MASK) stateful_counter_mode |= POP_ANY; +} + +// DANGER -- nasty hack to set the raw bits of an SALU state alu instruction +// really need to make the csr2cpp codegen handle this automatically +template +void set_raw_instr_bits(checked_array<4, T> ®, bitvec v) { + for (int i = 0; i < 4; ++i) { + reg[i].salu_const_src = v.getrange(i * 32, 4); + reg[i].salu_regfile_const = v.getrange(i * 32 + 4, 1); + reg[i].salu_bsrc_input = v.getrange(i * 32 + 5, 3); + reg[i].salu_asrc_input = v.getrange(i * 32 + 8, 3); + reg[i].salu_op = v.getrange(i * 32 + 11, 4); + reg[i].salu_arith = v.getrange(i * 32 + 15, 1); + reg[i].salu_pred = v.getrange(i * 32 + 16, 16); + } +} + +static int counter_to_use(MatchTable *m) { + for (auto st : m->get_attached()->statefuls) return st->to()->meter_group(); + BUG("no attached stateful table?"); + return 0; +} + +template +void StatefulTable::write_tofino2_common_regs(REGS ®s) { + auto &adrdist = regs.rams.match.adrdist; + auto &merge = regs.rams.match.merge; + auto &vpn_range = adrdist.mau_meter_alu_vpn_range[meter_group()]; + auto &salu = 
regs.rams.map_alu.meter_group[meter_group()].stateful; + int minvpn, maxvpn; + layout_vpn_bounds(minvpn, maxvpn, true); + vpn_range.meter_vpn_base = minvpn; + vpn_range.meter_vpn_limit = maxvpn; + vpn_range.meter_vpn_range_check_enable = 1; + int counter_idx = -1; + Actions::Action *sweep_action = nullptr; + for (MatchTable *m : match_tables) { + int mode = 0; + if (auto *call = m->get_call(this)) { + if (call->args.at(0).type == Call::Arg::Counter) { + mode = call->args.at(0).count_mode(); + if (counter_idx < 0) + counter_idx = counter_to_use(m); + else + BUG_CHECK(counter_idx == counter_to_use(m), "conflicting counter use in %s", + name()); + } + if ((mode & FUNCTION_MASK) == FUNCTION_FAST_CLEAR) { + for (auto &a : *m->get_actions()) { + if (auto *sw = action_for_table_action(m, &a)) { + BUG_CHECK(!sweep_action || sw == sweep_action, + "Inconsistent sweep action for %s", name()); + sweep_action = sw; + } + } + } + } + if (address_used) { + auto &slog_map = adrdist.mau_stateful_log_counter_logical_map[m->logical_id]; + slog_map.stateful_log_counter_logical_map_ctl = meter_group(); + slog_map.stateful_log_counter_logical_map_enable = 1; + } + if (mode) { + merge.mau_stateful_log_counter_ctl[m->logical_id / 8U][0].set_subfield( + mode & PUSHPOP_MASK, 4 * (m->logical_id % 8U), 4); + merge.mau_stateful_log_counter_ctl[m->logical_id / 8U][1].set_subfield( + (mode >> PUSHPOP_BITS) & PUSHPOP_MASK, 4 * (m->logical_id % 8U), 4); + for (auto &rep : merge.mau_stateful_log_ctl_ixbar_map[m->logical_id / 8U]) { + if (mode & PUSHPOP_MASK) + rep[0].set_subfield(counter_idx | 0x4, 3 * (m->logical_id % 8U), 3); + if ((mode >> PUSHPOP_BITS) & PUSHPOP_MASK) + rep[1].set_subfield(counter_idx | 0x4, 3 * (m->logical_id % 8U), 3); + } + } + if (address_used) + adrdist.meter_alu_adr_range_check_icxbar_map[meter_group()] |= 1U << m->logical_id; + if (offset_vpn) { + if (!address_used) + warning(lineno, + "Adjusting output address of %s for next stage, but noone is " + "reading it", + 
name()); + adrdist.mau_stateful_log_stage_vpn_offset[m->logical_id].stateful_log_stage_vpn_offset = + maxvpn - minvpn + 1; + // state_instr_width_logical and stateful_log_stage_vpn_offset + // should be set or unset together as they are both used for the + // stateful logging fifo feature. See figure 6-73 in jbay uarch. + adrdist.stateful_instr_width_logical[m->logical_id] = format->log2size - 3; + } + } + switch (meter_group()) { + case 0: + adrdist.meter_adr_shift.meter_adr_shift0 = meter_adr_shift; + break; + case 1: + adrdist.meter_adr_shift.meter_adr_shift1 = meter_adr_shift; + break; + case 2: + adrdist.meter_adr_shift.meter_adr_shift2 = meter_adr_shift; + break; + case 3: + adrdist.meter_adr_shift.meter_adr_shift3 = meter_adr_shift; + break; + } + if (counter_idx >= 0) { + auto &oxbar_map = adrdist.mau_stateful_log_counter_oxbar_map[meter_group()]; + oxbar_map.stateful_log_counter_oxbar_ctl = counter_idx; + oxbar_map.stateful_log_counter_oxbar_enable = 1; + } + auto &ctl2 = merge.mau_stateful_log_counter_ctl2[meter_group()]; + auto &ctl3 = merge.mau_stateful_log_counter_ctl3[meter_group()]; + if (stateful_counter_mode && (stateful_counter_mode & FUNCTION_MASK) != FUNCTION_FAST_CLEAR) { + ctl2.slog_counter_function = stateful_counter_mode >> FUNCTION_SHIFT; + ctl2.slog_instruction_width = format->log2size - 3; + if ((stateful_counter_mode & PUSH_ANY) == 0) ctl2.slog_push_event_ctl = 1; + if ((stateful_counter_mode & POP_ANY) == 0) ctl2.slog_pop_event_ctl = 1; + ctl2.slog_vpn_base = logvpn_min; + ctl2.slog_vpn_limit = logvpn_max; + if (watermark_level) { + ctl2.slog_watermark_ctl = watermark_pop_not_push; + ctl2.slog_watermark_enable = 1; + merge.mau_stateful_log_watermark_threshold[meter_group()] = watermark_level; + } + if (underflow_action.set()) { + auto act = actions->action(underflow_action.name); + BUG_CHECK(act); + // 4-bit stateful addr MSB encoding for instruction, as given by table 6-67 (6.4.4.11) + ctl3.slog_underflow_instruction = act->code * 2 + 
1; + } + if (overflow_action.set()) { + auto act = actions->action(overflow_action.name); + BUG_CHECK(act); + ctl3.slog_overflow_instruction = act->code * 2 + 1; + } + } else { + // we set up for fast clear from the control plane if the counter mode is unused + ctl2.slog_counter_function = FUNCTION_FAST_CLEAR >> FUNCTION_SHIFT; + ctl2.slog_instruction_width = 4; // 128 bits + ctl2.slog_vpn_base = minvpn; + ctl2.slog_vpn_limit = maxvpn; + if (busy_value) salu.stateful_clear_action_output = busy_value; + if (clear_value) { + set_raw_instr_bits(salu.salu_instr_state_alu[3], clear_value); + salu.stateful_ctl.salu_clear_value_ctl = 1; + } + if (sweep_action) { + ctl3.slog_overflow_instruction = sweep_action->code * 2 + 1; + } else { + ctl3.slog_overflow_instruction = 0x6; + } + } + regs.rams.map_alu.meter_alu_group_phv_hash_shift[meter_group()] = phv_hash_shift; + unsigned idx = 0; + for (auto &slice : regs.rams.map_alu.meter_alu_group_phv_hash_mask[meter_group()]) + slice = phv_hash_mask.getrange(32 * idx++, 32); + + for (size_t i = 0; i < const_vals.size(); ++i) { + if (const_vals[i].value > (INT64_C(1) << 33) || const_vals[i].value <= -(INT64_C(1) << 33)) + error(const_vals[i].lineno, "constant value %" PRId64 " too large for stateful alu", + const_vals[i].value); + salu.salu_const_regfile[i] = const_vals[i].value & 0xffffffffU; + salu.salu_const_regfile_msbs[i] = (const_vals[i].value >> 32) & 0x3; + } + if (stage_alu_id >= 0) { + salu.stateful_ctl.salu_stage_id = stage_alu_id; + salu.stateful_ctl.salu_stage_id_enable = 1; + } +} + +// This is called write_logging_regs, but it handles all tofino2+ target specific +// registers, as write_regs is not specialized and this is. Should rename? 
+template <> +void StatefulTable::write_logging_regs(Target::JBay::mau_regs ®s) { + write_tofino2_common_regs(regs); +} + +/// Compute the proper value for the register +/// map_alu.meter_alu_group_data_delay_ctl[].meter_alu_right_group_delay +/// which controls the two halves of the ixbar->meter_alu fifo, based on a bytemask of which +/// bytes are needed in the meter_alu. On JBay, the fifo is 128 bits wide, so each enable +/// bit controls 64 bits +int AttachedTable::meter_alu_fifo_enable_from_mask(Target::JBay::mau_regs &, unsigned bytemask) { + int rv = 0; + if (bytemask & 0xff) rv |= 1; + if (bytemask & 0xff00) rv |= 2; + return rv; +} + +void StatefulTable::gen_tbl_cfg(Target::JBay, json::map &tbl, json::map &stage_tbl) const { + static const char *table_type[] = {"normal", "log", "fifo", "stack", "bloom_clear"}; + if (tbl["stateful_table_type"]) { + // overall table info already set in an earlier stage; don't override it + return; + } + tbl["stateful_table_type"] = table_type[stateful_counter_mode >> FUNCTION_SHIFT]; + bool has_push = (stateful_counter_mode & PUSHPOP_MASK) != 0; + bool has_pop = (stateful_counter_mode & (PUSHPOP_MASK << PUSHPOP_BITS)) != 0; + for (MatchTable *m : match_tables) { + if (auto *call = m->get_call(this)) { + if (call->args.at(0).type == Call::Arg::Counter) { + unsigned mode = call->args.at(0).count_mode(); + has_push |= (mode & PUSHPOP_MASK) != 0; + has_pop |= (mode & (PUSHPOP_MASK << PUSHPOP_BITS)) != 0; + } + } + } + if (has_push) { + if (has_pop) + tbl["stateful_direction"] = "inout"; + else + tbl["stateful_direction"] = "in"; + } else if (has_pop) { + tbl["stateful_direction"] = "out"; + } + tbl["stateful_counter_index"] = meter_group(); +} diff --git a/backends/tofino/bf-asm/jbay/stateful.h b/backends/tofino/bf-asm/jbay/stateful.h new file mode 100644 index 00000000000..91d9a8cb8a4 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/stateful.h @@ -0,0 +1,57 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JBAY_STATEFUL_H_ +#define BACKENDS_TOFINO_BF_ASM_JBAY_STATEFUL_H_ + +#include "backends/tofino/bf-asm/tables.h" +#include "backends/tofino/bf-asm/target.h" + +// FIXME -- should be a namespace somwhere? Or in class StatefulTable +/* for jbay counter mode, we may need both a push and a pop mode, as well as counter_function, + * so we pack them all into an int with some shifts and masks */ +enum { + PUSHPOP_BITS = 5, + PUSHPOP_MASK = 0xf, + PUSHPOP_ANY = 0x10, + PUSH_MASK = PUSHPOP_MASK, + PUSH_ANY = PUSHPOP_ANY, + POP_MASK = PUSHPOP_MASK << PUSHPOP_BITS, + POP_ANY = PUSHPOP_ANY << PUSHPOP_BITS, + PUSH_MISS = 1, + PUSH_HIT = 2, + PUSH_GATEWAY = 3, + PUSH_ALL = 4, + PUSH_GW_ENTRY = 5, + POP_MISS = PUSH_MISS << PUSHPOP_BITS, + POP_HIT = PUSH_HIT << PUSHPOP_BITS, + POP_GATEWAY = PUSH_GATEWAY << PUSHPOP_BITS, + POP_ALL = PUSH_ALL << PUSHPOP_BITS, + POP_GW_ENTRY = PUSH_GW_ENTRY << PUSHPOP_BITS, + FUNCTION_SHIFT = 2 * PUSHPOP_BITS, + FUNCTION_LOG = 1 << FUNCTION_SHIFT, + FUNCTION_FIFO = 2 << FUNCTION_SHIFT, + FUNCTION_STACK = 3 << FUNCTION_SHIFT, + FUNCTION_FAST_CLEAR = 4 << FUNCTION_SHIFT, + FUNCTION_MASK = 0xf << FUNCTION_SHIFT, +}; + +int parse_jbay_counter_mode(const value_t &v); +template <> +void StatefulTable::write_logging_regs(Target::JBay::mau_regs ®s); + +#endif /* BACKENDS_TOFINO_BF_ASM_JBAY_STATEFUL_H_ */ diff --git 
a/backends/tofino/bf-asm/jbay/template_objects.yaml b/backends/tofino/bf-asm/jbay/template_objects.yaml new file mode 100644 index 00000000000..9dc2f8ae478 --- /dev/null +++ b/backends/tofino/bf-asm/jbay/template_objects.yaml @@ -0,0 +1,121 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 + +global: + - namespace=JBay + - binary_offset + - emit_binary + - emit_fieldname + - emit_json + - enable_disable + - input_binary + - write_dma=mapram_config + - write_dma=imem_dark_subword8 + - write_dma=imem_dark_subword16 + - write_dma=imem_dark_subword32 + - write_dma=imem_mocha_subword8 + - write_dma=imem_mocha_subword16 + - write_dma=imem_mocha_subword32 + - write_dma=imem_subword8 + - write_dma=imem_subword16 + - write_dma=imem_subword32 + - write_dma=galois_field_matrix +generate: + memories: + jbay_mem: + memories.jbay_mem.h: [ decl, name=memories.top ] + memories.jbay_mem.cpp: [ defn, name=memories.top, + -Imemories.jbay_mem.h, -Imemories.pipe_addrmap.h ] + pipe_addrmap: # pipes + memories.pipe_addrmap.h: [ decl, name=memories.pipe, widereg ] + memories.pipe_addrmap.cpp: [ defn, name=memories.pipe, widereg, + -Imemories.pipe_addrmap.h, -Imemories.prsr_mem_main_rspec.h ] + # parde_mem -- parde + prsr_mem_main_rspec: # i_prsr_mem e_prsr_mem + memories.prsr_mem_main_rspec.h: [ decl, name=memories.parser.%s ] + memories.prsr_mem_main_rspec.cpp: [ defn, name=memories.parser.%s, + 
-Imemories.prsr_mem_main_rspec.h ] + + regs: + jbay_reg: + regs.jbay_reg.h: [ decl, name=regs.top ] + regs.jbay_reg.cpp: [ defn, name=regs.top, + -Iregs.jbay_reg.h, -Iregs.pipe_addrmap.h ] + pipe_addrmap: # pipea + regs.pipe_addrmap.h: [ decl, name=regs.pipe, widereg ] + regs.pipe_addrmap.cpp: [ defn, name=regs.pipe, widereg, + -Iregs.pipe_addrmap.h, -Iregs.ipb_prsr4_reg.h, -Iregs.epb_prsr4_reg.h, + -Iregs.pmerge_reg.h, -Iregs.mau_addrmap.h, -Iregs.dprsr_reg.h ] + mau_addrmap: # mau + regs.mau_addrmap.h: [ decl, name=regs.match_action_stage.%02x ] + regs.mau_addrmap.cpp: [ defn, name=regs.match_action_stage.%02x, + -Iregs.mau_addrmap.h ] + # parde_glue_stn_reg + ipb_prsr4_reg: # ipbprsr4reg + regs.ipb_prsr4_reg.h: [ decl, name=regs.parser.ingress ] + regs.ipb_prsr4_reg.cpp: [ defn, name=regs.parser.ingress, + -Iregs.ipb_prsr4_reg.h, -Iregs.prsr_reg_main_rspec.h ] + prsr_reg_main_rspec: # prsr + regs.prsr_reg_main_rspec.h: [ decl, name=regs.parser.main.%s ] + regs.prsr_reg_main_rspec.cpp: [ defn, name=regs.parser.main.%s, + -Iregs.prsr_reg_main_rspec.h ] + pmerge_reg: # pmergereg + regs.pmerge_reg.h: [ decl, name=regs.parse_merge ] + regs.pmerge_reg.cpp: [ defn, name=regs.parse_merge, + -Iregs.pmerge_reg.h ] + epb_prsr4_reg: # epbprsr4reg + regs.epb_prsr4_reg.h: [ decl, name=regs.parser.egress ] + regs.epb_prsr4_reg.cpp: [ defn, name=regs.parser.egress, + -Iregs.epb_prsr4_reg.h, -Iregs.prsr_reg_main_rspec.h ] + # prsr_reg_main_rspec # prsr + # mirr_ebuf_reg + dprsr_reg: + regs.dprsr_reg.h: [ decl, name=regs.deparser ] + regs.dprsr_reg.cpp: [ defn, name=regs.deparser, + -Iregs.dprsr_reg.h ] + +ignore: + memories: + # jbay_mem + - tm_top_mem_rspec # tm + # pipes + - mau_addrmap # mau -- just a dummy reg + # parde + - pgr_mem_rspec + + regs: + # jbay_reg + - dvsl_addrmap + - eth100g_addrmap + - eth400g_addrmap + - gpio_regs + - serdes_addrmap + # pipes + # mau + # pardereg + # parde_glue_stn_reg + # parb_reg + - ebuf900_reg + - pbus_station_regs_rspec + - pgr_reg_rspec 
# pgrreg + - s2p_reg + - p2s_reg + - parde_glue_reg_rspec #pgluereg + # mirr_ebuf_reg # mirefreg + # dprsr_reg + # dprsr_reg_rspec + - parde_dprsr_reg_rspec diff --git a/backends/tofino/bf-asm/json.cpp b/backends/tofino/bf-asm/json.cpp new file mode 100644 index 00000000000..4f1d1572024 --- /dev/null +++ b/backends/tofino/bf-asm/json.cpp @@ -0,0 +1,253 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/json.h" + +#include +#include + +#include "lib/hex.h" + +namespace json { + +static int digit_value(char ch) { + if (ch >= 'a') return ch - 'a' + 10; + if (ch >= 'A') return ch - 'A' + 10; + if (ch >= '0' && ch <= '9') return ch - '0'; + return 999; +} + +// true iff the string ends in an odd number of '\' characters +static bool odd_backslash(const std::string &s) { + int cnt = 0; + for (int i = s.size() - 1; i >= 0; --i) { + if (s[i] != '\\') break; + cnt++; + } + return (cnt & 1) == 1; +} + +std::istream &operator>>(std::istream &in, std::unique_ptr &json) { + while (in) { + bool neg = false; + char ch; + int base = 10, digit; + in >> ch; + switch (ch) { + case '-': + neg = true; + in >> ch; + if (ch != '0') goto digit; + /* fall through */ + case '0': + base = 8; + in >> ch; + if (ch == 'x' || ch == 'X') { + base = 16; + in >> ch; + } else if (ch == 'b') { + base = 2; + in >> ch; + } + /* fall through */ + digit: + case '1': + 
case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + int64_t l = 0; + while (in && (digit = digit_value(ch)) < base) { + if ((INT64_MAX - digit) / base < l) { + std::cerr << "overflow detected" << std::endl; + } + l = l * base + digit; + in >> ch; + } + if (in) in.unget(); + if (neg) l = -l; + json.reset(new number(l)); + return in; + } + case '"': { + std::string s; + getline(in, s, '"'); + while (odd_backslash(s)) { + std::string tmp; + getline(in, tmp, '"'); + s += '\"'; + s += tmp; + } + json.reset(new string(std::move(s))); + return in; + } + case '[': { + std::unique_ptr rv(new vector()); + in >> ch; + if (ch != ']') { + in.unget(); + do { + std::unique_ptr o; + in >> o >> ch; + rv->push_back(std::move(o)); + if (ch != ',' && ch != ']') { + std::cerr << "missing ',' in vector (saw '" << ch << "')" << std::endl; + in.unget(); + } + } while (in && ch != ']'); + } + json = std::move(rv); + return in; + } + case '{': { + std::unique_ptr rv(new map()); + in >> ch; + if (ch != '}') { + in.unget(); + do { + std::unique_ptr key, val; + in >> key >> ch; + if (ch == '}') { + std::cerr << "missing value in map" << std::endl; + } else { + if (ch != ':') { + std::cerr << "missing ':' in map (saw '" << ch << "')" << std::endl; + in.unget(); + } + in >> val >> ch; + } + if (rv->count(key.get())) + std::cerr << "duplicate key in map" << std::endl; + else + (*rv)[std::move(key)] = std::move(val); + if (ch != ',' && ch != '}') { + std::cerr << "missing ',' in map (saw '" << ch << "')" << std::endl; + in.unget(); + } + } while (in && ch != '}'); + } + json = std::move(rv); + return in; + } + default: + if (isalpha(ch) || ch == '_') { + std::string s; + while (isalnum(ch) || ch == '_') { + s += ch; + if (!(in >> ch)) break; + } + in.unget(); + if (s == "true") + json.reset(new True()); + else if (s == "false") + json.reset(new False()); + else if (s == "null") + json.reset(); + else + json.reset(new string(std::move(s))); + return in; 
+ } else { + std::cerr << "unexpected character '" << ch << "' (0x" << hex(ch) << ")" + << std::endl; + } + } + } + return in; +} + +void vector::print_on(std::ostream &out, int indent, int width, const char *pfx) const { + int twidth = width; + bool first = true; + bool oneline = test_width(twidth); + out << '['; + indent += 2; + for (auto &e : *this) { + if (!first) out << ','; + if (!oneline) out << '\n' << pfx << std::setw(indent); + out << ' ' << std::setw(0); + if (e) + e->print_on(out, indent, width - 2, pfx); + else + out << "null"; + first = false; + } + indent -= 2; + if (!first) out << (oneline ? ' ' : '\n'); + if (!oneline) out << std::setw(indent + 1); + out << ']'; +} + +void map::print_on(std::ostream &out, int indent, int width, const char *pfx) const { + int twidth = width; + bool first = true; + bool oneline = test_width(twidth); + // std::cout << "*** width=" << width << " twdith=" << twidth << std::endl; + out << '{'; + indent += 2; + for (auto &e : *this) { + if (!first) out << ','; + if (!oneline) out << '\n' << pfx << std::setw(indent); + out << ' ' << std::setw(0); + e.first->print_on(out, indent, width - 2, pfx); + out << ": "; + if (e.second) + e.second->print_on(out, indent, width - 2, pfx); + else + out << "null"; + first = false; + } + indent -= 2; + if (!first) out << (oneline ? 
' ' : '\n'); + if (!oneline) out << std::setw(indent + 1); + out << '}'; +} + +std::string obj::toString() const { + std::stringstream buf; + print_on(buf); + return buf.str(); +} + +map &map::merge(const map &a) { + for (auto &el : a) { + if (!el.second) { + erase(el.first); + } else if (count(el.first)) { + auto &exist = at(el.first); + if (exist->is() && el.second->is()) { + exist->to().merge(el.second->to()); + } else if (exist->is() && el.second->is()) { + auto &vec = exist->to(); + for (auto &vel : el.second->to()) vec.push_back(vel->clone()); + } else { + exist = el.second->clone(); + } + } else { + emplace(el.first->clone().release(), el.second->clone()); + } + } + return *this; +} + +} // namespace json + +void dump(const json::obj &o) { std::cout << &o << std::endl; } +void dump(const json::obj *o) { std::cout << o << std::endl; } diff --git a/backends/tofino/bf-asm/json.h b/backends/tofino/bf-asm/json.h new file mode 100644 index 00000000000..27f707e9fa3 --- /dev/null +++ b/backends/tofino/bf-asm/json.h @@ -0,0 +1,641 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_JSON_H_ // NOLINT(build/header_guard) +#define BACKENDS_TOFINO_BF_ASM_JSON_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "backends/tofino/bf-asm/rvalue_reference_wrapper.h" +#include "lib/ordered_map.h" + +using namespace P4; + +namespace json { + +/* this is std::make_unique, except that is missing in some compilers/versions. We give + * it a different name as other compilers complain about ambiguities if we don't... */ +template +std::unique_ptr mkuniq(Args &&...args) { + std::unique_ptr ret(new T(std::forward(args)...)); + return ret; +} + +class number; +class string; +class vector; +class map; + +class obj { + public: + obj() {} + obj(const obj &) = default; + obj(obj &&) = default; + obj &operator=(const obj &) & = default; + obj &operator=(obj &&) & = default; + virtual ~obj() {} + virtual bool operator<(const obj &a) const = 0; + bool operator>=(const obj &a) const { return !(*this < a); } + bool operator>(const obj &a) const { return a < *this; } + bool operator<=(const obj &a) const { return !(a < *this); } + virtual bool operator==(const obj &a) const = 0; + bool operator!=(const obj &a) const { return !(*this == a); } + virtual bool operator==(const char * /*str*/) const { return false; } + virtual bool operator==(const std::string & /*str*/) const { return false; } + virtual bool operator==(const string & /*str*/) const { return false; } + bool operator!=(const char *str) const { return !(*this == str); } + virtual bool operator==(int64_t /*val*/) const { return false; } + bool operator!=(int64_t val) const { return !(*this == val); } + struct ptrless { + bool operator()(const obj *a, const obj *b) const { return b ? a ? *a < *b : true : false; } + bool operator()(const std::unique_ptr &a, const std::unique_ptr &b) const { + return b ? a ? 
*a < *b : true : false; + } + }; + virtual void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const = 0; + virtual bool test_width(int &limit) const = 0; + virtual number *as_number() { return nullptr; } + virtual const number *as_number() const { return nullptr; } + virtual string *as_string() { return nullptr; } + virtual const string *as_string() const { return nullptr; } + virtual vector *as_vector() { return nullptr; } + virtual const vector *as_vector() const { return nullptr; } + virtual map *as_map() { return nullptr; } + virtual const map *as_map() const { return nullptr; } + virtual const char *c_str() const { return nullptr; } + template + bool is() const { + return dynamic_cast(this) != nullptr; + } + template + T &to() { + return dynamic_cast(*this); + } + template + const T &to() const { + return dynamic_cast(*this); + } + virtual std::unique_ptr copy() && = 0; // Creates a shallow copy of unique_ptr + virtual std::unique_ptr clone() const = 0; // Creates a deep copy of obj + static std::unique_ptr clone_ptr(const std::unique_ptr &a) { + return a ? 
a->clone() : std::unique_ptr(); + } + std::string toString() const; +}; + +class True : public obj { + bool operator<(const obj &a) const { + return std::type_index(typeid(*this)) < std::type_index(typeid(a)); + } + bool operator==(const obj &a) const { return dynamic_cast(&a) != 0; } + void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const { + out << "true"; + } + bool test_width(int &limit) const { + limit -= 4; + return limit >= 0; + } + std::unique_ptr copy() && { return mkuniq(std::move(*this)); } + std::unique_ptr clone() const { return mkuniq(); } +}; + +class False : public obj { + bool operator<(const obj &a) const { + return std::type_index(typeid(*this)) < std::type_index(typeid(a)); + } + bool operator==(const obj &a) const { return dynamic_cast(&a) != 0; } + void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const { + out << "false"; + } + bool test_width(int &limit) const { + limit -= 5; + return limit >= 0; + } + std::unique_ptr copy() && { return mkuniq(std::move(*this)); } + std::unique_ptr clone() const { return mkuniq(); } +}; + +class number : public obj { + public: + int64_t val; + explicit number(int64_t l) : val(l) {} + ~number() {} + bool operator<(const obj &a) const override { + if (auto *b = dynamic_cast(&a)) return val < b->val; + return std::type_index(typeid(*this)) < std::type_index(typeid(a)); + } + bool operator==(const obj &a) const override { + if (auto *b = dynamic_cast(&a)) return val == b->val; + return false; + } + bool operator==(int64_t v) const override { return val == v; } + void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const override { + out << val; + } + bool test_width(int &limit) const override { + char buf[32]; + limit -= snprintf(buf, sizeof(buf), "%" PRId64, val); + return limit >= 0; + } + number *as_number() override { return this; } + const number 
*as_number() const override { return this; } + std::unique_ptr copy() && override { return mkuniq(std::move(*this)); } + std::unique_ptr clone() const override { return mkuniq(val); } +}; + +class string : public obj, public std::string { + public: + string() {} + string(const string &) = default; + string(const std::string &a) : std::string(a) {} // NOLINT(runtime/explicit) + string(const char *a) : std::string(a) {} // NOLINT(runtime/explicit) + string(string &&) = default; + string(std::string &&a) : std::string(a) {} // NOLINT + string(int64_t l) : std::string(std::to_string(l)) {} // NOLINT + string &operator=(const string &) & = default; + string &operator=(string &&) & = default; + ~string() {} + bool operator<(const obj &a) const override { + if (const string *b = dynamic_cast(&a)) + return static_cast(*this) < static_cast(*b); + return std::type_index(typeid(*this)) < std::type_index(typeid(a)); + } + bool operator==(const obj &a) const override { + if (const string *b = dynamic_cast(&a)) + return static_cast(*this) == static_cast(*b); + return false; + } + bool operator==(const string &a) const override { + return static_cast(*this) == static_cast(a); + } + bool operator==(const char *str) const override { + return static_cast(*this) == str; + } + bool operator==(const std::string &str) const override { + return static_cast(*this) == str; + } + void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const override { + out << '"' << *this << '"'; + } + bool test_width(int &limit) const override { + limit -= size() + 2; + return limit >= 0; + } + const char *c_str() const override { return std::string::c_str(); } + string *as_string() override { return this; } + const string *as_string() const override { return this; } + std::unique_ptr copy() && override { return mkuniq(std::move(*this)); } + std::unique_ptr clone() const override { return mkuniq(*this); } +}; + +class map; // forward decl + +typedef 
std::vector> vector_base; +class vector : public obj, public vector_base { + public: + vector() {} + vector(const vector &) = delete; + vector(vector &&) = default; + vector(const std::initializer_list> &init) { + for (auto o : init) push_back(o.get().copy()); + } + vector &operator=(const vector &) & = delete; + vector &operator=(vector &&) & = default; + ~vector() {} + bool operator<(const obj &a) const override { + if (const vector *b = dynamic_cast(&a)) { + auto p1 = begin(), p2 = b->begin(); + while (p1 != end() && p2 != b->end()) { + if (**p1 < **p2) return true; + if (**p1 != **p2) return false; + p1++; + p2++; + } + return p2 != b->end(); + } + return std::type_index(typeid(*this)) < std::type_index(typeid(a)); + } + using obj::operator<=; + using obj::operator>=; + using obj::operator>; + bool operator==(const obj &a) const override { + if (const vector *b = dynamic_cast(&a)) { + auto p1 = begin(), p2 = b->begin(); + while (p1 != end() && p2 != b->end()) { + if (**p1 != **p2) return false; + p1++; + p2++; + } + return (p1 == end() && p2 == b->end()); + } + return false; + } + using obj::operator!=; + void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const override; + bool test_width(int &limit) const override { + limit -= 2; + for (auto &e : *this) { + if (e ? 
!e->test_width(limit) : (limit -= 4) < 0) return false; + if ((limit -= 2) < 0) return false; + } + return true; + } + using vector_base::push_back; + void push_back(decltype(nullptr)) { push_back(std::unique_ptr()); } + void push_back(bool t) { + if (t) + push_back(mkuniq(True())); + else + push_back(mkuniq(False())); + } + void push_back(int64_t n) { push_back(mkuniq(number(n))); } + void push_back(int n) { push_back((int64_t)n); } + void push_back(unsigned int n) { push_back((int64_t)n); } + void push_back(uint64_t n) { push_back((int64_t)n); } + void push_back(const char *s) { push_back(mkuniq(string(s))); } + void push_back(std::string s) { push_back(mkuniq(string(s))); } + void push_back(string s) { push_back(mkuniq(s)); } + void push_back(vector &&v) { push_back(mkuniq(std::move(v))); } + void push_back(json::map &&); // NOLINT(whitespace/operators) + vector *as_vector() override { return this; } + const vector *as_vector() const override { return this; } + std::unique_ptr copy() && override { return mkuniq(std::move(*this)); } + std::unique_ptr clone() const override { + vector *v = new vector(); + for (auto &e : *this) v->push_back(clone_ptr(e)); + return std::unique_ptr(v); + } +}; + +typedef ordered_map, obj::ptrless> map_base; +class map : public obj, public map_base { + public: + map() {} + map(const map &) = default; + map(map &&) = default; + map(const std::initializer_list> &init) { + for (auto &pair : init) (*this)[pair.first] = std::move(pair.second).copy(); + } + map &operator=(const map &) & = default; + map &operator=(map &&) & = default; + ~map() { + for (auto &e : *this) delete e.first; + } + bool operator<(const obj &a) const override { + if (const map *b = dynamic_cast(&a)) { + auto p1 = begin(), p2 = b->begin(); + while (p1 != end() && p2 != b->end()) { + if (*p1->first < *p2->first) return true; + if (*p1->first != *p2->first) return false; + if (*p1->second < *p2->second) return true; + if (*p1->second != *p2->second) return false; + 
p1++; + p2++; + } + return p2 != b->end(); + } + return std::type_index(typeid(*this)) < std::type_index(typeid(a)); + } + using obj::operator<=; + using obj::operator>=; + using obj::operator>; + bool operator==(const obj &a) const override { + if (const map *b = dynamic_cast(&a)) { + auto p1 = begin(), p2 = b->begin(); + while (p1 != end() && p2 != b->end()) { + if (*p1->first != *p2->first) return false; + if (*p1->second != *p2->second) return false; + p1++; + p2++; + } + return (p1 == end() && p2 == b->end()); + } + return false; + } + using obj::operator!=; + std::unique_ptr remove(const char *key) { + string tmp(key); + auto itr = find(&tmp); + if (itr != end()) { + std::unique_ptr val = std::move(itr->second); + this->erase(itr); + return val; + } + return std::unique_ptr(); + } + void print_on(std::ostream &out, int /*indent*/ = 0, int /*width*/ = 80, + const char * /*pfx*/ = "") const override; + bool test_width(int &limit) const override { + limit -= 2; + for (auto &e : *this) { + if (!e.first->test_width(limit)) return false; + if (e.second ? 
!e.second->test_width(limit) : (limit -= 4) < 0) return false; + if ((limit -= 4) < 0) return false; + } + return true; + } + using map_base::count; + map_base::size_type count(const char *str) const { + string tmp(str); + return count(&tmp); + } + map_base::size_type count(std::string &str) const { + string tmp(str); + return count(&tmp); + } + map_base::size_type count(int64_t n) const { + number tmp(n); + return count(&tmp); + } + // using map_base::operator[]; + obj *operator[](const std::unique_ptr &i) const { + auto rv = find(i.get()); + if (rv != end()) return rv->second.get(); + return 0; + } + obj *operator[](const char *str) const { + string tmp(str); + auto rv = find(&tmp); + if (rv != end()) return rv->second.get(); + return 0; + } + obj *operator[](const std::string &str) const { + string tmp(str); + auto rv = find(&tmp); + if (rv != end()) return rv->second.get(); + return 0; + } + obj *operator[](int64_t n) const { + number tmp(n); + auto rv = find(&tmp); + if (rv != end()) return rv->second.get(); + return 0; + } + + private: + class element_ref { + map &self; + std::unique_ptr key; + map_base::iterator iter; + + public: + element_ref(map &s, const char *k) : self(s) { + string tmp(k); + iter = self.find(&tmp); + if (iter == self.end()) key.reset(new string(std::move(tmp))); + } + element_ref(map &s, int64_t k) : self(s) { + number tmp(k); + iter = self.find(&tmp); + if (iter == self.end()) key.reset(new number(std::move(tmp))); + } + element_ref(map &s, std::unique_ptr &&k) : self(s) { + iter = self.find(k.get()); + if (iter == self.end()) key = std::move(k); + } + void operator=(decltype(nullptr)) { + if (key) { + iter = self.emplace(key.release(), std::unique_ptr()).first; + } else { + assert(iter != self.end()); + iter->second.reset(); + } + } + bool operator=(bool t) { + if (key) { + iter = self.emplace(key.release(), + std::unique_ptr(t ? 
static_cast(new True()) + : static_cast(new False()))) + .first; + } else { + assert(iter != self.end()); + iter->second.reset(t ? static_cast(new True()) + : static_cast(new False())); + } + return t; + } + bool operator=(void *); // not defined to avoid converting pointers to bool + bool operator==(string &str) { + if (key) return false; + assert(iter != self.end()); + return *iter->second == str; + } + bool operator!=(string &str) { return !(*this == str); } + bool operator==(const std::string &str) { + if (key) return false; + assert(iter != self.end()); + return *iter->second == str; + } + bool operator!=(const std::string &str) { return !(*this == str); } + bool operator==(int64_t v) { + if (key) return false; + assert(iter != self.end()); + return *iter->second == v; + } + bool operator!=(int64_t v) { return !(*this == v); } + const char *operator=(const char *v) { + if (key) { + iter = self.emplace(key.release(), std::unique_ptr(new string(v))).first; + } else { + assert(iter != self.end()); + iter->second.reset(new string(v)); + } + return v; + } + const std::string &operator=(const std::string &v) { + if (key) { + iter = self.emplace(key.release(), std::unique_ptr(new string(v))).first; + } else { + assert(iter != self.end()); + iter->second.reset(new string(v)); + } + return v; + } + int64_t operator=(int64_t v) { + if (key) { + iter = self.emplace(key.release(), std::unique_ptr(new number(v))).first; + } else { + assert(iter != self.end()); + iter->second.reset(new number(v)); + } + return v; + } + int operator=(int v) { return static_cast(*this = static_cast(v)); } + unsigned int operator=(unsigned int v) { return (unsigned int)(*this = (int64_t)v); } +#if defined(__clang__) && defined(__APPLE__) + // Clang ang gcc on Mac OS can't agree whether size_t overloads uint64_t or unsigned long + // or the overload is not defined! 
+ size_t operator=(size_t v) { return (size_t)(*this = (int64_t)v); } +#endif + uint64_t operator=(uint64_t v) { return (uint64_t)(*this = (int64_t)v); } + vector &operator=(vector &&v) { + if (key) { + iter = self.emplace(key.release(), mkuniq(std::move(v))).first; + } else { + assert(iter != self.end()); + iter->second = mkuniq(std::move(v)); + } + return dynamic_cast(*iter->second); + } + map &operator=(map &&v) { + if (key) { + iter = self.emplace(key.release(), mkuniq(std::move(v))).first; + } else { + assert(iter != self.end()); + iter->second = mkuniq(std::move(v)); + } + return dynamic_cast(*iter->second); + } + const std::unique_ptr &operator=(std::unique_ptr &&v) { + if (key) { + iter = self.emplace(key.release(), std::move(v)).first; + } else { + assert(iter != self.end()); + iter->second = std::move(v); + } + return iter->second; + } + obj &operator*() { + assert(!key && iter != self.end()); + return *iter->second; + } + explicit operator bool() const { return !key; } + obj *get() const { return key ? 0 : iter->second.get(); } + obj *operator->() const { return key ? 
0 : iter->second.get(); } + operator vector &() { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + return dynamic_cast(*iter->second); + } + operator map &() { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + return dynamic_cast(*iter->second); + } + element_ref operator[](const char *str) { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + map *m = dynamic_cast(iter->second.get()); + if (!m) throw std::runtime_error("lookup in non-map json object"); + return element_ref(*m, str); + } + element_ref operator[](const std::string &str) { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + map *m = dynamic_cast(iter->second.get()); + if (!m) throw std::runtime_error("lookup in non-map json object"); + return element_ref(*m, str.c_str()); + } + element_ref operator[](int64_t n) { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + map *m = dynamic_cast(iter->second.get()); + if (!m) throw std::runtime_error("lookup in non-map json object"); + return element_ref(*m, n); + } + element_ref operator[](std::unique_ptr &&i) { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + map *m = dynamic_cast(iter->second.get()); + if (!m) throw std::runtime_error("lookup in non-map json object"); + return element_ref(*m, std::move(i)); + } + template + void push_back(T &&v) { + vector &vec = *this; + vec.push_back(std::forward(v)); + } + template + bool is() const { + return !key && dynamic_cast(iter->second.get()) != nullptr; + } + template + T &to() { + if (key) iter = self.emplace(key.release(), mkuniq()).first; + return dynamic_cast(*iter->second); + } + }; + friend std::ostream &operator<<(std::ostream &out, const element_ref &el); + + public: + element_ref operator[](const char *str) { return element_ref(*this, str); } + element_ref operator[](const std::string &str) { return element_ref(*this, str.c_str()); } + element_ref operator[](int64_t n) { return element_ref(*this, n); } + element_ref 
operator[](std::unique_ptr &&i) { return element_ref(*this, std::move(i)); } + using map_base::erase; + map_base::size_type erase(const char *str) { + string tmp(str); + return map_base::erase(&tmp); + } + map_base::size_type erase(int64_t n) { + number tmp(n); + return map_base::erase(&tmp); + } + map *as_map() override { return this; } + const map *as_map() const override { return this; } + std::unique_ptr copy() && override { return mkuniq(std::move(*this)); } + std::unique_ptr clone() const override { + map *m = new map(); + for (auto &e : *this) + m->emplace(e.first ? e.first->clone().release() : nullptr, clone_ptr(e.second)); + return std::unique_ptr(m); + } + + /// Merges the given map into this one and returns this map. For any key collisions, if both + /// have a map, then they are merged recursively; if both have a vector, then the one in the + /// given map is appended to the one in this map; otherwise, the entry in the given map + /// replaces the entry in this one. + map &merge(const map &a); +}; + +inline void vector::push_back(map &&m) { emplace_back(mkuniq(std::move(m))); } + +std::istream &operator>>(std::istream &in, std::unique_ptr &json); +inline std::istream &operator>>(std::istream &in, obj *&json) { + std::unique_ptr p; + in >> p; + if (in) json = p.release(); + return in; +} + +inline std::ostream &operator<<(std::ostream &out, const obj *json) { + json->print_on(out); + return out; +} +inline std::ostream &operator<<(std::ostream &out, const std::unique_ptr &json) { + return out << json.get(); +} +inline std::ostream &operator<<(std::ostream &out, const map::element_ref &el) { + el->print_on(out); + return out; +} + +} // end namespace json + +extern void dump(const json::obj *); +extern void dump(const json::obj &); + +#endif /* BACKENDS_TOFINO_BF_ASM_JSON_H_ */ diff --git a/backends/tofino/bf-asm/json_diff.cpp b/backends/tofino/bf-asm/json_diff.cpp new file mode 100644 index 00000000000..27ff15e6219 --- /dev/null +++ 
b/backends/tofino/bf-asm/json_diff.cpp @@ -0,0 +1,628 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include "backends/tofino/bf-asm/json.h" +#include "fdstream.h" +#include "lib/ordered_map.h" + +static bool show_deletion = true; +static bool show_addition = true; +static bool sort_map = true; +static std::vector list_map_keys; +static std::set ignore_keys; +static std::map> ignore_key_indexes; +static std::vector> ignore_intkeys; + +bool is_list_map(json::vector *v, const char *key) { + if (!key) return false; + for (auto &e : *v) + if (json::map *m = dynamic_cast(e.get())) { + if (!m->count(key)) return false; + } else + return false; + return true; +} + +void add_ignore(const char *a) { + while (isspace(*a)) a++; + if (*a == '#' || *a == 0) return; + if (*a == '&' || *a == '=' || *a == '|' || isdigit(*a)) { + int64_t mask, val; + int end = 0; + if (sscanf(a, "%" PRIi64 " %n", &val, &end) >= 1) + ignore_intkeys.emplace_back(-1, val); + else if (sscanf(a, "== %" PRIi64 " %n", &val, &end) >= 1) + ignore_intkeys.emplace_back(-1, val); + else if (sscanf(a, "& %" PRIi64 " == %" PRIi64 " %n", &mask, &val, &end) >= 2) + ignore_intkeys.emplace_back(mask, val); + else if (sscanf(a, "| %" PRIi64 " == %" PRIi64 " %n", &mask, &val, &end) >= 2) + ignore_intkeys.emplace_back(~mask, val ^ mask); + else { + std::cerr << "Unknown 
ignore expression " << a << std::endl; + return; + } + if (a[end]) std::cerr << "extra text after ignore " << (a + end) << std::endl; + return; + } + if (auto *idx = strchr(a, '[')) { + int64_t val; + int end = 0; + if (sscanf(idx, "[%" PRIi64 " ] %n", &val, &end) >= 1 && end > 0) { + end += idx - a; + while (idx > a && isspace(idx[-1])) --idx; + std::string key(a, idx - a); + ignore_key_indexes[key].insert(val); + } else { + std::cerr << "Unknown ignore expression " << a << std::endl; + return; + } + if (a[end]) std::cerr << "extra text after ignore " << (a + end) << std::endl; + return; + } + ignore_keys.insert(a); +} +bool ignore(json::obj *o) { + if (json::string *s = dynamic_cast(o)) { + if (ignore_keys.count(*s)) return true; + } else if (json::number *n = dynamic_cast(o)) { + for (auto &k : ignore_intkeys) + if ((n->val & k.first) == k.second) return true; + } + return false; +} +bool ignore(std::unique_ptr &o) { return ignore(o.get()); } + +const std::set &ignore_indexes_for_key(json::obj *key) { + if (key && key->as_string() && ignore_key_indexes.count(*key->as_string())) + return ignore_key_indexes.at(*key->as_string()); + static std::set empty; + return empty; +} + +std::map build_list_map(json::vector *v, + const char *key) { + std::map rv; + assert(key); + for (auto &e : *v) { + json::map *m = dynamic_cast(e.get()); + assert(m); + rv[(*m)[key].get()] = m; + } + return rv; +} + +void do_prefix(int indent, const char *prefix) { + std::cout << '\n' << prefix; + if (indent) std::cout << std::setw(indent) << ' ' << std::setw(0); +} + +void do_output(json::obj *o, int indent, const char *prefix, const char *suffix = "") { + do_prefix(indent, prefix); + if (o) + o->print_on(std::cout, indent, 80 - indent, prefix); + else + std::cout << "null"; + std::cout << suffix; +} + +void do_output(int index, json::vector::iterator p, int indent, const char *prefix) { + do_prefix(indent, prefix); + std::cout << '[' << index << "] "; + if (*p) + (*p)->print_on(std::cout, 
indent, 80 - indent, prefix); + else + std::cout << "null"; +} + +void do_output(json::map::iterator p, int indent, const char *prefix) { + do_prefix(indent, prefix); + p->first->print_on(std::cout, indent, 80 - indent, prefix); + std::cout << ": "; + if (p->second) + p->second->print_on(std::cout, indent, 80 - indent, prefix); + else + std::cout << "null"; +} + +void do_output(std::map::iterator p, int indent, + const char *prefix) { + do_prefix(indent, prefix); + p->first->print_on(std::cout, indent, 80 - indent, prefix); + std::cout << ": "; + if (p->second) + p->second->print_on(std::cout, indent, 80 - indent, prefix); + else + std::cout << "null"; +} + +void do_output(std::map::iterator p, int indent, + const char *prefix) { + do_prefix(indent, prefix); + p->first->print_on(std::cout, indent, 80 - indent, prefix); + std::cout << ": "; + if (p->second) + p->second->print_on(std::cout, indent, 80 - indent, prefix); + else + std::cout << "null"; +} + +bool equiv(json::obj *a, json::obj *b, json::obj *key = nullptr); +bool equiv(std::unique_ptr &a, json::obj *b, json::obj *key = nullptr) { + return equiv(a.get(), b, key); +} +bool equiv(std::unique_ptr &a, std::unique_ptr &b, json::obj *key = nullptr) { + return equiv(a.get(), b.get(), key); +} +void print_diff(json::obj *a, json::obj *b, int indent, json::obj *key = nullptr); +void print_diff(std::unique_ptr &a, std::unique_ptr &b, int indent, + json::obj *key = nullptr) { + print_diff(a.get(), b.get(), indent, key); +} + +json::vector::iterator find(json::vector::iterator p, json::vector::iterator end, json::obj *m) { + while (p < end && !equiv(*p, m)) ++p; + return p; +} + +bool list_map_equiv(json::vector *a, json::vector *b, const char *key) { + auto bmap = build_list_map(b, key); + for (auto &e : *a) { + json::map *m = dynamic_cast(e.get()); + json::obj *ekey = (*m)[key].get(); + if (!bmap.count(ekey)) { + if (show_deletion && !ignore(ekey)) return false; + continue; + } + if (!ignore(ekey) && !equiv(m, 
bmap[ekey], ekey)) return false; + bmap.erase(ekey); + } + if (show_addition) + for (auto &e : bmap) + if (!ignore(e.first)) return false; + return true; +} +void list_map_print_diff(json::vector *a, json::vector *b, int indent, const char *key) { + auto amap = build_list_map(a, key); + auto bmap = build_list_map(b, key); + auto p1 = amap.begin(), p2 = bmap.begin(); + std::cout << " ["; + indent += 2; + while (p1 != amap.end() && p2 != bmap.end()) { + if (*p1->first < *p2->first) { + if (show_deletion && !ignore(p1->first)) do_output(p1, indent, "-"); + p1++; + continue; + } + if (*p2->first < *p1->first) { + if (show_addition && !ignore(p2->first)) do_output(p2, indent, "+"); + p2++; + continue; + } + if (!ignore(p1->first) && !equiv(p1->second, p2->second, p1->first)) { + int width = 80 - indent, copy; + if (p1->first->test_width(width) && (copy = width) && p1->second && + p1->second->test_width(width) && p2->second && p2->second->test_width(copy)) { + do_output(p1->first, indent, "-", ": "); + std::cout << p1->second; + do_output(p2->first, indent, "+", ": "); + std::cout << p2->second; + } else { + do_output(p1->first, indent, " ", ":"); + print_diff(p1->second, p2->second, indent, p1->first); + } + } + p1++; + p2++; + } + if (show_deletion) + while (p1 != amap.end()) { + if (!ignore(p1->first)) do_output(p1, indent, "-"); + p1++; + } + if (show_addition) + while (p2 != bmap.end()) { + if (!ignore(p2->first)) do_output(p2, indent, "+"); + p2++; + } + indent -= 2; + do_prefix(indent, " "); + std::cout << ']'; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpotentially-evaluated-expression" +bool equiv(json::vector *a, json::vector *b, const std::set &ignore_idx) { + for (auto key : list_map_keys) + if (is_list_map(a, key) && is_list_map(b, key)) return list_map_equiv(a, b, key); + auto p1 = a->begin(), p2 = b->begin(); + while (p1 != a->end() && p2 != b->end()) { + if (!ignore_idx.count(p1 - a->begin()) && !equiv(*p1, *p2)) { + auto s1 = 
find(p1, a->end(), p2->get()); + auto s2 = find(p2, b->end(), p1->get()); + if (typeid(**p1) == typeid(**p2) && p1 - a->begin() == p2 - b->begin() && + (s1 - p1 == s2 - p2 || typeid(**p1) == typeid(json::vector) || + typeid(**p1) == typeid(json::map))) + return false; + if (s1 - p1 <= s2 - p2) { + if (show_deletion) return false; + ++p1; + } else { + if (show_addition) return false; + ++p2; + } + } else { + ++p1; + ++p2; + } + } + while (p1 != a->end() && ignore_idx.count(p1 - a->begin())) ++p1; + if (p1 != a->end() && show_deletion) return false; + while (p2 != b->end() && ignore_idx.count(p2 - b->begin())) ++p2; + if (p2 != b->end() && show_addition) return false; + return true; +} +void print_diff(json::vector *a, json::vector *b, const std::set &ignore_idx, int indent) { + for (auto key : list_map_keys) + if (is_list_map(a, key) && is_list_map(b, key)) { + list_map_print_diff(a, b, indent, key); + return; + } + auto p1 = a->begin(), p2 = b->begin(); + std::cout << " ["; + indent += 2; + while (p1 != a->end() && p2 != b->end()) { + if (!ignore_idx.count(p1 - a->begin()) && !equiv(*p1, *p2)) { + auto s1 = find(p1, a->end(), p2->get()); + auto s2 = find(p2, b->end(), p1->get()); + if ((p1 + 1 != a->end() && p2 + 1 != b->end() && equiv(p1[1], p2[1])) || + (typeid(**p1) == typeid(**p2) && p1 - a->begin() == p2 - b->begin() && + (s1 - p1 == s2 - p2 || typeid(**p1) == typeid(json::vector) || + typeid(**p1) == typeid(json::map)))) { + do_prefix(indent, " "); + std::cout << '[' << p1 - a->begin() << "]"; + print_diff(p1->get(), p2->get(), indent); + } else { + if (s1 - p1 <= s2 - p2) { + if (show_deletion) do_output(p1 - a->begin(), p1, indent, "-"); + ++p1; + } else { + if (show_addition) do_output(p2 - b->begin(), p2, indent, "+"); + ++p2; + } + continue; + } + } + + ++p1; + ++p2; + } + if (show_deletion) + while (p1 != a->end()) { + if (!ignore_idx.count(p1 - a->begin())) do_output(p1 - a->begin(), p1, indent, "-"); + ++p1; + } + if (show_addition) + while (p2 != 
b->end()) { + if (!ignore_idx.count(p2 - b->begin())) do_output(p2 - b->begin(), p2, indent, "+"); + ++p2; + } + indent -= 2; + do_prefix(indent, " "); + std::cout << ']'; +} +#pragma clang diagnostic pop + +std::map build_sort_map(json::map *m) { + std::map rv; + for (auto &e : *m) { + rv[e.first] = e.second.get(); + } + return rv; +} +bool sort_map_equiv(json::map *a, json::map *b) { + auto bmap = build_sort_map(b); + for (auto &e : *a) { + json::obj *ekey = e.first; + if (!bmap.count(ekey)) { + if (show_deletion && !ignore(ekey)) return false; + continue; + } + if (!ignore(ekey) && !equiv(e.second.get(), bmap[ekey], ekey)) return false; + bmap.erase(ekey); + } + if (show_addition) + for (auto &e : bmap) + if (!ignore(e.first)) return false; + return true; +} +void sort_map_print_diff(json::map *a, json::map *b, int indent) { + auto amap = build_sort_map(a); + auto bmap = build_sort_map(b); + auto p1 = amap.begin(), p2 = bmap.begin(); + std::cout << " {"; + indent += 2; + while (p1 != amap.end() && p2 != bmap.end()) { + if (*p1->first < *p2->first) { + if (show_deletion && !ignore(p1->first) && p1->second) do_output(p1, indent, "-"); + p1++; + continue; + } + if (*p2->first < *p1->first) { + if (show_addition && !ignore(p2->first) && p2->second) do_output(p2, indent, "+"); + p2++; + continue; + } + if (!ignore(p1->first) && !equiv(p1->second, p2->second, p1->first)) { + int width = 80 - indent, copy; + if (p1->first->test_width(width) && (copy = width) && p1->second && + p1->second->test_width(width) && p2->second && p2->second->test_width(copy)) { + do_output(p1->first, indent, "-", ": "); + std::cout << p1->second; + do_output(p2->first, indent, "+", ": "); + std::cout << p2->second; + } else { + do_output(p1->first, indent, " ", ":"); + print_diff(p1->second, p2->second, indent, p1->first); + } + } + p1++; + p2++; + } + if (show_deletion) + while (p1 != amap.end()) { + if (!ignore(p1->first)) do_output(p1, indent, "-"); + p1++; + } + if (show_addition) + while 
(p2 != bmap.end()) { + if (!ignore(p2->first)) do_output(p2, indent, "+"); + p2++; + } + indent -= 2; + do_prefix(indent, " "); + std::cout << '}'; +} + +bool equiv(json::map *a, json::map *b) { + if (sort_map) return sort_map_equiv(a, b); + auto p1 = a->begin(), p2 = b->begin(); + while (p1 != a->end() && p2 != b->end()) { + if (*p1->first < *p2->first) { + if (show_deletion && !ignore(p1->first)) return false; + ++p1; + } else if (*p2->first < *p1->first) { + if (show_addition && !ignore(p2->first)) return false; + ++p2; + } else if (!ignore(p1->first) && !(equiv(p1->second, p2->second, p1->first))) { + return false; + } else { + ++p1; + ++p2; + } + } + if (show_deletion) + for (; p1 != a->end(); ++p1) + if (!ignore(p1->first)) return false; + if (show_addition) + for (; p2 != b->end(); ++p2) + if (!ignore(p2->first)) return false; + return true; +} +void print_diff(json::map *a, json::map *b, int indent) { + if (sort_map) { + sort_map_print_diff(a, b, indent); + return; + } + auto p1 = a->begin(), p2 = b->begin(); + std::cout << " {"; + indent += 2; + while (p1 != a->end() && p2 != b->end()) { + if (*p1->first < *p2->first) { + if (show_deletion && !ignore(p1->first)) do_output(p1, indent, "-"); + p1++; + continue; + } + if (*p2->first < *p1->first) { + if (show_addition && !ignore(p2->first)) do_output(p2, indent, "+"); + p2++; + continue; + } + if (!ignore(p1->first) && !equiv(p1->second, p2->second, p1->first)) { + int width = 80 - indent, copy; + if (p1->first->test_width(width) && (copy = width) && p1->second && + p1->second->test_width(width) && p2->second && p2->second->test_width(copy)) { + do_output(p1->first, indent, "-", ": "); + std::cout << p1->second; + do_output(p2->first, indent, "+", ": "); + std::cout << p2->second; + } else { + do_output(p1->first, indent, " ", ":"); + print_diff(p1->second, p2->second, indent, p1->first); + } + } + p1++; + p2++; + } + if (show_deletion) + for (; p1 != a->end(); ++p1) + if (!ignore(p1->first)) do_output(p1, 
indent, "-"); + if (show_addition) + for (; p2 != b->end(); ++p2) + if (!ignore(p2->first)) do_output(p2, indent, "+"); + indent -= 2; + do_prefix(indent, " "); + std::cout << '}'; +} + +bool equiv(json::obj *a, json::obj *b, json::obj *key) { + if (a == b) return true; + // Check true for map/vector with nullptr v/s with no elements "{}" + if (!a) { + if (auto m = b->as_map()) { + if (m->empty()) return true; + } + if (auto v = b->as_vector()) { + if (v->empty()) return true; + } + } + if (!b) { + if (auto m = a->as_map()) { + if (m->empty()) return true; + } + if (auto v = a->as_vector()) { + if (v->empty()) return true; + } + } + if (!a || !b) return false; + if (typeid(*a) != typeid(*b)) return false; + if (typeid(*a) == typeid(json::vector)) + return equiv(static_cast(a), static_cast(b), + ignore_indexes_for_key(key)); + if (typeid(*a) == typeid(json::map)) + return equiv(static_cast(a), static_cast(b)); + return *a == *b; +} +void print_diff(json::obj *a, json::obj *b, int indent, json::obj *key) { + if (equiv(a, b)) + return; + else if (!a) { + if (show_deletion) do_output(b, indent, "+"); + return; + } else if (!b) { + if (show_addition) do_output(a, indent, "-"); + return; + } else if (typeid(*a) == typeid(*b)) { + if (typeid(*a) == typeid(json::vector)) { + print_diff(static_cast(a), static_cast(b), + ignore_indexes_for_key(key), indent); + return; + } else if (typeid(*a) == typeid(json::map)) { + print_diff(static_cast(a), static_cast(b), indent); + return; + } + } + do_output(a, indent, "-"); + do_output(b, indent, "+"); +} + +int do_diff(const char *a_name, json::obj *a, const char *b_name, json::obj *b) { + if (equiv(a, b)) return 0; + std::cout << "--- " << a_name << std::endl; + std::cout << "+++ " << b_name << std::endl; + print_diff(a, b, 0); + std::cout << std::endl; + return 1; +} +int do_diff(const char *a_name, std::unique_ptr &a, const char *b_name, + std::unique_ptr &b) { + return do_diff(a_name, a.get(), b_name, b.get()); +} + +int main(int 
ac, char **av) { + int error = 0; + std::unique_ptr file1; + const char *file1_name = 0; + for (int i = 1; i < ac; i++) + if (av[i][0] == '-' && av[i][1] == 0) { + if (file1) { + std::unique_ptr file2; + if (!(std::cin >> file2) || !file2) { + std::cerr << "Failed reading json from stdin" << std::endl; + error = 2; + } else if (!(error & 2)) + error |= do_diff(file1_name, file1, "", file2); + } else if (!(std::cin >> file1) || !file1) { + std::cerr << "Failed reading json from stdin" << std::endl; + error = 2; + } else + file1_name = ""; + } else if (av[i][0] == '-' || av[i][0] == '+') { + bool flag = av[i][0] == '+'; + for (char *arg = av[i] + 1; *arg;) switch (*arg++) { + case 'a': + show_addition = flag; + break; + case 'd': + show_deletion = flag; + break; + case 'i': + if (*av[++i] == '@') { + std::ifstream file(av[i] + 1); + std::string str; + if (!file) + std::cerr << "Can't read " << av[i] + 1 << std::endl; + else + while (getline(file, str)) add_ignore(str.c_str()); + } else + add_ignore(av[i]); + break; + case 'l': + list_map_keys.push_back(av[++i]); + break; + case 's': + sort_map = flag; + break; + default: + std::cerr << "Unknown option " << (flag ? 
'+' : '-') << arg[-1] + << std::endl; + error = 2; + } + } else { + std::istream *in = nullptr; + if (auto ext = strrchr(av[i], '.')) { + std::string cmd; + if (!strcmp(ext, ".gz") || !strcmp(ext, ".Z")) + cmd = "zcat "; + else if (!strcmp(ext, ".bz") || !strcmp(ext, ".bz2")) + cmd = "bzcat "; + if (!cmd.empty()) { + cmd += av[i]; + cmd = "2>/dev/null; " + cmd; // ignore errors (Broken Pipe in particular) + auto *pipe = popen(cmd.c_str(), "r"); + if (pipe) { + auto *pstream = new fdstream(fileno(pipe)); + pstream->setclose([pipe]() { pclose(pipe); }); + in = pstream; + } + } + } + if (!in) in = new std::ifstream(av[i]); + if (!in || !*in) { + std::cerr << "Can't open " << av[i] << " for reading" << std::endl; + error = 2; + } else if (file1) { + std::unique_ptr file2; + if (!(*in >> file2) || !file2) { + std::cerr << "Failed reading json from " << av[i] << std::endl; + error = 2; + } else if (!(error & 2)) + error |= do_diff(file1_name, file1, av[i], file2); + } else if (!(*in >> file1) || !file1) { + std::cerr << "Failed reading json from " << av[i] << std::endl; + error = 2; + } else + file1_name = av[i]; + delete in; + } + if (error & 2) std::cerr << "usage: " << av[0] << " [-adi:l:] file1 file2" << std::endl; + return error; +} diff --git a/backends/tofino/bf-asm/lex-yaml.l b/backends/tofino/bf-asm/lex-yaml.l new file mode 100644 index 00000000000..c28facd913e --- /dev/null +++ b/backends/tofino/bf-asm/lex-yaml.l @@ -0,0 +1,283 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + + +%x LINESTART LINE1 LINE2 LINE3 COMMENT +%s NORMAL + +%{ +#include +#include +#include +static std::stack indent; +static int parens=0; +static int indent_depth(const char *); +static int parse_num(YYSTYPE *, const char *s, int base); +static int parse_match(YYSTYPE *, const char *s, int bits_per_digit); + +#if YYDEBUG +#undef BEGIN +/* DANGER -- the depends on the internals of how flex sets states, but as + * DANGER -- its only for debugging, its not too bad */ +#define BEGIN(S) ((yy_start) = 1 + 2*(S), \ + yydebug ? fprintf(stderr, "Setting lexer state "#S"\n") : 0) +#define DB(...) fprintf(stderr, __VA_ARGS__) +#else +#define DB(...) +#endif + +#pragma clang diagnostic ignored "-Wnull-conversion" + +%} + +ID [A-Za-z_@$]([-.]?*[A-Za-z0-9_@$])* +STR \"(\\.|[^\n"\\])*\" +%option nounput noyywrap + +%% + +.*|\n { yyless(0); BEGIN(LINESTART); indent.push(0); } +[ \t]* { int depth = indent_depth(yytext); + if (depth < indent.top()) { + indent.pop(); + yyless(0); + return UNINDENT; } + BEGIN(NORMAL); + if (depth > indent.top()) { + indent.push(depth); + return INDENT; } } + +"#line" { BEGIN(LINE1); } +"# " { BEGIN(LINE1); } +[0-9]+ { line_file_map[lineno].second = atoi(yytext)-1; + DB("next line is %s\n", yytext); + BEGIN(LINE2); } +\"[^"]* { line_file_map[lineno].first = yytext+1; + DB("file is '%s'\n", yytext+1); + BEGIN(LINE3); } +[ \t] ; +. { BEGIN(LINE3); } +. ; +\n { lineno++; BEGIN(LINESTART); } +<> { BEGIN(LINESTART); } + +[ \t]*"#" { BEGIN(LINE3); } +. 
{ yyless(0); + if (indent.top() > 0) { + indent.pop(); + return UNINDENT; } + BEGIN(NORMAL); } +<> { if (indent.top() > 0) { + indent.pop(); + return UNINDENT; } + BEGIN(NORMAL); } +[ \t\r]*\n { lineno++; } + +[[({] { parens++; return *yytext; } +[])}] { if (--parens < 0) parens = 0; + return *yytext; } +\n { lineno++; + if (parens == 0) { + BEGIN(LINESTART); + return '\n'; } } +[ \t\r]+ ; +".." { return DOTDOT; } +{ID} { yylval.str = strdup(yytext); return ID; } +{STR} { yylval.str = strndup(yytext+1, yyleng-2); return STR; } +[0-9_]+ { return parse_num(&yylval, yytext, 10); } +0[xX][0-9a-fA-F_]+ { return parse_num(&yylval, yytext+2, 16); } +0[oO][0-7_]+ { return parse_num(&yylval, yytext+2, 8); } +0[bB][0-1_]+ { return parse_num(&yylval, yytext+2, 2); } +0[xX][0-9a-fA-F*_]+ { return parse_match(&yylval, yytext+2, 4); } +0[oO][0-7*_]+ { return parse_match(&yylval, yytext+2, 3); } +0[bB][0-1*_]+ { return parse_match(&yylval, yytext+2, 1); } +"*" { return parse_match(&yylval, yytext, 0); } + +"#".* ; +"/*" { BEGIN(COMMENT); } +"*/" { BEGIN(NORMAL); } +. ; +\n { lineno++; } + +. 
{ return *yytext; }  /* action for the catch-all '.' rule on the previous line */


%%

/* flex's #line generation is broken, so we manually resync so we can debug */
#line 104 "lex-yaml.l"

/* Compute the indentation column represented by the whitespace prefix `pfx`;
 * a tab advances to the next multiple of 8 (`rv &= ~7; rv += 8`).  Stops at
 * the first non-whitespace character. */
int indent_depth(const char *pfx) {
    int rv = 0;
    while (*pfx)
        switch(*pfx++) {
        case ' ': rv++; break;
        case '\t': rv &= ~7; rv += 8; break;
        default:
            return rv;}
    return rv;
}

#include "backends/tofino/bf-asm/gen/uptr_sizes.h"

/* Multiply a little-endian bigint (vector of uintptr_t words, least
 * significant word first) in place by `f`, propagating the carry, and append
 * a new word if the final carry is nonzero.  Uses a double-width integer
 * type (uint2ptr_t) when the platform provides one, otherwise does the
 * multiply in half-word (uinthptr_t) pieces. */
void bigint_mul(VECTOR(uintptr_t) &val, unsigned f) {
    unsigned carry = 0;
    for (int i = 0; i < val.size; i++) {
#if defined(uint2ptr_t)
        /* double-width path: one widened multiply per word */
        uint2ptr_t v = val.data[i];
        v = v * f + carry;
        val.data[i] = (uintptr_t)v;
        carry = v >> CHAR_BIT * sizeof(uintptr_t);
#elif defined(uinthptr_t)
        /* half-word path: split the word and multiply the halves separately */
        uinthptr_t lo = val.data[i],
                   hi = val.data[i] >> CHAR_BIT * sizeof(uinthptr_t);
        uintptr_t tmp = (uintptr_t)lo * f + carry;
        lo = tmp;
        tmp >>= CHAR_BIT * sizeof(uinthptr_t);
        tmp += (uintptr_t)hi * f;
        carry = tmp >> CHAR_BIT * sizeof(uinthptr_t);
        val.data[i] = (tmp << (CHAR_BIT * sizeof(uinthptr_t))) + lo;
#else
#error "No appropriately sized type for bigint_mul"
#endif
    }
    if (carry)
        VECTOR_add(val, carry);
}

/* Add `a` to the bigint in place.  `(val.data[i] += a) >= a` detects that no
 * unsigned wraparound occurred (the sum is only smaller than the addend on
 * overflow); on overflow, carry 1 into the next word, growing the vector if
 * the carry falls off the end. */
void bigint_add(VECTOR(uintptr_t) &val, unsigned a) {
    for (int i = 0; i < val.size; i++) {
        if ((val.data[i] += a) >= a)
            return;
        a = 1; }
    VECTOR_add(val, a);
}

/* Initialize a bigint from a 64-bit value, splitting it into uintptr_t-sized
 * words on targets where uintptr_t is narrower than int64_t.  The right
 * shift is done in two halves so we never shift by the full width of the
 * type (which would be undefined behavior). */
void bigint_init(VECTOR(uintptr_t) &val, int64_t v) {
    if (sizeof(int64_t)/sizeof(uintptr_t) > 1) {
        VECTOR_init(val, sizeof(int64_t)/sizeof(uintptr_t));
        do {
            val.data[val.size++] = v;
            v >>= CHAR_BIT * sizeof(uintptr_t) / 2;
            v >>= CHAR_BIT * sizeof(uintptr_t) / 2;
        } while (v > 0);
    } else {
        VECTOR_init1(val, v);
    }
}

/* Parse the digit string `s` in the given base into *val.  '_' separators
 * are skipped.  Returns INT with the value in val->i while it fits in 32
 * bits (unsigned); anything larger is promoted to a bigint in val->bigi and
 * BIGINT is returned. */
int parse_num(YYSTYPE *val, const char *s, int base) {
    int rv = INT;
    val->i = 0;
    s--;
    while (*++s) {
        if (*s == '_') continue;
        /* Promote to a bigint before the multiply below could overflow
         * int64_t.  (A second promotion at the bottom of the loop caps INT
         * tokens at 32 bits.) */
        if (rv == INT && val->i > INT64_MAX/base) {
            bigint_init(val->bigi, val->i);
            rv = BIGINT; }
        if (rv == INT)
            val->i *= base;
        else
            bigint_mul(val->bigi, base);
        switch (*s) {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            if (rv == INT)
                val->i += *s - '0';
            else
                bigint_add(val->bigi, *s - '0');
            break;
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
            if (rv == INT)
                val->i += *s - 'a' + 10;
            else
                bigint_add(val->bigi, *s - 'a' + 10);
            break;
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            if (rv == INT)
                val->i += *s - 'A' + 10;
            else
                bigint_add(val->bigi, *s - 'A' + 10);
            break;
        default:
            assert(0); }
        if (rv == INT && val->i > 0xffffffff) {
            // We limit INT tokens to what will fit in 32 bits (unsigned) even though
            // we use a int64_t to hold them, as most parts of the compiler can't deal
            // with larger constants. The few places that can deal with >32bit values
            // handle BIGINTs
            bigint_init(val->bigi, val->i);
            rv = BIGINT; } }
    return rv;
}

/* Parse a ternary match constant whose digits may include '*' ("don't
 * care").  bits_per_digit is 4/3/1 for hex/octal/binary (0 for a bare '*'
 * token, which matches anything).  Encoding: each 0 bit sets the
 * corresponding bit of word0, each 1 bit sets word1, and a '*' digit sets
 * both.  Returns MATCH with the value in val->match, or BIGMATCH with the
 * word list in val->bigm once the constant no longer fits in one 64-bit
 * word pair. */
int parse_match(YYSTYPE *val, const char *s, int bits_per_digit) {
    val->match.word0 = val->match.word1 = 0;
    int rv = MATCH;
    VECTOR(match_t) bigm = EMPTY_VECTOR_INIT;
    if (*s == '*' && bits_per_digit == 0) return rv;
    unsigned digit = 0, digit_mask = (1U << bits_per_digit) - 1;
    /* overflow_mask covers the top digit position of a word: if any of those
     * bits are set, the next shift would lose them */
    decltype(val->match.word0) overflow_mask = digit_mask;
    overflow_mask <<= sizeof(val->match.word0) * 8 - bits_per_digit;
    s--;
    while (*++s) {
        if (*s == '_') continue;
        if (rv == BIGMATCH || ((val->match.word0 | val->match.word1) & overflow_mask)) {
            /* spill into the multiword form once the top digit would shift out */
            rv = BIGMATCH;
            if (bigm.size < 2) {
                VECTOR_resize(bigm, 2);
                bigm.data[0].word0 = bigm.data[0].word1 =
                    bigm.data[1].word0 = bigm.data[1].word1 = 0; }
            if ((bigm.data[bigm.size-1].word0 | bigm.data[bigm.size-1].word1) & overflow_mask) {
                VECTOR_resize(bigm, bigm.size+1);
                bigm.data[bigm.size-1].word0 = bigm.data[bigm.size-1].word1 = 0; }
            for (int i = bigm.size-1; i > 0; --i) {
                /* NOTE(review): this ORs each word's own post-shift bits back
                 * into its low digit; a multiword left shift would normally
                 * carry in the top bits of the next-lower word
                 * (bigm.data[i-1]).  Looks like a bug -- confirm against
                 * upstream before relying on >128-bit match constants. */
                bigm.data[i].word0 <<= bits_per_digit;
                bigm.data[i].word0 |= bigm.data[i].word0 >> (64 - bits_per_digit);
                bigm.data[i].word1 <<= bits_per_digit;
                bigm.data[i].word1 |= bigm.data[i].word1 >> (64 - bits_per_digit); }
            bigm.data[1].word0 |= val->match.word0 >> (64 - bits_per_digit);
            bigm.data[1].word1 |= val->match.word1 >> (64 - bits_per_digit); }
        val->match.word0 <<= bits_per_digit;
        val->match.word1 <<= bits_per_digit;
        switch (*s) {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            digit = *s - '0';
            break;
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
            digit = *s - 'a' + 10;
            break;
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            digit = *s - 'A' + 10;
            break;
        case '*':
            /* don't care: set all mask bits of word1 here; word0 gets them
             * below via digit_mask & ~digit with digit == 0 */
            val->match.word1 |= digit_mask;
            digit = 0;
            break;
        default:
            assert(0); }
        assert((digit & ~digit_mask) == 0);
        val->match.word1 |= digit;
        val->match.word0 |= digit_mask & ~digit; }
    if (rv == BIGMATCH) {
        bigm.data[0] = val->match;
        val->bigm = bigm; }
    return rv;
}
diff --git a/backends/tofino/bf-asm/map.h b/backends/tofino/bf-asm/map.h
new file mode 100644
index 00000000000..fc6e82ba3b5
--- /dev/null
+++ b/backends/tofino/bf-asm/map.h
@@ -0,0 +1,255 @@
/**
 * Copyright (C) 2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_MAP_H_ +#define BACKENDS_TOFINO_BF_ASM_MAP_H_ + +#include + +template +inline V get(const std::map &m, T key, V def = V()) { + auto it = m.find(key); + if (it != m.end()) return it->second; + return def; +} + +template +inline V *getref(std::map &m, T key) { + auto it = m.find(key); + if (it != m.end()) return &it->second; + return 0; +} + +template +inline const V *getref(const std::map &m, T key) { + auto it = m.find(key); + if (it != m.end()) return &it->second; + return 0; +} + +template +inline V get(const std::map *m, T key, V def = V()) { + return m ? get(*m, key, def) : def; +} + +template +inline V *getref(std::map *m, T key) { + return m ? getref(*m, key) : 0; +} + +template +inline const V *getref(const std::map *m, T key) { + return m ? getref(*m, key) : 0; +} + +/* iterate over the keys in a map */ +template +struct IterKeys { + class iterator + : public std::iterator::iterator_category, + typename std::iterator_traits::value_type, + typename std::iterator_traits::difference_type, + typename std::iterator_traits::pointer, + typename std::iterator_traits::reference> { + PairIter it; + + public: + iterator() {} + explicit iterator(PairIter i) : it(i) {} + iterator &operator=(PairIter i) { + it = i; + return *this; + } + iterator &operator++() { + ++it; + return *this; + } + iterator &operator--() { + --it; + return *this; + } + iterator operator++(int) { + auto copy = *this; + ++it; + return copy; + } + iterator operator--(int) { + auto copy = *this; + --it; + return copy; + } + bool operator==(const iterator &i) const { return it == i.it; } + bool operator!=(const iterator &i) const { return it != i.it; } + decltype(*&it->first) operator*() const { return it->first; } + decltype(&it->first) operator->() const { return &it->first; } + } b, e; + + template + IterKeys(U &map) : b(map.begin()), e(map.end()) {} // NOLINT(runtime/explicit) + IterKeys(PairIter b, PairIter e) 
: b(b), e(e) {} + iterator begin() const { return b; } + iterator end() const { return e; } + + protected: + IterKeys() {} +}; + +template +struct IterKeysCopy : IterKeys { + Map m; + explicit IterKeysCopy(Map &&map) : m(std::move(map)) { + // move the map into this object, then setup the iterators + this->b = m.begin(); + this->e = m.end(); + } +}; + +template +IterKeys Keys(Map &m) { + return IterKeys(m); +} + +template +IterKeys Keys(const Map &m) { + return IterKeys(m); +} + +template +IterKeysCopy Keys(Map &&m) { + return IterKeysCopy(std::move(m)); +} + +template +IterKeys Keys(std::pair range) { + return IterKeys(range.first, range.second); +} + +/* iterate over the values in a map */ +template +struct IterValues { + class iterator + : public std::iterator::iterator_category, + typename std::iterator_traits::value_type, + typename std::iterator_traits::difference_type, + typename std::iterator_traits::pointer, + typename std::iterator_traits::reference> { + PairIter it; + + public: + iterator() {} + explicit iterator(PairIter i) : it(i) {} + iterator &operator=(PairIter i) { + it = i; + return *this; + } + iterator &operator++() { + ++it; + return *this; + } + iterator &operator--() { + --it; + return *this; + } + iterator operator++(int) { + auto copy = *this; + ++it; + return copy; + } + iterator operator--(int) { + auto copy = *this; + --it; + return copy; + } + bool operator==(const iterator &i) const { return it == i.it; } + bool operator!=(const iterator &i) const { return it != i.it; } + decltype(*&it->second) operator*() const { return it->second; } + decltype(&it->second) operator->() const { return &it->second; } + } b, e; + + template + IterValues(U &map) : b(map.begin()), e(map.end()) {} // NOLINT(runtime/explicit) + IterValues(PairIter b, PairIter e) : b(b), e(e) {} + iterator begin() const { return b; } + iterator end() const { return e; } + + protected: + IterValues() {} +}; + +template +struct IterValuesCopy : IterValues { + Map m; + explicit 
IterValuesCopy(Map &&map) : m(std::move(map)) { + // move the map into this object, then setup the iterators + this->b = m.begin(); + this->e = m.end(); + } +}; + +template +IterValues Values(Map &m) { + return IterValues(m); +} + +template +IterValues Values(const Map &m) { + return IterValues(m); +} + +template +IterValuesCopy Values(Map &&m) { + return IterValuesCopy(std::move(m)); +} + +template +IterValues Values(std::pair range) { + return IterValues(range.first, range.second); +} + +/* iterate over the values for a single key in a multimap */ +template +class MapForKey { + M ↦ + typename M::key_type key; + class iterator { + const MapForKey &self; + decltype(map.begin()) it; + + public: + iterator(const MapForKey &s, decltype(map.begin()) i) : self(s), it(i) {} + iterator &operator++() { + if (++it != self.map.end() && it->first != self.key) it = self.map.end(); + return *this; + } + bool operator==(const iterator &i) const { return it == i.it; } + bool operator!=(const iterator &i) const { return it != i.it; } + decltype(*&it->second) operator*() const { return it->second; } + decltype(&it->second) operator->() const { return &it->second; } + }; + + public: + MapForKey(M &m, typename M::key_type k) : map(m), key(k) {} + iterator begin() const { return iterator(*this, map.find(key)); } + iterator end() const { return iterator(*this, map.end()); } +}; + +template +MapForKey ValuesForKey(M &m, typename M::key_type k) { + return MapForKey(m, k); +} + +#endif /* BACKENDS_TOFINO_BF_ASM_MAP_H_ */ diff --git a/backends/tofino/bf-asm/mask_counter.h b/backends/tofino/bf-asm/mask_counter.h new file mode 100644 index 00000000000..caa93b1d696 --- /dev/null +++ b/backends/tofino/bf-asm/mask_counter.h @@ -0,0 +1,65 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_MASK_COUNTER_H_ +#define BACKENDS_TOFINO_BF_ASM_MASK_COUNTER_H_ + +#include + +#include "lib/bitvec.h" + +class MaskCounter { + unsigned mask, val; + bool oflo; + + public: + explicit MaskCounter(unsigned m) : mask(m), val(0), oflo(false) {} + explicit operator bool() const { return !oflo; } + operator unsigned() const { return val; } + bool operator==(const MaskCounter &a) const { return val == a.val && oflo == a.oflo; } + MaskCounter &operator++() { + val = ((val | ~mask) + 1) & mask; + if (val == 0) oflo = true; + return *this; + } + MaskCounter operator++(int) { + MaskCounter tmp(*this); + ++*this; + return tmp; + } + MaskCounter &operator--() { + val = (val - 1) & mask; + if (val == mask) oflo = true; + return *this; + } + MaskCounter operator--(int) { + MaskCounter tmp(*this); + --*this; + return tmp; + } + MaskCounter &clear() { + val = 0; + oflo = false; + return *this; + } + MaskCounter &overflow(bool v = true) { + oflo = v; + return *this; + } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_MASK_COUNTER_H_ */ diff --git a/backends/tofino/bf-asm/match_source.h b/backends/tofino/bf-asm/match_source.h new file mode 100644 index 00000000000..0014e0ed38f --- /dev/null +++ b/backends/tofino/bf-asm/match_source.h @@ -0,0 +1,84 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_MATCH_SOURCE_H_ +#define BACKENDS_TOFINO_BF_ASM_MATCH_SOURCE_H_ + +#include +#include + +#include "backends/tofino/bf-asm/asm-types.h" +#include "lib/stringify.h" + +/** + * A source for a match key of a table. The source can either be from the input xbar, or from the + * galois field matrix, as indicated in uArch Section Exact Match Row Vertical/Horizontal (VH) + * Xbars. This class is the parent of HashMatchSource and Phv::Ref. + */ +class MatchSource : public IHasDbPrint { + public: + virtual int fieldlobit() const = 0; + virtual int fieldhibit() const = 0; + virtual unsigned size() const = 0; + virtual int slicelobit() const = 0; + virtual int slicehibit() const = 0; + virtual const char *name() const = 0; + virtual int get_lineno() const = 0; + virtual std::string toString() const = 0; + virtual void dbprint(std::ostream &out) const = 0; +}; + +/** + * The source used by proxy hash tables for their match key. 
+ */ +class HashMatchSource : public MatchSource { + int lo = 0; + int hi = 0; + + public: + int lineno = 0; + HashMatchSource(int line, int l, int h) : lo(l), hi(h), lineno(line) {} + explicit HashMatchSource(value_t value) { + if (CHECKTYPE(value, tCMD)) { + lineno = value.lineno; + if (value != "hash_group") + error(value.lineno, "Hash Match source must come from a hash group"); + if (value.vec.size != 2) error(value.lineno, "Hash Match source requires a range"); + if (CHECKTYPE(value.vec[1], tRANGE)) { + lo = value.vec[1].range.lo; + hi = value.vec[1].range.hi; + } + } + } + + int get_lineno() const override { return lineno; } + int fieldlobit() const override { return lo < 0 ? 0 : lo; } + int fieldhibit() const override { return hi < 0 ? 0 : hi; } + unsigned size() const override { return hi >= lo && lo >= 0 ? hi - lo + 1 : 0; } + int slicelobit() const override { return fieldlobit(); } + int slicehibit() const override { return fieldhibit(); } + const char *name() const override { return "hash_group"; } + std::string toString() const override { + std::stringstream str; + str << *this; + return str.str(); + } + + void dbprint(std::ostream &out) const { out << name() << "(" << lo << ".." << hi << ")"; } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_MATCH_SOURCE_H_ */ diff --git a/backends/tofino/bf-asm/match_table.cpp b/backends/tofino/bf-asm/match_table.cpp new file mode 100644 index 00000000000..7a602c4728d --- /dev/null +++ b/backends/tofino/bf-asm/match_table.cpp @@ -0,0 +1,700 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "action_bus.h" +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" +#include "misc.h" + +Table::Format *MatchTable::get_format() const { + if (!format && gateway) return gateway->get_format(); + return format.get(); +} + +Table::Format::Field *MatchTable::lookup_field(const std::string &n, const std::string &act) const { + auto *rv = format ? format->field(n) : nullptr; + if (!rv && gateway) rv = gateway->lookup_field(n, act); + return rv; +} + +void MatchTable::common_init_setup(const VECTOR(pair_t) & data, bool ternary, + P4Table::type p4type) { + Table::common_init_setup(data, ternary, p4type); + setup_logical_id(); + if (Target::DYNAMIC_CONFIG()) + if (auto *dconfig = get(data, "dynamic_config")) + if (CHECKTYPESIZE(*dconfig, tMAP)) + for (auto &kv : dconfig->map) dynamic_config.emplace_back(this, kv); + for (auto &kv : data) + if (kv.key == "input_xbar" && CHECKTYPESIZE(kv.value, tMAP)) + input_xbar.emplace_back(InputXbar::create(this, ternary, kv.key, kv.value.map)); +} + +bool MatchTable::common_setup(pair_t &kv, const VECTOR(pair_t) & data, P4Table::type p4type) { + if (Table::common_setup(kv, data, p4type)) { + return true; + } + if (kv.key == "input_xbar" || kv.key == "hash_dist") { + /* done in common_init_setup */ + return true; + } + if (kv.key == "dynamic_config" && Target::DYNAMIC_CONFIG()) { + /* done in 
common_init_setup */ + return true; + } + if (kv.key == "always_run") { + if ((always_run = get_bool(kv.value)) && !Target::SUPPORT_ALWAYS_RUN()) + error(kv.key.lineno, "always_run not supported on %s", Target::name()); + return true; + } + if (kv.key == "gateway") { + if (CHECKTYPE(kv.value, tMAP)) { + gateway = GatewayTable::create(kv.key.lineno, name_ + " gateway", gress, stage, -1, + kv.value.map); + gateway->set_match_table(this, false); + } + return true; + } + if (kv.key == "idletime") { + if (CHECKTYPE(kv.value, tMAP)) { + idletime = IdletimeTable::create(kv.key.lineno, name_ + " idletime", gress, stage, -1, + kv.value.map); + idletime->set_match_table(this, false); + } + return true; + } + if (kv.key == "selector") { + attached.selector.setup(kv.value, this); + return true; + } + if (kv.key == "selector_length") { + attached.selector_length.setup(kv.value, this); + return true; + } + if (kv.key == "meter_color") { + attached.meter_color.setup(kv.value, this); + return true; + } + if (kv.key == "stats") { + if (kv.value.type == tVEC) + for (auto &v : kv.value.vec) attached.stats.emplace_back(v, this); + else + attached.stats.emplace_back(kv.value, this); + return true; + } + if (kv.key == "meter") { + if (kv.value.type == tVEC) + for (auto &v : kv.value.vec) attached.meters.emplace_back(v, this); + else + attached.meters.emplace_back(kv.value, this); + return true; + } + if (kv.key == "stateful") { + if (kv.value.type == tVEC) + for (auto &v : kv.value.vec) attached.statefuls.emplace_back(v, this); + else + attached.statefuls.emplace_back(kv.value, this); + return true; + } + if (kv.key == "table_counter") { + if (kv.value == "table_miss") + table_counter = TABLE_MISS; + else if (kv.value == "table_hit") + table_counter = TABLE_HIT; + else if (kv.value == "gateway_miss") + table_counter = GATEWAY_MISS; + else if (kv.value == "gateway_hit") + table_counter = GATEWAY_HIT; + else if (kv.value == "gateway_inhibit") + table_counter = GATEWAY_INHIBIT; + else if 
(kv.value == "disabled") + table_counter = DISABLED; + else + error(kv.value.lineno, "Invalid table counter %s", value_desc(kv.value)); + return true; + } + return false; +} + +bool MatchTable::is_attached(const Table *tbl) const { + return tbl && (tbl == gateway || tbl == idletime || get_attached()->is_attached(tbl)); +} + +bitvec MatchTable::compute_reachable_tables() { + Table::compute_reachable_tables(); + if (gateway) reachable_tables_ |= gateway->reachable_tables(); + if (idletime) reachable_tables_ |= idletime->reachable_tables(); + reachable_tables_ |= attached.compute_reachable_tables(); + return reachable_tables_; +} + +/** + * Return the first default found meter type of a stateful/meter call. If the meter type + * is considered to be default, then all of the meter types would be identical + */ +METER_ACCESS_TYPE MatchTable::default_meter_access_type(bool for_stateful) { + METER_ACCESS_TYPE rv = NOP; + auto actions = get_actions(); + if (actions == nullptr) return rv; + for (auto it = actions->begin(); it != actions->end(); it++) { + if (it->default_only) continue; + for (auto &call : it->attached) { + auto type = call->table_type(); + if (!((type == Table::METER && !for_stateful) || + (type == Table::STATEFUL && for_stateful))) + continue; + // Currently the first argument is the meter type + if (call.args[0].type == Table::Call::Arg::Const) { + return static_cast(call.args[0].value()); + } else if (auto n = call.args[0].name()) { + if (auto *st = call->to()) { + if (auto *act = st->actions->action(call.args[0].name())) { + return static_cast((act->code << 1) | 1); + } + } + } + } + } + return rv; +} + +std::vector MatchTable::get_calls() const { + std::vector rv = Table::get_calls(); + if (attached.selector) rv.emplace_back(attached.selector); + if (attached.selector_length) rv.emplace_back(attached.selector_length); + for (auto &c : attached.stats) + if (c) rv.emplace_back(c); + for (auto &c : attached.meters) + if (c) rv.emplace_back(c); + for (auto 
&c : attached.statefuls) + if (c) rv.emplace_back(c); + if (attached.meter_color) rv.emplace_back(attached.meter_color); + return rv; +} + +void MatchTable::pass0() { + LOG1("### match table " << name() << " pass0 " << loc()); +#if 0 + // redundant with (and supercedes) choose_logical_id in pass2. That function is much + // better, taking dependencies into account, so logical_id should not be allocated here + alloc_id("logical", logical_id, stage->pass1_logical_id, + LOGICAL_TABLES_PER_STAGE, true, stage->logical_id_use); +#endif + if (logical_id >= 0) { + if (stage->logical_id_use[logical_id] && stage->logical_id_use[logical_id] != this) { + error(lineno, "Duplicate logical id %d use", logical_id); + error(stage->logical_id_use[logical_id]->lineno, "previous use here"); + } + stage->logical_id_use[logical_id] = this; + } + for (auto physid : physical_ids) { + if (stage->physical_id_use[physid] && stage->physical_id_use[physid] != this) { + error(lineno, "Duplicate physical id %d use", physid); + error(stage->physical_id_use[physid]->lineno, "previous use here"); + } + stage->physical_id_use[physid] = this; + } + if (action.check() && action->set_match_table(this, !action.is_direct_call()) != ACTION) + error(action.lineno, "%s is not an action table", action->name()); + attached.pass0(this); +} + +void MatchTable::pass1() { + if (gateway) { + // needs to happen before Actions::pass1 so that extra_next_lut is setup + gateway->setup_map_indexing(this); + } + Table::pass1(); + if (!p4_table) + p4_table = P4Table::alloc(P4Table::MatchEntry, this); + else + p4_table->check(this); + // Set up default action. 
This will look up action and/or tind for default + // action if the match_table doesnt have one specified + if (default_action.empty()) default_action = get_default_action(); + if (table_counter >= GATEWAY_MISS && !gateway) + error(lineno, "Can't count gateway events on table %s as it doesn't have a gateway", + name()); + if (!p4_params_list.empty()) { + for (auto &p : p4_params_list) { + // bit_width_full should be generated in assembly as 'full_size' in + // the 'p4_param_order'. This is the full size of the field as used + // in p4 program. + if (!p.bit_width_full) p.bit_width_full = p.bit_width; + + std::size_t found = p.name.find(".$valid"); + if (found != std::string::npos) p.is_valid = true; + } + } + if (idletime) { + idletime->logical_id = logical_id; + idletime->pass1(); + } + for (auto &ixb : input_xbar) ixb->pass1(); + for (auto &hd : hash_dist) hd.pass1(this, HashDistribution::OTHER, false); + if (gateway) { + gateway->logical_id = logical_id; + gateway->pass1(); + } +} + +void Table::allocate_physical_ids(unsigned usable) { + if (physical_ids) { + auto unusable = physical_ids - bitvec(usable); + BUG_CHECK(unusable.empty(), "table %s using physical id %d which appears to be invalid", + name(), *unusable.begin()); + return; + } + if (!Target::MATCH_REQUIRES_PHYSID()) return; + for (int i = 0; i < PHYSICAL_TABLES_PER_STAGE; ++i) { + if (!((usable >> i) & 1)) continue; + if (stage->physical_id_use[i]) continue; + physical_ids[i] = 1; + stage->physical_id_use[i] = this; + return; + } + error(lineno, "No physical id available for table %s", name()); +} + +void MatchTable::pass3() { + if (gateway) { + gateway->pass3(); + } +} + +void MatchTable::gen_idletime_tbl_cfg(json::map &stage_tbl) const { + if (idletime) idletime->gen_stage_tbl_cfg(stage_tbl); +} + +#include "jbay/match_table.cpp" // NOLINT(build/include) +#include "tofino/match_table.cpp" // NOLINT(build/include) + +template +void MatchTable::write_common_regs(typename TARGET::mau_regs ®s, int type, 
Table *result) { + /* this follows the order and behavior in stage_match_entry_table.py + * it can be reorganized to be clearer */ + + /*------------------------ + * data path + *-----------------------*/ + if (gress == EGRESS) regs.dp.imem_table_addr_egress |= 1 << logical_id; + + /*------------------------ + * Match Merge + *-----------------------*/ + auto &merge = regs.rams.match.merge; + auto &adrdist = regs.rams.match.adrdist; + if (gress != GHOST) merge.predication_ctl[gress].table_thread |= 1 << logical_id; + if (gress == INGRESS || gress == GHOST) { + merge.logical_table_thread[0].logical_table_thread_ingress |= 1 << logical_id; + merge.logical_table_thread[1].logical_table_thread_ingress |= 1 << logical_id; + merge.logical_table_thread[2].logical_table_thread_ingress |= 1 << logical_id; + } else if (gress == EGRESS) { + merge.logical_table_thread[0].logical_table_thread_egress |= 1 << logical_id; + merge.logical_table_thread[1].logical_table_thread_egress |= 1 << logical_id; + merge.logical_table_thread[2].logical_table_thread_egress |= 1 << logical_id; + } + adrdist.adr_dist_table_thread[timing_thread(gress)][0] |= 1 << logical_id; + adrdist.adr_dist_table_thread[timing_thread(gress)][1] |= 1 << logical_id; + + Actions *actions = action && action->actions ? action->actions.get() : this->actions.get(); + + std::set result_buses; + if (result) { + actions = result->action && result->action->actions ? 
result->action->actions.get() + : result->actions.get(); + for (auto &row : result->layout) { + int r_bus = row.row * 2; + if (row.bus.count(Layout::RESULT_BUS)) + r_bus += row.bus.at(Layout::RESULT_BUS) & 1; + else if (row.bus.count(Layout::TIND_BUS)) + r_bus += row.bus.at(Layout::TIND_BUS); + else + continue; + result_buses.insert(r_bus); + } + } else { + /* ternary match with no indirection table */ + auto tern_table = this->to(); + BUG_CHECK(tern_table != nullptr); + if (tern_table->indirect_bus >= 0) result_buses.insert(tern_table->indirect_bus); + result = this; + } + + for (auto r_bus : result_buses) { + auto &shift_en = merge.mau_payload_shifter_enable[type][r_bus]; + setup_muxctl(merge.match_to_logical_table_ixbar_outputmap[type][r_bus], logical_id); + setup_muxctl(merge.match_to_logical_table_ixbar_outputmap[type + 2][r_bus], logical_id); + + int default_action = 0; + unsigned adr_mask = 0; + unsigned adr_default = 0; + unsigned adr_per_entry_en = 0; + + /** + * This section of code determines the registers required to determine the + * instruction code to run for this particular table. This uses the information + * provided by the instruction code. + * + * The address is built of two parts, the instruction code and the per flow enable + * bit. These can either come from overhead, or from the default register. + * The keyword $DEFAULT indicates that the value comes from the default + * register + */ + auto instr_call = instruction_call(); + // FIXME: Workaround until a format is provided on the gateway to find the + // action bit section. This will be a quick add on. 
+ if (instr_call.args[0] == "$DEFAULT") { + for (auto it = actions->begin(); it != actions->end(); it++) { + if (it->code != -1) { + adr_default |= it->addr; + break; + } + } + } else if (auto field = instr_call.args[0].field()) { + adr_mask |= (1U << field->size) - 1; + } + + if (instr_call.args[1] == "$DEFAULT") { + adr_default |= ACTION_INSTRUCTION_ADR_ENABLE; + } else if (auto field = instr_call.args[1].field()) { + if (auto addr_field = instr_call.args[0].field()) { + adr_per_entry_en = field->bit(0) - addr_field->bit(0); + } else { + adr_per_entry_en = 0; + } + } + shift_en.action_instruction_adr_payload_shifter_en = 1; + merge.mau_action_instruction_adr_mask[type][r_bus] = adr_mask; + merge.mau_action_instruction_adr_default[type][r_bus] = adr_default; + merge.mau_action_instruction_adr_per_entry_en_mux_ctl[type][r_bus] = adr_per_entry_en; + + if (idletime) idletime->write_merge_regs(regs, type, r_bus); + if (result->action) { + if (auto adt = result->action->to()) { + merge.mau_actiondata_adr_default[type][r_bus] = + adt->determine_default(result->action); + } + shift_en.actiondata_adr_payload_shifter_en = 1; + } + if (!get_attached()->stats.empty()) shift_en.stats_adr_payload_shifter_en = 1; + if (!get_attached()->meters.empty() || !get_attached()->statefuls.empty()) + shift_en.meter_adr_payload_shifter_en = 1; + + result->write_merge_regs(regs, type, r_bus); + } + + /*------------------------ + * Action instruction Address + *-----------------------*/ + int max_code = actions->max_code; + if (options.match_compiler) + if (auto *action_format = lookup_field("action")) + max_code = (1 << (action_format->size - (gateway ? 1 : 0))) - 1; + /** + * The action map can be used if the choices for the instruction are < 8. The map data + * table will be used if the number of choices are between 2 and 8, and references + * the instruction call to determine whether the instruction comes from the map + * data table or the default register. 
+ */ + auto instr_call = instruction_call(); + bool use_action_map = + instr_call.args[0].field() && max_code < ACTION_INSTRUCTION_SUCCESSOR_TABLE_DEPTH; + // FIXME: Workaround until a format is provided on the gateway to find the + // action bit section. This will be a quick add on. + + if (use_action_map) { + merge.mau_action_instruction_adr_map_en[type] |= (1U << logical_id); + for (auto &act : *actions) + if ((act.name != result->default_action) || !result->default_only_action) { + merge.mau_action_instruction_adr_map_data[type][logical_id][act.code / 4] + .set_subfield(act.addr + ACTION_INSTRUCTION_ADR_ENABLE, + (act.code % 4) * TARGET::ACTION_INSTRUCTION_MAP_WIDTH, + TARGET::ACTION_INSTRUCTION_MAP_WIDTH); + } + } + + /** + * This register is now the responsiblity of the driver for all tables, as the driver + * will initialize this value from the initial default action. If we ever want to + * move some of this responsibility back to the compiler, then this code can be used + * for this, but it is currently incorrect for tables that have been split across + * multiple stages for non noop default actions. + if (this->to()) { + merge.mau_action_instruction_adr_miss_value[logical_id] = 0; + } else if (!default_action.empty()) { + auto *act = actions->action(default_action); + merge.mau_action_instruction_adr_miss_value[logical_id] = + ACTION_INSTRUCTION_ADR_ENABLE + act->addr; + } else if (!result->default_action.empty()) { + auto *act = actions->action(result->default_action); + merge.mau_action_instruction_adr_miss_value[logical_id] = + ACTION_INSTRUCTION_ADR_ENABLE + act->addr; } + */ + + /** + * No direct call for a next table, like instruction. The next table can be determined + * from other parameters. If there is a next parameter in the format, then this is the + * field to be used as an extractor. + * + * If there is no next field, but there is more than one possible entry in the hitmap, + * then the action instruction is being used as the index. 
+ * + * If necessary, i.e. something becomes more complex, then perhaps a call needs to be + * added. + * + * Also, a quick note that though the match_next_table_adr_default is not necessary to set, + * the diagram in 6.4.3.3. Next Table Processing, the default register is after the mask. + * However, in hardware, the default register is before the mask. + */ + int next_field_size = result->get_format_field_size("next"); + int action_field_size = result->get_format_field_size("action"); + + if (next_field_size > 0) { + next_table_adr_mask = ((1U << next_field_size) - 1); + } else if (result->get_hit_next().size() > 1) { + next_table_adr_mask = ((1U << action_field_size) - 1); + } + write_next_table_regs(regs, result); + + /*------------------------ + * Immediate data found in overhead + *-----------------------*/ + if (result->format) { + for (auto &row : result->layout) { + int r_bus = row.row * 2; + if (row.bus.count(Layout::RESULT_BUS)) + r_bus += row.bus.at(Layout::RESULT_BUS) & 1; + else if (row.bus.count(Layout::TIND_BUS)) + r_bus += row.bus.at(Layout::TIND_BUS); + else + continue; + merge.mau_immediate_data_mask[type][r_bus] = bitMask(result->format->immed_size); + if (result->format->immed_size > 0) + merge.mau_payload_shifter_enable[type][r_bus].immediate_data_payload_shifter_en = 1; + } + } + if (result->action_bus) { + result->action_bus->write_immed_regs(regs, result); + for (auto &mtab : get_attached()->meters) { + // if the meter table outputs something on the action-bus of the meter + // home row, need to set up the action hv xbar properly + result->action_bus->write_action_regs(regs, result, mtab->home_row(), 0); + } + for (auto &stab : get_attached()->statefuls) { + // if the stateful table outputs something on the action-bus of the meter + // home row, need to set up the action hv xbar properly + result->action_bus->write_action_regs(regs, result, stab->home_row(), 0); + } + } + + // FIXME: + // The action parameters that are stored as immediates 
in the match + // overhead need to be properly packed into this register. We had been + // previously assuming that the compiler would do that for us, specifying + // the bits needed here as the argument to the action call; eg assembly + // code like: + // default_action: actname(0x100) + // for the default action being actname with the value 0x100 for its + // parameters stored as immediates (which might actually be several + // parameters in the P4 source code.) To get this from the + // default_action_parameters map, we need to look up those argument names + // in the match table format and action aliases and figure out which ones + // correspond to match immediates, and pack the values appropriately. + // Doable but non-trivial, probably requiring a small helper function. Need + // to deal with both exact match and ternary indirect. + // + // For now, most miss configuration registers are only written by the driver + // (since the user API says what miss behavior to perform). The compiler + // (glass) relies on the driver to write them but this could change in + // future. This particular register would only be set if the compiler chose + // to allocate action parameters in match overhead. + // + // if (default_action_parameters.size() > 0) + // merge.mau_immediate_data_miss_value[logical_id] = default_action_parameters[0]; + // else if (result->default_action_parameters.size() > 0) + // merge.mau_immediate_data_miss_value[logical_id] = result->default_action_parameters[0]; + + for (auto &ixb : input_xbar) ixb->write_regs(regs); + /* DANGER -- you might think we should call write_regs on other related things here + * (actions, hash_dist, idletime, gateway) rather than just input_xbar, but those are + * all called by the various callers of this method. 
Not clear why input_xbar is + * different */ + + if (gress == EGRESS) regs.cfg_regs.mau_cfg_lt_thread |= 1U << logical_id; + if (options.match_compiler && dynamic_cast(this)) return; // skip the rest + + if (table_counter) { + merge.mau_table_counter_ctl[logical_id / 8U].set_subfield(table_counter, + 3 * (logical_id % 8U), 3); + } else { // Set to TABLE_HIT by default + merge.mau_table_counter_ctl[logical_id / 8U].set_subfield(TABLE_HIT, 3 * (logical_id % 8U), + 3); + } +} + +int MatchTable::get_address_mau_actiondata_adr_default(unsigned log2size, bool per_flow_enable) { + int huffman_ones = log2size > 2 ? log2size - 3 : 0; + BUG_CHECK(huffman_ones < 7); + int rv = (1 << huffman_ones) - 1; + rv = ((rv << 10) & 0xf8000) | (rv & 0x1f); + if (!per_flow_enable) rv |= 1 << 22; + return rv; +} + +/** + * Generates the hash_bits node for a single hash_function node in the JSON. + * + * Will add the impact of a single hash_table (64 bit section of the input xbar) to the hash + * bits. If the table requires multiple hash_tables, then the previous hash table value will + * be looked up and added. FIXME: At some point refactor this function to not keep + * doing this rewrite. + * + * The JSON for each hash bit has the following: + * hash_bit - The hash bit in which this is output on the Galois matrix. (Really whatever + * this bit position is just has to coordinate across the other driver structures, but + * those are also based on the Galois matrix position). + * seed - the bit that is xored in at the end of the calcuation + * bits_to_xor - The field bits from the P4 API that will determine the value of this bit, + * and must be XORed for this bit. This is a vector of fields with 4 values. 
+ * - field_bit - The p4 field bit to be XORed + * - field_name - The p4 field name to be XORed + * The next two parameters are only needed for dynamic_key_masks, as they indicate + * to the driver which bit to turn off + * - hash_match_group - Which 128 bit input xbar group this bit is appearing in (0-7) + * - hash_match_group_bit - The bit offset within the 128 bit input xbar group. + */ +void MatchTable::gen_hash_bits(const std::map &hash_table, + InputXbar::HashTable hash_table_id, json::vector &hash_bits, + unsigned hash_group_no, bitvec hash_bits_used) const { + for (auto &col : hash_table) { + if (!hash_bits_used.getbit(col.first)) continue; + json::map hash_bit; + bool hash_bit_added = false; + json::vector *bits_to_xor = nullptr; + // FIXME: This function has a lot of unnecessary copying and moving around. + for (auto &hb : hash_bits) { + if (hb->to()["hash_bit"]->to() == json::number(col.first)) { + bits_to_xor = &(hb->to()["bits_to_xor"]->to()); + hash_bit_added = true; + } + } + if (!hash_bit_added) bits_to_xor = &(hash_bit["bits_to_xor"] = json::vector()); + hash_bit["hash_bit"] = col.first; + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + hash_bit["seed"] = input_xbar[0]->get_seed_bit(hash_group_no, col.first); + for (const auto &bit : col.second.data) { + if (auto ref = input_xbar[0]->get_hashtable_bit(hash_table_id, bit)) { + std::string field_name, global_name; + field_name = ref.name(); + + auto field_bit = remove_name_tail_range(field_name) + ref.lobit(); + global_name = field_name; + + // Look up this field in the param list to get a custom key + // name, if present. 
+ auto p = find_p4_param(field_name); + if (!p && !p4_params_list.empty()) { + warning(col.second.lineno, + "Cannot find field name %s in p4_param_order " + "for table %s", + field_name.c_str(), name()); + } else if (p && !p->key_name.empty()) { + field_name = p->key_name; + } + auto group = input_xbar[0]->hashtable_input_group(hash_table_id); + int group_bit = bit; + // FIXME -- this adjustment is a hack for tofino1/2. Should have a virtual + // method on InputXbar? or something in Target? + if (group.index != hash_table_id.index && (hash_table_id.index & 1)) + group_bit += 64; + bits_to_xor->push_back( + json::map{{"field_bit", json::number(field_bit)}, + {"field_name", json::string(field_name)}, + {"global_name", json::string(global_name)}, + {"hash_match_group", json::number(group.index)}, + {"hash_match_group_bit", json::number(group_bit)}}); + } + } + if (!hash_bit_added) hash_bits.push_back(std::move(hash_bit)); + } +} + +void MatchTable::add_hash_functions(json::map &stage_tbl) const { + json::vector &hash_functions = stage_tbl["hash_functions"] = json::vector(); + // TODO: Hash functions are not generated for ALPM atcams as the + // partition index bits used in hash which is a compiler generated field and + // should not be in 'match_key_fields'. The tests in p4factory are written + // with match_spec to not include the partition index field. Glass also + // generates an empty 'hash_functions' node + if (is_alpm()) return; + // Emit hash info only if p4_param_order (match_key_fields) are present + // FIXME: This input_xbar is populated if its a part of the hash_action + // table or the hash_distribution which is incorrect. 
This should move + // inside the hash_dist so this condition does not occur in the + // hash_action table + bitvec hash_matrix_req; + hash_matrix_req.setrange(0, EXACT_HASH_GROUP_SIZE); + if (!p4_params_list.empty() && !input_xbar.empty()) { + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + auto ht = input_xbar[0]->get_hash_tables(); + if (ht.size() > 0) { + // Merge all bits to xor across multiple hash ways in single + // json::vector for each hash bit + for (const auto &hash_table : ht) { + json::map hash_function; + json::vector &hash_bits = hash_function["hash_bits"] = json::vector(); + hash_function["hash_function_number"] = hash_table.first.uid(); + gen_hash_bits(hash_table.second, hash_table.first, hash_bits, + hash_table.first.uid(), hash_matrix_req); + hash_functions.push_back(std::move(hash_function)); + } + } + } +} + +void MatchTable::add_all_reference_tables(json::map &tbl, Table *match_table) const { + auto mt = (!match_table) ? this : match_table; + json::vector &action_data_table_refs = tbl["action_data_table_refs"]; + json::vector &selection_table_refs = tbl["selection_table_refs"]; + json::vector &meter_table_refs = tbl["meter_table_refs"]; + json::vector &statistics_table_refs = tbl["statistics_table_refs"]; + json::vector &stateful_table_refs = tbl["stateful_table_refs"]; + add_reference_table(action_data_table_refs, mt->action); + if (auto a = mt->get_attached()) { + if (a->selector) { + unsigned sel_mask = (1U << METER_TYPE_START_BIT) - 1; + sel_mask &= ~((1U << SELECTOR_LOWER_HUFFMAN_BITS) - 1); + add_reference_table(selection_table_refs, a->selector); + } + for (auto &m : a->meters) { + add_reference_table(meter_table_refs, m); + } + for (auto &s : a->stats) { + add_reference_table(statistics_table_refs, s); + } + for (auto &s : a->statefuls) { + add_reference_table(stateful_table_refs, s); + } + } +} diff --git a/backends/tofino/bf-asm/meter.cpp b/backends/tofino/bf-asm/meter.cpp new file mode 100644 index 
00000000000..64cb1419823 --- /dev/null +++ b/backends/tofino/bf-asm/meter.cpp @@ -0,0 +1,1032 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "data_switchbox.h" +#include "input_xbar.h" +#include "misc.h" + +// target specific template specializations +#include "jbay/meter.h" +#include "tofino/meter.h" + +Table::Layout::bus_type_t MeterTable::default_bus_type() const { + // FIXME -- this is a bit of a hack -- if color_mapram_addr has been set, we want the + // bus_type for color maprams, not for the meter proper (which should not actually + // have a bus specified?) 
+ if (color_mapram_addr == IDLE_MAP_ADDR) return Layout::IDLE_BUS; + warning(lineno, "meter table should not have bus:, will be ignored"); + return Layout::SEARCH_BUS; +} + +void MeterTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::Stateful); + for (auto &kv : MapIterChecked(data, true)) { + if (common_setup(kv, data, P4Table::Meter)) { + } else if (kv.key == "input_xbar") { + if (CHECKTYPE(kv.value, tMAP)) + input_xbar.emplace_back(InputXbar::create(this, false, kv.key, kv.value.map)); + } else if (kv.key == "color_aware") { + if (kv.value == "per_flow") + color_aware = color_aware_per_flow_enable = true; + else + color_aware = get_bool(kv.value); + } else if (kv.key == "color_maprams") { + if (CHECKTYPE(kv.value, tMAP)) { + if (auto addr_type = get(kv.value.map, "address")) { + if (CHECKTYPE(*addr_type, tSTR)) { + if (*addr_type == "idletime") + color_mapram_addr = IDLE_MAP_ADDR; + else if (*addr_type == "stats") + color_mapram_addr = STATS_MAP_ADDR; + else + error(addr_type->lineno, "Unrecognized color mapram address type %s", + addr_type->s); + } + } + setup_layout(color_maprams, kv.value.map, " color_maprams"); + if (auto *vpn = get(kv.value.map, "vpns")) + if (CHECKTYPE(*vpn, tVEC)) setup_vpns(color_maprams, &vpn->vec, true); + } + } else if (kv.key == "pre_color") { + if (CHECKTYPE(kv.value, tCMD)) { + if (kv.value != "hash_dist") + error(kv.value.lineno, "Pre color must come from hash distribution"); + if (kv.value.vec.size != 3) + error(kv.value.lineno, + "Pre color hash distribution requires two parameters," + " but has %d", + kv.value.vec.size); + if (CHECKTYPE(kv.value.vec[1], tINT)) pre_color_hash_dist_unit = kv.value.vec[1].i; + if (CHECKTYPE(kv.value.vec[2], tRANGE)) { + auto range = kv.value.vec[2]; + int diff = range.range.hi - range.range.lo + 1; + if (diff != 2 || range.range.lo % 2 != 0) + error(kv.value.lineno, "Invalid hash distribution range for precolor"); + pre_color_bit_lo = range.range.lo; + } + } + } else 
if (kv.key == "type") { + if (kv.value == "standard") + type = STANDARD; + else if (kv.value == "lpf") + type = LPF; + else if (kv.value == "wred") + type = RED; + else + error(kv.value.lineno, "Unknown meter type %s", value_desc(kv.value)); + } else if (kv.key == "red_output") { + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &v : kv.value.map) { + if (CHECKTYPE(v.key, tSTR) && CHECKTYPE(v.value, tINT)) { + if (v.key == "drop") + red_drop_value = v.value.i; + else if (v.key == "nodrop") + red_nodrop_value = v.value.i; + else + error(kv.value.lineno, "Unknown meter red param: %s", v.key.s); + } + } + } + } else if (kv.key == "count") { + if (kv.value == "bytes") + count = BYTES; + else if (kv.value == "packets") + count = PACKETS; + else + error(kv.value.lineno, "Unknown meter count %s", value_desc(kv.value)); + } else if (kv.key == "teop") { + if (gress != EGRESS) error(kv.value.lineno, "tEOP can only be used in EGRESS"); + if (!Target::SUPPORT_TRUE_EOP()) + error(kv.value.lineno, "tEOP is not available on device"); + if (CHECKTYPE(kv.value, tINT)) { + teop = kv.value.i; + if (teop < 0 || teop > 3) + error(kv.value.lineno, "Invalid tEOP bus %d, valid values are 0-3", teop); + } + BUG_CHECK(!stage->teop[teop].first, + "previously used tEOP bus %d used again in stage %d", teop, stage->stageno); + stage->teop[teop] = {true, stage->stageno}; + } else if (kv.key == "green") { + if (CHECKTYPE(kv.value, tINT)) { + green_value = kv.value.i; + } + } else if (kv.key == "yellow") { + if (CHECKTYPE(kv.value, tINT)) { + yellow_value = kv.value.i; + } + } else if (kv.key == "red") { + if (CHECKTYPE(kv.value, tINT)) { + red_value = kv.value.i; + } + } else if (kv.key == "profile") { + if (CHECKTYPE(kv.value, tINT)) { + profile = kv.value.i; + } + } else if (kv.key == "sweep_interval") { + if (CHECKTYPE(kv.value, tINT)) { + // sweep_interval value in assembly if present is from + // meter_sweep_interval pragma in p4 program. 
Allowed values for + // the meter_sweep_interval register are [0:20]. but [5:20] are + // only to be used with shifting meter time scale. We check and + // throw an error if value is present and not in range[0:4] + int intvl = kv.value.i; + if (intvl >= 0 && intvl <= 4) + sweep_interval = intvl; + else + error( + lineno, + "Invalid meter sweep interval of %d. Allowed values are in the range[0:4]", + intvl); + } + } else if (kv.key == "bytecount_adjust") { + if (CHECKTYPE(kv.value, tINT)) { + bytecount_adjust = kv.value.i; + } + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (teop >= 0 && count != BYTES) error(lineno, "tEOP bus can only used when counting bytes"); + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_srams(); + else + alloc_rams(true, stage->sram_use); +} + +void MeterTable::pass1() { + LOG1("### Meter table " << name() << " pass1 " << loc()); + if (!p4_table) + p4_table = P4Table::alloc(P4Table::Meter, this); + else + p4_table->check(this); + alloc_vpns(); + alloc_maprams(); + if (color_maprams.empty() && type != LPF && type != RED) + error(lineno, "Missing color_maprams in meter table %s", name()); + if (uses_colormaprams() && color_mapram_addr == NO_COLOR_MAP) + error(lineno, "Missing color mapram address type in table %s", name()); + for (auto &r : color_maprams) { + for (auto &memunit : r.memunits) { + BUG_CHECK(memunit.row == r.row, "memunit on wrong row"); + if (Table *old = stage->mapram_use[r.row][memunit.col]) + error(r.lineno, + "Table %s trying to use mapram %d,%d for color, which is " + "in use by table %s", + name(), r.row, memunit.col, old->name()); + stage->mapram_use[r.row][memunit.col] = this; + } + } + if (!no_vpns && !color_maprams.empty() && color_maprams[0].vpns.empty()) + setup_vpns(color_maprams, 0); + std::sort(layout.begin(), layout.end(), + [](const Layout &a, const Layout &b) -> bool { return a.row > b.row; }); + stage->table_use[timing_thread(gress)] |= 
Stage::USE_METER; + if (type == LPF || type == RED) + stage->table_use[timing_thread(gress)] |= Stage::USE_METER_LPF_RED; + for (auto &ixb : input_xbar) ixb->pass1(); + for (auto &hd : hash_dist) hd.pass1(this, HashDistribution::OTHER, false); + int prev_row = -1; + for (auto &row : layout) { + if (home_rows.count(row.row)) prev_row = -1; + + if (prev_row >= 0) + need_bus(lineno, stage->overflow_bus_use, row.row, "Overflow"); + else + need_bus(lineno, stage->meter_bus_use, row.row, "Meter data"); + for (int r = (row.row + 1) | 1; r < prev_row; r += 2) + need_bus(lineno, stage->overflow_bus_use, r, "Overflow"); + prev_row = row.row; + } + Synth2Port::pass1(); +} + +void MeterTable::pass2() { + LOG1("### Meter table " << name() << " pass2 " << loc()); + for (auto &ixb : input_xbar) ixb->pass2(); + + for (auto match_table : get_match_tables()) { + for (auto &hd : match_table->hash_dist) { + if (hd.id == pre_color_hash_dist_unit) { + hd.meter_pre_color = true; + hd.meter_mask_index = pre_color_bit_lo / 2; + } + } + } + if (get_match_tables().size() > 1 && color_mapram_addr == IDLE_MAP_ADDR) + error(lineno, "Shared meter cannot use idletime addressing for color maprams"); + for (auto &hd : hash_dist) hd.pass2(this); +} + +void MeterTable::pass3() { LOG1("### Meter table " << name() << " pass3 " << loc()); } + +int MeterTable::direct_shiftcount() const { + return 64 + METER_ADDRESS_ZERO_PAD - 7; // meters are always 128 bits wide +} + +int MeterTable::indirect_shiftcount() const { + return METER_ADDRESS_ZERO_PAD - 7; // meters are always 128 bits wide +} + +int MeterTable::address_shift() const { + return 7; // meters are always 128 bits wide +} + +int MeterTable::color_shiftcount(Table::Call &call, int group, int tcam_shift) const { + int extra_padding = 0; + int zero_pad = 0; + if (color_mapram_addr == IDLE_MAP_ADDR) { + extra_padding = IDLETIME_ADDRESS_ZERO_PAD - IDLETIME_HUFFMAN_BITS; + zero_pad = IDLETIME_ADDRESS_ZERO_PAD; + } else if (color_mapram_addr == 
STATS_MAP_ADDR) { + extra_padding = STAT_ADDRESS_ZERO_PAD - STAT_METER_COLOR_LOWER_HUFFMAN_BITS; + zero_pad = STAT_ADDRESS_ZERO_PAD; + } + + if (call.args[0].name() && strcmp(call.args[0].name(), "$DIRECT") == 0) { + return 64 + tcam_shift + extra_padding; + } else if (auto f = call.args[0].field()) { + return f->by_group[group]->bit(0) % 128U + extra_padding; + } else if (auto f = call.args[1].field()) { + return f->bit(0) + zero_pad; + } else { + return 0; + } +} + +unsigned MeterTable::determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) const { + return determine_meter_shiftcount(call, group, word, tcam_shift); +} + +template +void MeterTable::write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args) { + auto &merge = regs.rams.match.merge; + unsigned adr_mask = 0U; + unsigned per_entry_en_mux_ctl = 0U; + unsigned adr_default = 0U; + unsigned meter_type_position = 0U; + METER_ACCESS_TYPE default_type = match->default_meter_access_type(false); + AttachedTable::determine_meter_merge_regs(match, type, bus, args, default_type, adr_mask, + per_entry_en_mux_ctl, adr_default, + meter_type_position); + merge.mau_meter_adr_default[type][bus] = adr_default; + merge.mau_meter_adr_mask[type][bus] = adr_mask; + merge.mau_meter_adr_per_entry_en_mux_ctl[type][bus] = per_entry_en_mux_ctl; + merge.mau_meter_adr_type_position[type][bus] = meter_type_position; +} + +template +void MeterTable::write_color_regs(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args) { + BUG_CHECK(uses_colormaprams(), "meter %s does not use color maprams, but uses color?", name()); + auto &merge = regs.rams.match.merge; + unsigned adr_mask = 0U; + unsigned per_entry_en_mux_ctl = 0U; + unsigned adr_default = 0U; + unsigned meter_type_position = 0U; + AttachedTable::determine_meter_merge_regs(match, type, bus, args, METER_COLOR_ACCESS, adr_mask, + per_entry_en_mux_ctl, adr_default, + meter_type_position); + + // Based 
on the uArch section 6.2.8.4.9 Map RAM Addressing, color maprams can be + // addressed by either idletime or stats based addresses. Which address is used + // can be specified in the asm file, and is built according to the specification + + if (color_mapram_addr == IDLE_MAP_ADDR) { + unsigned idle_mask = (1U << IDLETIME_ADDRESS_BITS) - 1; + unsigned full_idle_mask = (1U << IDLETIME_FULL_ADDRESS_BITS) - 1; + unsigned shift_diff = METER_LOWER_HUFFMAN_BITS - IDLETIME_HUFFMAN_BITS; + merge.mau_idletime_adr_mask[type][bus] = (adr_mask >> shift_diff) & idle_mask; + merge.mau_idletime_adr_default[type][bus] = (adr_default >> shift_diff) & full_idle_mask; + if (per_entry_en_mux_ctl > shift_diff) + merge.mau_idletime_adr_per_entry_en_mux_ctl[type][bus] = + per_entry_en_mux_ctl - shift_diff; + else + merge.mau_idletime_adr_per_entry_en_mux_ctl[type][bus] = 0; + } else if (color_mapram_addr == STATS_MAP_ADDR) { + unsigned stats_mask = (1U << STAT_ADDRESS_BITS) - 1; + unsigned full_stats_mask = (1U << STAT_FULL_ADDRESS_BITS) - 1; + unsigned shift_diff = METER_LOWER_HUFFMAN_BITS - STAT_METER_COLOR_LOWER_HUFFMAN_BITS; + merge.mau_stats_adr_mask[type][bus] = (adr_mask >> shift_diff) & stats_mask; + merge.mau_stats_adr_default[type][bus] = (adr_default >> shift_diff) & full_stats_mask; + if (per_entry_en_mux_ctl > shift_diff) + merge.mau_stats_adr_per_entry_en_mux_ctl[type][bus] = per_entry_en_mux_ctl - shift_diff; + else + merge.mau_stats_adr_per_entry_en_mux_ctl[type][bus] = 0; + } else { + BUG(); + } +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void MeterTable::write_color_regs, mau_regs &, + MatchTable *, int, int, const std::vector &); + +template +void MeterTable::setup_exact_shift(REGS ®s, int bus, int group, int word, int word_group, + Call &meter_call, Call &color_call) { + auto &merge = regs.rams.match.merge; + int shiftcount = determine_shiftcount(meter_call, group, word, 0); + merge.mau_meter_adr_exact_shiftcount[bus][word_group] = shiftcount; + if 
(uses_colormaprams()) { + int color_shift = color_shiftcount(color_call, group, 0); + if (color_mapram_addr == IDLE_MAP_ADDR) { + merge.mau_idletime_adr_exact_shiftcount[bus][word_group] = color_shift; + merge.mau_payload_shifter_enable[0][bus].idletime_adr_payload_shifter_en = 1; + } else if (color_mapram_addr == STATS_MAP_ADDR) { + merge.mau_stats_adr_exact_shiftcount[bus][word_group] = color_shift; + merge.mau_payload_shifter_enable[0][bus].stats_adr_payload_shifter_en = 1; + } + } +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void MeterTable::setup_exact_shift, mau_regs &, + int, int, int, int, Call &, Call &); + +template +void MeterTable::setup_tcam_shift(REGS ®s, int bus, int tcam_shift, Call &meter_call, + Call &color_call) { + auto &merge = regs.rams.match.merge; + int shiftcount = determine_shiftcount(meter_call, 0, 0, tcam_shift); + merge.mau_meter_adr_tcam_shiftcount[bus] = shiftcount; + if (uses_colormaprams()) { + int color_shift = color_shiftcount(color_call, 0, tcam_shift); + if (color_mapram_addr == IDLE_MAP_ADDR) { + merge.mau_idletime_adr_tcam_shiftcount[bus] = color_shift; + merge.mau_payload_shifter_enable[1][bus].idletime_adr_payload_shifter_en = 1; + } else if (color_mapram_addr == STATS_MAP_ADDR) { + merge.mau_stats_adr_tcam_shiftcount[bus] = color_shift; + merge.mau_payload_shifter_enable[1][bus].stats_adr_payload_shifter_en = 1; + } + } +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void MeterTable::setup_tcam_shift, mau_regs &, + int, int, Call &, Call &); + +template +void MeterTable::write_regs_home_row(REGS ®s, unsigned row) { + auto &map_alu = regs.rams.map_alu; + auto &map_alu_row = map_alu.row[row]; + auto &adrdist = regs.rams.match.adrdist; + unsigned side = 1; // Meter can only be on right side + int minvpn, maxvpn; + layout_vpn_bounds(minvpn, maxvpn, true); + + if (home_rows.size() > 1) { + int sparevpn; + layout_vpn_bounds(minvpn, sparevpn, false); + bool block_start = false; + bool block_end = false; + minvpn = 
INT_MAX; + maxvpn = INT_MIN; + for (Layout &logical_row : layout) { + // Block Start with the home row and End with the Spare VPN + if (logical_row.row / 2U == row) block_start = true; + + if (block_start) { + for (auto v : logical_row.vpns) { + if (v == sparevpn) { + block_end = true; + break; + } + + if (v < minvpn) minvpn = v; + if (v > maxvpn) maxvpn = v; + } + } + if (block_end) { + BUG_CHECK(minvpn != INT_MAX && maxvpn != INT_MIN); + break; + } + } + BUG_CHECK(block_start && block_end); + } + + int meter_group_index = row / 2U; + auto &meter = map_alu.meter_group[meter_group_index].meter; + auto &meter_ctl = meter.meter_ctl; + auto &red_value_ctl = meter.red_value_ctl; + + int first_home_row = *home_rows.begin(); + if (count == BYTES) { + auto meter_bytecount_adjust_size = meter_ctl.meter_bytecount_adjust.size(); + auto meter_bytecount_adjust_mask = ((1U << meter_bytecount_adjust_size) - 1); + int bytecount_adjust_max = (1U << (meter_bytecount_adjust_size - 1)) - 1; + int bytecount_adjust_min = -1 * (1U << (meter_bytecount_adjust_size - 1)); + if (bytecount_adjust > bytecount_adjust_max || bytecount_adjust < bytecount_adjust_min) { + error(lineno, + "The bytecount adjust value of %d on meter %s " + "does not fit within allowed range for %d bits - { %d, %d }", + bytecount_adjust, name(), meter_bytecount_adjust_size, bytecount_adjust_min, + bytecount_adjust_max); + } + meter_ctl.meter_bytecount_adjust = bytecount_adjust & meter_bytecount_adjust_mask; + } + auto &delay_ctl = map_alu.meter_alu_group_data_delay_ctl[meter_group_index]; + delay_ctl.meter_alu_right_group_delay = + Target::METER_ALU_GROUP_DATA_DELAY() + row / 4 + stage->tcam_delay(gress); + switch (type) { + case LPF: + meter_ctl.lpf_enable = 1; + delay_ctl.meter_alu_right_group_enable = 1; + break; + case RED: + meter_ctl.lpf_enable = 1; + meter_ctl.red_enable = 1; + delay_ctl.meter_alu_right_group_enable = 1; + red_value_ctl.red_nodrop_value = red_nodrop_value; + red_value_ctl.red_drop_value = 
red_drop_value; + break; + default: + meter_ctl.meter_enable = 1; + // RNG: + // Enables random number generator for meter probabilistic charging + // when green/yellow burst size exponent > 14. This should be set + // when any meter entry in the table has a burstsize exponent > 14 + // RNG is also enabled whenever red_enable config bit is set. + + // this should always be turned on + // for color-based meters, to handle an issue with large burst + // sizes. This applies to both packet-based and byte-based meters. + // Mike F said, "The hardware adjusts the rate under the hood to + // match the desired rate. Without enabling the RNG, the hardware + // will always overcharge the buckets thereby reducing the rate." + meter_ctl.meter_rng_enable = 1; + meter_ctl.meter_time_scale = profile; + break; + } + if (count == BYTES) meter_ctl.meter_byte = 1; + if (gress == EGRESS) meter_ctl.meter_alu_egress = 1; + auto &error_ctl = map_alu.meter_alu_group_error_ctl[meter_group_index]; + error_ctl.meter_alu_group_ecc_error_enable = 1; + error_ctl.meter_alu_group_thread = gress; + auto &meter_sweep_ctl = adrdist.meter_sweep_ctl[meter_group_index]; + // The driver will manage turning on the meter sweep enable, + // so the compiler should not configure this value (check glass + // code) + // meter_sweep_ctl.meter_sweep_en = 1; + meter_sweep_ctl.meter_sweep_offset = minvpn; + meter_sweep_ctl.meter_sweep_size = maxvpn; + meter_sweep_ctl.meter_sweep_remove_hole_pos = 0; // FIXME -- see CSR? 
+ meter_sweep_ctl.meter_sweep_remove_hole_en = 0; // FIXME + meter_sweep_ctl.meter_sweep_interval = sweep_interval + profile; + for (auto &ixb : input_xbar) { + auto &vh_adr_xbar = regs.rams.array.row[row].vh_adr_xbar; + auto &data_ctl = regs.rams.array.row[row].vh_xbar[side].stateful_meter_alu_data_ctl; + // FIXME: Currently in the compiler, the data headed to the meter alu/stateful alu + // can only come from hash or the search bus, but not both, thus it is + // currenlty safe for them to be mutually exclusive. If the compiler was to + // allocate fields to both, this would have to interpret the information + // correctly + auto hashdata_bytemask = bitmask2bytemask(ixb->hash_group_bituse()); + if (hashdata_bytemask != 0U) { + vh_adr_xbar.alu_hashdata_bytemask.alu_hashdata_bytemask_right = hashdata_bytemask; + setup_muxctl(vh_adr_xbar.exactmatch_row_hashadr_xbar_ctl[2 + side], ixb->hash_group()); + } else { + // FIXME: Need to be some validation between Tofino and JBay if the input + // xbar is valid for these meters. + bitvec bytemask = ixb->bytemask(); + bytemask >>= bytemask.min().index(); + unsigned u_bytemask = bytemask.getrange(0, bytemask.max().index() + 1); + data_ctl.stateful_meter_alu_data_bytemask = u_bytemask; + data_ctl.stateful_meter_alu_data_xbar_ctl = 8 | ixb->match_group(); + } + } + if (output_used) { + auto &action_ctl = map_alu.meter_alu_group_action_ctl[meter_group_index]; + action_ctl.right_alu_action_enable = 1; + action_ctl.right_alu_action_delay = stage->meter_alu_delay(gress, false); + auto &switch_ctl = regs.rams.array.switchbox.row[row].ctl; + switch_ctl.r_action_o_mux_select.r_action_o_sel_action_rd_r_i = 1; + // disable action data address huffman decoding, on the assumtion we're not + // trying to combine this with an action data table on the same home row. + // Otherwise, the huffman decoding will think this is an 8-bit value and + // replicate it. 
+ regs.rams.array.row[row] + .action_hv_xbar.action_hv_xbar_disable_ram_adr.action_hv_xbar_disable_ram_adr_right = 1; + } + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_base = minvpn; + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_limit = maxvpn; + auto &movereg_meter_ctl = adrdist.movereg_meter_ctl[meter_group_index]; + if (run_at_eop()) movereg_meter_ctl.movereg_meter_ctl_deferred = 1; + movereg_meter_ctl.movereg_ad_meter_shift = 7; + movereg_meter_ctl.movereg_meter_ctl_lt = logical_id; + if (direct) movereg_meter_ctl.movereg_meter_ctl_direct = 1; + movereg_meter_ctl.movereg_meter_ctl_color_en = 1; + for (MatchTable *m : match_tables) { + if (direct) adrdist.movereg_ad_direct[1] |= 1U << m->logical_id; + // The first ALU will drive this xbar register + if (first_home_row / 4U == meter_group_index) { + adrdist.movereg_ad_meter_alu_to_logical_xbar_ctl[m->logical_id / 8U].set_subfield( + 4 | meter_group_index, 3 * (m->logical_id % 8U), 3); + } + } +} + +template +void MeterTable::write_mapram_color_regs(REGS ®s, bool &push_on_overflow) { + auto &map_alu = regs.rams.map_alu; + auto &adrdist = regs.rams.match.adrdist; + auto &merge = regs.rams.match.merge; + int curr_home_row = -1; + + for (Layout &row : color_maprams) { + curr_home_row = get_home_row_for_row(row.row * 2); + // Allocating color maprams above home row is invalid + // as color writes can only be distributed to maprams + // via buses going on the home row or below + BUG_CHECK(curr_home_row / 4U >= row.row / 2U); + + int color_map_color = color_maprams.empty() ? 
0 : (curr_home_row / 4U) & 1; + if (row.row == curr_home_row / 2) { /* on the home row */ + if (color_map_color) + map_alu.mapram_color_switchbox.row[row.row] + .ctl.r_color1_mux_select.r_color1_sel_color_r_i = 1; + else + map_alu.mapram_color_switchbox.row[row.row] + .ctl.r_color0_mux_select.r_color0_sel_color_r_i = 1; + } else if (row.row / 4U == curr_home_row / 8U) { /* same half as home */ + if (color_map_color) + map_alu.mapram_color_switchbox.row[row.row] + .ctl.r_color1_mux_select.r_color1_sel_oflo_color_r_i = 1; + else + map_alu.mapram_color_switchbox.row[row.row] + .ctl.r_color0_mux_select.r_color0_sel_oflo_color_r_i = 1; + } else { /* other half from home */ + map_alu.mapram_color_switchbox.row[row.row].ctl.t_oflo_color_o_mux_select = 1; + merge.mau_match_central_mapram_read_color_oflo_ctl |= 1U << color_map_color; + } + + /* + * Below diagrams show how select bits are set to + * route color data from meter alu located on the home + * row down to the color maprams + * ********************************************* + * - ROUTE FROM RIGHT TO BOTTOM + * Bus coming from Meter ALU on current home row + * .------------ + * | r_color_write_i + * v + * .---. + * | |<---- (select = 1'b1) + * | | b_oflo_color_write_o_sel_r_color_write_i + * .___. + * | + * | b_oflo_color_write_o + * v + * Bus going to color map rams below + * + * ********************************************* + * - ROUTE FROM TOP TO BOTTOM + * Bus coming from home row above + * | + * | t_oflo_color_write_i + * v + * .---. + * | |<---- (select = 1'b1) + * | | b_oflo_color_write_o_sel_t_oflo_color_write_i + * .___. + * | + * | b_oflo_color_write_o + * v + * Bus going to color map rams below + * + * ********************************************* + * - ROUTE FROM TOP TO RIGHT + * Bus coming from home row above + * | + * | t_oflo_color_write_i + * v + * .---. + * | |<---- (select = 1'b1) + * | | r_oflo_color_write_o_mux_select + * .___. 
+ * | + * | r_oflo_color_write_o + * .----------------> + * Bus going to color map rams on right + * + * ********************************************* + * + * A - Meter 1 Map Rams + * a - Meter 1 Color Map Rams + * B - Meter 1 Map Rams + * b - Meter 1 Color Map Rams + * + * Log Phy Columns SW Mtr + * Row Row 0 1 2 3 4 5 Box ALU + * .---..---..---..---..---..---. + * 15 7 | A || A || A || A || A || A | 3 3 + * .___..___..___..___..___..___. + * .---..---..---..---..---..---. + * 13 6 | A || A || A || A || a || a | + * .___..___..___..___..___..___. + * .---..---..---..---..---..---. + * 11 5 | B || B || B || B || B || a | 2 2 + * .___..___..___..___..___..___. + * .---..---..---..---..---..---. + * 9 4 | B || B || B || B || B || b | + * .___..___..___..___..___..___. + * .---..---..---..---..---..---. + * 7 3 | b || b || - || - || - || - | 1 1 + * .___..___..___..___..___..___. + * + * Meter Color Write Switchbox is configured to + * - set b_oflo_color_write_o_sel_r_color_write_i (1'b1) + * This routes meter alu 3 data down to rows 6 & 5 where + * meter 1 color maprams are located [6,4] [6,5] [5,5] + * + * Meter ALU 2 is configured to + * - set b_oflo_color_write_o_sel_t_oflo_color_write_i (1'b1) + * This routes meter alu data from above to the + * meter 1 color mapram located at [5,5] + * - set b_oflo_color_write_o_sel_r_color_write_i (1'b1) + * This routes meter alu 2 data down to rows 4 & 3 where + * meter 2 color maprams are located [4,5] [3,0] [3,1] + */ + if (row.row != curr_home_row / 2) { /* ALU home row */ + map_alu.mapram_color_write_switchbox[curr_home_row / 4U] + .ctl.b_oflo_color_write_o_mux_select.b_oflo_color_write_o_sel_r_color_write_i = 1; + map_alu.mapram_color_write_switchbox[row.row / 2U].ctl.r_oflo_color_write_o_mux_select = + 1; + BUG_CHECK(curr_home_row / 4U >= row.row / 2U); + /* b_oflo_color_write_o_sel_t_oflo_color_write_i must be set for all + * switchboxes below the homerow and above current row + * It should never be set for a switchbox 
above the home row + * It should never be set on the switchbox on the current row + * as that would drive the top overflow down to any color maprams below. + * This is invalid and can cause corruption if there is another meter occupying + * color maprams on the below row. + */ + // Switch box below home row + int switchbox_upper = curr_home_row / 4U - 1; + // Switch box above current row + int switchbox_lower = row.row % 2 ? (int)row.row / 2U + 1 : (int)row.row / 2U; + for (int i = switchbox_upper; i >= switchbox_lower; i--) { + if (i == 3) continue; // Never set on top switchbox + + map_alu.mapram_color_write_switchbox[i] + .ctl.b_oflo_color_write_o_mux_select + .b_oflo_color_write_o_sel_t_oflo_color_write_i = 1; + } + } + auto &map_alu_row = map_alu.row[row.row]; + auto vpn = row.vpns.begin(); + if (color_mapram_addr == STATS_MAP_ADDR) { + BUG_CHECK((row.row % 2) == 0); + for (MatchTable *m : match_tables) + adrdist.mau_ad_stats_virt_lt[row.row / 2] |= (1U << m->logical_id); + } + // Enable the row to be used (even if only color maprams are on this row) + map_alu_row.i2portctl.synth2port_ctl.synth2port_enable = 1; + // If the color mapram is not on the same row as the meter ALU, even if no meter + // RAMs are on the same row, the address still needs to overflow to that row + if (row.row < curr_home_row / 2) { + auto &adr_ctl = map_alu_row.vh_xbars.adr_dist_oflo_adr_xbar_ctl[1]; + // Mapram rows are 0-7, not 0-15 like logical rows + if (curr_home_row >= UPPER_MATCH_CENTRAL_FIRST_LOGICAL_ROW && + row.row < UPPER_MATCH_CENTRAL_FIRST_ROW) { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = 0; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::OVERFLOW; + push_on_overflow = true; + BUG_CHECK(options.target == TOFINO); + } else { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = curr_home_row % 8; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::METER; + } + adr_ctl.adr_dist_oflo_adr_xbar_enable = 1; + } + + for (auto &memunit : row.memunits) { + int col = 
memunit.col; + BUG_CHECK(memunit.stage == INT_MIN && memunit.row == row.row, "bogus %s in row %d", + memunit.desc(), row.row); + auto &mapram_config = map_alu_row.adrmux.mapram_config[col]; + if (row.row == curr_home_row / 2) + mapram_config.mapram_color_bus_select = MapRam::ColorBus::COLOR; + else + mapram_config.mapram_color_bus_select = MapRam::ColorBus::OVERFLOW; + mapram_config.mapram_type = MapRam::COLOR; + mapram_config.mapram_logical_table = logical_id; + BUG_CHECK(vpn != row.vpns.end(), "vpn not found!"); + mapram_config.mapram_vpn = *vpn; + // These two registers must be programmed for meter-color map rams in this way as a + // work-around for hardware issue as described in TOF-1944 + // The basic problem is that software reads of the meter color map ram are only + // returning 6-bits of data instead of the necessary 8-bits. Hardware defaults to + // 6 bits, since the meter color map ram case is not explicitly called out. + // By setting these bits, all 8-bits will be returned. + mapram_config.mapram_parity_generate = 1; + mapram_config.mapram_parity_check = 0; + // glass does not set ecc for color maprams? 
+ // mapram_config.mapram_ecc_check = 1; + // mapram_config.mapram_ecc_generate = 1; + if (gress == INGRESS) + mapram_config.mapram_ingress = 1; + else + mapram_config.mapram_egress = 1; + mapram_config.mapram_enable = 1; + if (row.row != curr_home_row / 2) { /* ALU home row */ + mapram_config.mapram_color_write_bus_select = 1; + } + auto &ram_address_mux_ctl = map_alu_row.adrmux.ram_address_mux_ctl[1][col]; + if (row.row == curr_home_row / 2) { /* ALU home row */ + ram_address_mux_ctl.synth2port_radr_mux_select_home_row = 1; + } else { + ram_address_mux_ctl.synth2port_radr_mux_select_oflo = 1; + } + map_alu_row.i2portctl.synth2port_ctl.synth2port_mapram_color |= 1U << col; + ram_address_mux_ctl.map_ram_wadr_shift = 1; + ram_address_mux_ctl.map_ram_wadr_mux_select = MapRam::Mux::COLOR; + ram_address_mux_ctl.map_ram_wadr_mux_enable = 1; + ram_address_mux_ctl.map_ram_radr_mux_select_color = 1; + ram_address_mux_ctl.ram_ofo_stats_mux_select_statsmeter = 1; + // Indicating what bus to pull from, either stats or idletime for the color mapram + if (color_mapram_addr == IDLE_MAP_ADDR) { + ram_address_mux_ctl.ram_stats_meter_adr_mux_select_idlet = 1; + setup_muxctl(map_alu_row.vh_xbars.adr_dist_idletime_adr_xbar_ctl[col], + row.bus.at(Layout::IDLE_BUS) % 10); + } else if (color_mapram_addr == STATS_MAP_ADDR) { + ram_address_mux_ctl.ram_stats_meter_adr_mux_select_stats = 1; + } + if (gress) + regs.cfg_regs.mau_cfg_mram_thread[col / 3U] |= 1U << (col % 3U * 8U + row.row); + ++vpn; + } + } + + // Additional BUG_CHECK to verify that both these regs are not set on a switchbox + // - map_alu.mapram_color_write_switchbox[x].ctl.b_oflo_color_write_o_sel_r_color_write_i + // - map_alu.mapram_color_write_switchbox[x].ctl.b_oflo_color_write_o_sel_t_oflo_color_write_i + // Both these regs should never be set on a swithbox as it implies routing from both top and + // right map alu to the bottom rows. This leads to corruption of color data. 
+ // Additional BUG_CHECK to verify that top row switchbox does not have + // this regs set + // - map_alu.mapram_color_write_switchbox[x].ctl.b_oflo_color_write_o_sel_t_oflo_color_write_i + for (int i = 0; i <= 3; i++) { + auto t_oflo_write_i = map_alu.mapram_color_write_switchbox[i] + .ctl.b_oflo_color_write_o_mux_select + .b_oflo_color_write_o_sel_t_oflo_color_write_i == 1; + if (i == 3) { + BUG_CHECK(!t_oflo_write_i, + "Color maprams have invalid configuration" + " may cause corruption of color data from meter"); + } + auto r_oflo_write_i = + map_alu.mapram_color_write_switchbox[i] + .ctl.b_oflo_color_write_o_mux_select.b_oflo_color_write_o_sel_r_color_write_i == 1; + LOG5("i: " << i << "t_oflo: " << t_oflo_write_i << ", r_oflo: " << r_oflo_write_i); + BUG_CHECK(!(t_oflo_write_i & r_oflo_write_i), + "Color maprams have invalid configuration" + " may cause corruption of color data from meter"); + } +} + +template +void MeterTable::write_regs_vt(REGS ®s) { + LOG1("### Meter table " << name() << " write_regs " << loc()); + for (auto &ixb : input_xbar) ixb->write_regs(regs); + Layout *home = nullptr; + bool push_on_overflow = false; + auto &map_alu = regs.rams.map_alu; + auto &adrdist = regs.rams.match.adrdist; + DataSwitchboxSetup *swbox = nullptr; + for (Layout &logical_row : layout) { + unsigned row = logical_row.row / 2U; + unsigned side = logical_row.row & 1; /* 0 == left 1 == right */ + BUG_CHECK(side == 1); /* no map rams or alus on left side anymore */ + auto vpn = logical_row.vpns.begin(); + auto mapram = logical_row.maprams.begin(); + auto &map_alu_row = map_alu.row[row]; + auto home_it = home_rows.find(logical_row.row); + if (home_it != home_rows.end()) { + home = &logical_row; + swbox = new DataSwitchboxSetup(regs, this, logical_row.row, + (++home_it == home_rows.end()) ? 
-1 : *home_it); + } + BUG_CHECK(home != nullptr); + LOG2("# DataSwitchbox.setup_row(" << row << ") home=" << home->row / 2U); + swbox->setup_row(row); + for (auto &memunit : logical_row.memunits) { + int logical_col = memunit.col; + BUG_CHECK(memunit.stage == INT_MIN && memunit.row == logical_row.row, + "bogus %s in logical row %d", memunit.desc(), logical_row.row); + unsigned col = logical_col + 6 * side; + LOG2("# DataSwitchbox.setup_row_col(" << row << ", " << col << ", vpn=" << *vpn + << ") home=" << home->row / 2U); + swbox->setup_row_col(row, col, *vpn); + write_mapram_regs(regs, row, *mapram, *vpn, MapRam::METER); + if (gress) regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row); + ++mapram, ++vpn; + } + if (&logical_row == home) { + write_regs_home_row(regs, row); + } else { + auto &adr_ctl = map_alu_row.vh_xbars.adr_dist_oflo_adr_xbar_ctl[side]; + if (home->row >= UPPER_MATCH_CENTRAL_FIRST_LOGICAL_ROW && + logical_row.row < UPPER_MATCH_CENTRAL_FIRST_LOGICAL_ROW) { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = 0; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::OVERFLOW; + push_on_overflow = true; + BUG_CHECK(options.target == TOFINO); + } else { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = home->row % 8; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::METER; + } + adr_ctl.adr_dist_oflo_adr_xbar_enable = 1; + } + } + auto &merge = regs.rams.match.merge; + write_mapram_color_regs(regs, push_on_overflow); + if (home_rows.size() > 1) write_alu_vpn_range(regs); + + for (int home_row : home_rows) { + for (MatchTable *m : match_tables) { + adrdist.adr_dist_meter_adr_icxbar_ctl[m->logical_id] |= 1U << (home_row / 4U); + // auto &icxbar = adrdist.adr_dist_meter_adr_icxbar_ctl[m->logical_id]; + // icxbar.address_distr_to_logical_rows = 1U << home->row; + // icxbar.address_distr_to_overflow = push_on_overflow; + // if (direct) + // regs.cfg_regs.mau_cfg_lt_meter_are_direct |= 1 << m->logical_id; + 
adrdist.meter_color_output_map[m->logical_id].set_subfield(green_value, 0, 8); + adrdist.meter_color_output_map[m->logical_id].set_subfield(yellow_value, 8, 8); + adrdist.meter_color_output_map[m->logical_id].set_subfield(yellow_value, 16, 8); + adrdist.meter_color_output_map[m->logical_id].set_subfield(red_value, 24, 8); + if (type != LPF) adrdist.meter_enable |= 1U << m->logical_id; + /*auto &movereg_ad_ctl = adrdist.movereg_ad_ctl[m->logical_id]; + movereg_ad_ctl.movereg_meter_deferred = 1; + if (!color_maprams.empty()) + movereg_ad_ctl.movereg_ad_idle_as_mc = 1; + else + movereg_ad_ctl.movereg_ad_stats_as_mc = 1; + movereg_ad_ctl.movereg_ad_direct_meter = direct; + movereg_ad_ctl.movereg_ad_meter_shift = 7; */ + meter_color_logical_to_phys(regs, m->logical_id, home_row / 4U); + adrdist.mau_ad_meter_virt_lt[home_row / 4U] |= 1 << m->logical_id; + } + if (run_at_eop()) { + if (teop >= 0) { + setup_teop_regs(regs, home_row / 4U); + } else { + adrdist.deferred_ram_ctl[1][home_row / 4U].deferred_ram_en = 1; + adrdist.deferred_ram_ctl[1][home_row / 4U].deferred_ram_thread = gress; + if (gress) regs.cfg_regs.mau_cfg_dram_thread |= 0x10 << (home_row / 4U); + } + adrdist.meter_bubble_req[timing_thread(gress)].bubble_req_1x_class_en |= + 1 << ((home_row / 4U) + 4); + } else { + adrdist.meter_bubble_req[timing_thread(gress)].bubble_req_1x_class_en |= + 1 << (home_row / 4U); + adrdist.packet_action_at_headertime[1][home_row / 4U] = 1; + } + if (push_on_overflow) { + adrdist.oflo_adr_user[0] = adrdist.oflo_adr_user[1] = AdrDist::METER; + adrdist.deferred_oflo_ctl = 1 << ((home_row - 8) / 2U); + } + if (gress == INGRESS || gress == GHOST) { + merge.meter_alu_thread[0].meter_alu_thread_ingress |= 1U << home_row / 4U; + merge.meter_alu_thread[1].meter_alu_thread_ingress |= 1U << home_row / 4U; + } else if (gress == EGRESS) { + merge.meter_alu_thread[0].meter_alu_thread_egress |= 1U << home_row / 4U; + merge.meter_alu_thread[1].meter_alu_thread_egress |= 1U << home_row / 4U; + 
} + } + + for (auto &hd : hash_dist) hd.write_regs(regs, this); +} + +// FIXME -- refactor these specializations better +template <> +void MeterTable::meter_color_logical_to_phys(Target::Tofino::mau_regs ®s, int logical_id, + int alu) { + auto &merge = regs.rams.match.merge; + auto &adrdist = regs.rams.match.adrdist; + if (!color_maprams.empty()) { + merge.mau_mapram_color_map_to_logical_ctl[logical_id / 8].set_subfield( + 0x4 | alu, 3 * (logical_id % 8U), 3); + + // Determining which buses to send the color mapram address to + if (color_mapram_addr == IDLE_MAP_ADDR) { + adrdist.movereg_idle_ctl[logical_id].movereg_idle_ctl_mc = 1; + for (auto lo : color_maprams) { + int bus_index = lo.bus.at(Layout::IDLE_BUS); + // upper and lower idletime busses appear to be independent with + // no overflow between them + if (lo.row >= UPPER_MATCH_CENTRAL_FIRST_ROW) bus_index += IDLETIME_BUSSES_PER_HALF; + adrdist.adr_dist_idletime_adr_oxbar_ctl[bus_index / 4].set_subfield( + logical_id | 0x10, 5 * (bus_index % 4), 5); + } + + } else if (color_mapram_addr == STATS_MAP_ADDR) { + for (auto lo : color_maprams) { + adrdist.adr_dist_stats_adr_icxbar_ctl[logical_id] |= (1U << (lo.row / 2)); + adrdist.packet_action_at_headertime[0][lo.row / 2] = 1; + } + } else { + BUG(); + } + setup_muxctl(adrdist.meter_color_logical_to_phys_ixbar_ctl[logical_id], alu); + } +} + +template <> +void MeterTable::meter_color_logical_to_phys(Target::JBay::mau_regs ®s, int logical_id, + int alu) { + auto &merge = regs.rams.match.merge; + auto &adrdist = regs.rams.match.adrdist; + if (!color_maprams.empty()) { + merge.mau_mapram_color_map_to_logical_ctl[alu] |= 1 << logical_id; + // Determining which buses to send the color mapram address to + if (color_mapram_addr == IDLE_MAP_ADDR) { + adrdist.movereg_idle_ctl[logical_id].movereg_idle_ctl_mc = 1; + for (auto lo : color_maprams) { + int bus_index = lo.bus.at(Layout::IDLE_BUS); + // No overflow bus exist between upper and lower half so every color mapram have 
+ // to use their respective bus + if (lo.row >= UPPER_MATCH_CENTRAL_FIRST_ROW) bus_index += IDLETIME_BUSSES_PER_HALF; + adrdist.adr_dist_idletime_adr_oxbar_ctl[bus_index / 4].set_subfield( + logical_id | 0x10, 5 * (bus_index % 4), 5); + } + + } else if (color_mapram_addr == STATS_MAP_ADDR) { + for (auto lo : color_maprams) { + adrdist.adr_dist_stats_adr_icxbar_ctl[logical_id] |= (1U << (lo.row / 2)); + adrdist.packet_action_at_headertime[0][lo.row / 2] = 1; + } + } else { + BUG(); + } + } + adrdist.meter_color_logical_to_phys_icxbar_ctl[logical_id] |= 1 << alu; +} + +void MeterTable::gen_tbl_cfg(json::vector &out) const { + // FIXME -- factor common Synth2Port stuff + auto spare_mems = determine_spare_bank_memory_units(); + int size = (layout_size() - spare_mems.size()) * SRAM_DEPTH; + json::map &tbl = *base_tbl_cfg(out, "meter", size); + json::map &stage_tbl = *add_stage_tbl_cfg(tbl, "meter", size); + stage_tbl["color_memory_resource_allocation"] = + gen_memory_resource_allocation_tbl_cfg("map_ram", color_maprams); + switch (type) { + case STANDARD: + tbl["meter_type"] = "standard"; + tbl["meter_profile"] = profile; + break; + case LPF: + tbl["meter_type"] = "lpf"; + break; + case RED: + tbl["meter_type"] = "red"; + break; + default: + tbl["meter_type"] = "standard"; + break; + } + switch (count) { + case PACKETS: + tbl["meter_granularity"] = "packets"; + break; + case BYTES: + tbl["meter_granularity"] = "bytes"; + break; + default: + tbl["meter_granularity"] = "packets"; + break; + } + tbl["enable_color_aware_pfe"] = color_aware_per_flow_enable; + /* this is not needed. 
but the driver asserts on existence of + * this or enable_color_aware which both seem to be redundant */ + tbl["pre_color_field_name"] = ""; + tbl["enable_pfe"] = per_flow_enable; + tbl["pfe_bit_position"] = per_flow_enable_bit(); + tbl["color_aware_pfe_address_type_bit_position"] = 0; // FIXME + tbl["reference_dictionary"] = json::map(); // To be removed in future + stage_tbl["default_lower_huffman_bits_included"] = METER_LOWER_HUFFMAN_BITS; + if (home_rows.size() > 1) + add_alu_indexes(stage_tbl, "meter_alu_index"); + else + add_alu_index(stage_tbl, "meter_alu_index"); + if (context_json) stage_tbl.merge(*context_json); +} + +DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(MeterTable, TARGET_CLASS) +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void MeterTable::write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + { write_merge_regs_vt(regs, match, type, bus, args); }) diff --git a/backends/tofino/bf-asm/misc.cpp b/backends/tofino/bf-asm/misc.cpp new file mode 100644 index 00000000000..28cddb8932f --- /dev/null +++ b/backends/tofino/bf-asm/misc.cpp @@ -0,0 +1,223 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "misc.h" + +#include +#include +#include + +#include "backends/tofino/bf-asm/target.h" +#include "bfas.h" + +int remove_name_tail_range(std::string &name, int *size) { + auto tail = name.rfind('.'); + if (tail == std::string::npos) return 0; + unsigned lo, hi; + int len = -1; + if (sscanf(&name[tail], ".%u-%u%n", &lo, &hi, &len) >= 2 && tail + len == name.size() && + hi >= lo) { + name.erase(tail); + if (size) *size = hi - lo + 1; + return lo; + } + return 0; +} + +std::string int_to_hex_string(unsigned val, unsigned width) { + std::stringstream sval; + sval << std::setfill('0') << std::setw(width) << std::hex << val << std::setfill(' '); + return sval.str(); +} + +void add_cfg_reg(json::vector &cfg_cache, std::string full_name, std::string name, + std::string val) { + json::map cfg_cache_reg; + cfg_cache_reg["fully_qualified_name"] = full_name; + cfg_cache_reg["name"] = name; + cfg_cache_reg["value"] = val; + cfg_cache.push_back(std::move(cfg_cache_reg)); +} + +bool check_zero_string(const std::string &s) { + char zero = '0'; + return s.find_first_not_of(zero) == std::string::npos; +} + +std::string get_filename(const char *s) { + std::string fname = s; + fname = fname.substr(fname.find_last_of("/") + 1); + fname = fname.substr(0, fname.find_last_of(".")); + return fname; +} + +std::string get_directory(const char *s) { + std::string fname = s; + auto tail = fname.find_last_of("/"); + if (tail == std::string::npos) + fname = "."; + else + fname = fname.substr(0, tail); + return fname; +} + +/* Given a p4 name, split into instance and field names if possible + * - else return a copy of the original name */ +void gen_instfield_name(const std::string &fullname, std::string &instname, + std::string &field_name) { + auto dotpos = fullname.rfind('.'); + if (dotpos == std::string::npos) { + instname = fullname; + field_name = std::string(); + } else { + instname = fullname.substr(0, dotpos); + field_name = 
fullname.substr(dotpos + 1, fullname.size()); + } +} + +uint64_t bitMask(unsigned size) { + BUG_CHECK(size <= 64 && "bitMask(size), maximum size is 64"); + if (size == 64) return ~UINT64_C(0); + return (UINT64_C(1) << size) - 1; +} + +uint64_t bitRange(unsigned lo, unsigned hi) { + BUG_CHECK(hi >= lo && hi < 64, "bitRange(%u,%u) invalid", lo, hi); + if (lo == 0 && hi + 1 == 64) return ~UINT64_C(0); + return ((UINT64_C(1) << (hi - lo + 1)) - 1) << lo; +} + +int parity(uint32_t v) { + v ^= v >> 16; + v ^= v >> 8; + v ^= v >> 4; + v ^= v >> 2; + v ^= v >> 1; + return v & 1; +} + +int parity_2b(uint32_t v) { + v ^= v >> 16; + v ^= v >> 8; + v ^= v >> 4; + v ^= v >> 2; + return v & 3; +} + +bool check_bigint_unsigned(value_t value, uint32_t byte_width) { + BUG_CHECK(value.type == tBIGINT); + + /* -- zero is in the range */ + if (value.bigi.size == 0) return true; + + constexpr uint64_t size_bigint_item(sizeof(value.bigi.data[0])); + + bool overflow(false); + + /* -- all items above the max_index must by zero */ + const uint64_t max_index(((byte_width + size_bigint_item - 1) / size_bigint_item) - 1); + for (int i(max_index + 1); i < value.bigi.size; ++i) { + if (value.bigi.data[i] != 0) { + overflow = true; + } + } + /* -- check limit in the boundary bigint part */ + if (value.bigi.size > max_index) { + const uint64_t ext_width(byte_width % size_bigint_item); + if (ext_width > 0 && value.bigi.data[max_index] >= (1 << (ext_width * 8))) { + overflow = true; + } + } + if (overflow) { + error(value.lineno, "the integer constant is wider than the requested width %u bytes", + byte_width); + return false; + } + + return true; +} + +bool input_int_match(const value_t value, match_t &match, int width) { + BUG_CHECK(width <= sizeof(match_t::word0) * 8); + + using MatchType = decltype(match_t::word0); + MatchType mask; + if (width < sizeof(MatchType) * 8) + mask = (1ULL << width) - 1; + else + mask = std::numeric_limits::max(); + if (value.type == tINT) { + if 
(!check_range_strict(value, 0, mask)) return false; + convert_i2m(value.i, match); + } else if (value.type == tBIGINT) { + /* -- As the match type is uint64_t and value_t::i is int64_t, constants + * above 0x7fffffffffffffff are passed as big integers. */ + if (value.bigi.size > 1) { + error(value.lineno, "the match constant is out of the expected range <0, %lu>", mask); + return false; + } + MatchType v(0); + if (value.bigi.size > 0) v = value.bigi.data[0]; + if (v > mask) { + error(value.lineno, "the match constant is out of the expected range <0, %lu>", mask); + return false; + } + convert_i2m(v, match); + } else { + value_t fixed_value = value; + fixed_value.m = value.m; + fix_match_star(fixed_value.m, mask); + if (!check_range_match(fixed_value, mask, width)) return false; + match = fixed_value.m; + } + return true; +} + +unsigned match_t::dirtcam(unsigned width, unsigned bit) { + static unsigned masks[] = {0x5555, 0x3333, 0xf0f0, 0xffff}; + BUG_CHECK(width <= 4, "dirtcam of more than 4 bits?"); + unsigned rv = (1U << (1U << width)) - 1; + for (unsigned i = 0; i < width; ++i, ++bit) { + if (!((word0 >> bit) & 1)) rv &= ~masks[i]; + if (!((word1 >> bit) & 1)) rv &= masks[i]; + } + return rv; +} + +unsigned wmatch_t::dirtcam(unsigned width, unsigned bit) { + static unsigned masks[] = {0x5555, 0x3333, 0xf0f0, 0xffff}; + BUG_CHECK(width <= 4, "dirtcam of more than 4 bits?"); + unsigned rv = (1U << (1U << width)) - 1; + for (unsigned i = 0; i < width; ++i, ++bit) { + // treat both bits 0 as don't care rather than never match + if (!word0[bit] && !word1[bit]) continue; + if (!word0[bit]) rv &= ~masks[i]; + if (!word1[bit]) rv &= masks[i]; + } + return rv; +} + +bool require_keys(const value_t &data, std::set keys) { + for (auto key : keys) { + pair_t *kv = data.map[key]; + if (!kv) { + error(data.lineno, "missing required key '%s'", key); + return false; + } + } + return true; +} diff --git a/backends/tofino/bf-asm/misc.h b/backends/tofino/bf-asm/misc.h new file 
mode 100644 index 00000000000..910e666a632 --- /dev/null +++ b/backends/tofino/bf-asm/misc.h @@ -0,0 +1,218 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_MISC_H_ +#define BACKENDS_TOFINO_BF_ASM_MISC_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "asm-types.h" +#include "backends/tofino/bf-asm/json.h" + +template +auto setup_muxctl(T ®, int val) -> decltype((void)reg.enabled_2bit_muxctl_enable) { + reg.enabled_2bit_muxctl_select = val; + reg.enabled_2bit_muxctl_enable = 1; +} +template +auto setup_muxctl(T ®, int val) -> decltype((void)reg.enabled_3bit_muxctl_enable) { + reg.enabled_3bit_muxctl_select = val; + reg.enabled_3bit_muxctl_enable = 1; +} +template +auto setup_muxctl(T ®, int val) -> decltype((void)reg.enabled_4bit_muxctl_enable) { + reg.enabled_4bit_muxctl_select = val; + reg.enabled_4bit_muxctl_enable = 1; +} +template +auto setup_muxctl(T ®, int val) -> decltype((void)reg.enabled_5bit_muxctl_enable) { + reg.enabled_5bit_muxctl_select = val; + reg.enabled_5bit_muxctl_enable = 1; +} +template +auto setup_muxctl(T ®, int val) -> decltype((void)reg.exactmatch_row_vh_xbar_enable) { + reg.exactmatch_row_vh_xbar_select = val; + reg.exactmatch_row_vh_xbar_enable = 1; +} + +template +void append(std::vector &a, const std::vector &b) { + for (auto &e : b) a.push_back(e); +} + 
+template +T join(const std::vector &vec, U sep) { + T rv; + bool first = true; + for (auto &el : vec) { + if (first) + first = false; + else + rv += sep; + rv += el; + } + return rv; +} + +extern int remove_name_tail_range(std::string &, int *size = nullptr); + +// Convert an integer to hex string of specified width (in bytes) +std::string int_to_hex_string(unsigned val, unsigned width); + +// Add a reg to CJSON Configuration Cache +void add_cfg_reg(json::vector &cfg_cache, std::string full_name, std::string name, std::string val); + +bool check_zero_string(const std::string &s); + +// Get filename +std::string get_filename(const char *s); +std::string get_directory(const char *s); + +/** Given a p4 name, eg. "inst.field", write "inst" to @instname and "field" to + * @fieldname. If @fullname cannot be split, writes @fullname to @instname and + * "" to @fieldname. + */ +void gen_instfield_name(const std::string &fullname, std::string &instname, std::string &fieldname); + +/// Compare pointers based on the pointed at type +/// For use as a Comparator for map/set types +template +struct ptrless { + bool operator()(const T *a, const T *b) const { return b ? a ? *a < *b : true : false; } + bool operator()(const std::unique_ptr &a, const std::unique_ptr &b) const { + return b ? a ? 
*a < *b : true : false; + } +}; + +/* word with size (lowest) bits set */ +uint64_t bitMask(unsigned size); +/* word with range of bits from lo to hi (inclusive) set */ +uint64_t bitRange(unsigned lo, unsigned hi); + +int parity(uint32_t v); +int parity_2b(uint32_t v); // two-bit parity (parity of pairs in the word) + +inline bool check_value(const value_t value, const decltype(value_t::i) expected) { + if (!CHECKTYPE(value, tINT)) return false; + if (value.i != expected) { + error(value.lineno, "unexpected value %ld; expected %ld", value.i, expected); + return false; + } + return true; +} + +/** + * @brief Check range of an input integer value (tINT) + * + * This method is designated mainly for checking input integer constants. The template + * parameter defines target type in which the value is going to be stored. As the + * higher limit is quite often 0xffff... we must handle signed and unsigned integers + * correctly. + * + * @tparam IntType Target type which the value will be stored in. + * @param value The checked value + * @param lo lower inclusive limit + * @param hi higher include limit + * @return False if the value is out of the specified limits + */ +template ::value>::type> +bool check_range_strict(value_t value, IntType lo, IntType hi) { + auto format_error_message([](value_t value, IntType lo, IntType hi) { + /* -- As we don't know actual type of the IntType, we cannot use the printf-like + * formatting. */ + std::ostringstream oss; + oss << "value " << value.i << " is out of allowed range <" << +lo << "; " << +hi << ">"; + error(value.lineno, "%s", oss.str().c_str()); + }); + + if (!CHECKTYPE(value, tINT)) return false; + + /* -- Handle different ranges (signed, unsigned, different size) of the value_t::i + * and IntType. 
*/ + typedef boost::numeric::converter Converter; + if (Converter::out_of_range(value.i)) { + format_error_message(value, lo, hi); + return false; + } + + /* -- Now check requested limits */ + IntType converted(static_cast(value.i)); + if (converted < lo || converted > hi) { + format_error_message(value, lo, hi); + return false; + } + return true; +} + +inline bool check_range(const value_t value, const decltype(value_t::i) lo, + const decltype(value_t::i) hi) { + return check_range_strict(value, lo, hi); +} + +inline bool check_range_match(const value_t &match, const decltype(match_t::word0) mask, + int width) { + if (!CHECKTYPE(match, tMATCH)) return false; + if ((match.m.word0 | match.m.word1) != mask) { + error(match.lineno, "invalid match width; expected %i bits", width); + return false; + } + return true; +} + +template +void convert_i2m(IntType i, match_t &m) { + static_assert(sizeof(IntType) == sizeof(match_t::word0)); + static_assert(std::is_integral::value); + + m.word0 = ~static_cast(i); + m.word1 = static_cast(i); +} + +bool check_bigint_unsigned(value_t value, uint32_t byte_width); + +/// * is parsed as match_t::word0 == 0 && match_t::word1 == 0. +/// The function converts the match according to the specified with @p mask. +inline void fix_match_star(match_t &match, const decltype(match_t::word0) mask) { + if (match.word0 == 0 && match.word1 == 0) match.word0 = match.word1 = mask; +} + +/// The function reads a tINT or tMATCH value, performs range checks, and converts +/// the value to a new tMATCH value. +/// @param value Input value +/// @param match Output value +/// @param width Expected width of the input value used for range checks +/// @pre @p value must be a tINT or tMATCH value. +/// @return True if the value is correctly parsed +bool input_int_match(const value_t value, match_t &match, int width); + +/// Check if a tMAP value contains all the given keys. 
+/// @param value A tMAP value +/// @param keys A set of keys +/// @pre @p value must be a tMAP +/// @return True if the given keys are a subset of the map's keys +bool require_keys(const value_t &data, std::set keys); + +#endif /* BACKENDS_TOFINO_BF_ASM_MISC_H_ */ diff --git a/backends/tofino/bf-asm/mksizes.cpp b/backends/tofino/bf-asm/mksizes.cpp new file mode 100644 index 00000000000..746d1509e66 --- /dev/null +++ b/backends/tofino/bf-asm/mksizes.cpp @@ -0,0 +1,40 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +int main() { + if (sizeof(unsigned long long) == 2 * sizeof(uintptr_t)) + std::cout << "#define uint2ptr_t unsigned long long" << std::endl; + else if (sizeof(unsigned long) == 2 * sizeof(uintptr_t)) + std::cout << "#define uint2ptr_t unsigned long" << std::endl; + else if (sizeof(unsigned) == 2 * sizeof(uintptr_t)) + std::cout << "#define uint2ptr_t unsigned" << std::endl; + else if (sizeof(unsigned long) * 2 == sizeof(uintptr_t)) + std::cout << "#define uinthptr_t unsigned long" << std::endl; + else if (sizeof(unsigned) * 2 == sizeof(uintptr_t)) + std::cout << "#define uinthptr_t unsigned" << std::endl; + else if (sizeof(unsigned short) * 2 == sizeof(uintptr_t)) + std::cout << "#define uinthptr_t unsigned short" << std::endl; + else { + std::cerr << "Can't find a type that is 2x or 1/2x a uintptr_t" << std::endl; + return 1; + } + return 0; +} diff --git a/backends/tofino/bf-asm/mktags b/backends/tofino/bf-asm/mktags new file mode 100755 index 00000000000..fb7fd6e555b --- /dev/null +++ b/backends/tofino/bf-asm/mktags @@ -0,0 +1,23 @@ +#!/bin/sh + +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +# +# SPDX-License-Identifier: Apache-2.0 + +ctags -R -I VECTOR --exclude=test --exclude=submodules \ + --regex-C++='/^DECLARE_(ABSTRACT_)?TABLE_TYPE\(([a-zA-Z0-9_]+)/\2/c/' + +ctags -a -R $HOME/bf-utils/include/bfutils diff --git a/backends/tofino/bf-asm/p4_table.cpp b/backends/tofino/bf-asm/p4_table.cpp new file mode 100644 index 00000000000..e6461273822 --- /dev/null +++ b/backends/tofino/bf-asm/p4_table.cpp @@ -0,0 +1,254 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "p4_table.h" + +#include "backends/tofino/bf-asm/tables.h" + +static std::map alpms; + +std::map P4Table::by_handle; +std::map> P4Table::by_name; +unsigned P4Table::max_handle[7]; + +// handle[29:24] is used as type field. +const char *P4Table::type_name[] = {0, "match", "action", "selection", "statistics", + "meter", "stateful"}; + +// handle[19:16] is used as handle offset field for multipipe +static unsigned apply_handle_offset(unsigned handle, unsigned offset) { + return handle | (offset & 0xff) << 16; +} + +// clear bit[19:16] which is used to encode pipe_id. 
+static unsigned clear_handle_offset(unsigned handle) { return handle & 0xff00ffff; } + +P4Table *P4Table::get(P4Table::type t, VECTOR(pair_t) & data) { + BUG_CHECK(t < NUM_TABLE_TYPES); + P4Table *rv; + auto *h = ::get(data, "handle"); + auto *n = ::get(data, "name"); + if (h) { + if (!CHECKTYPE(*h, tINT)) return nullptr; + unsigned handle = h->i; + handle = clear_handle_offset(handle); + if (handle >> 24 && handle >> 24 != t) { + error(h->lineno, "Incorrect handle type %d for %s table", handle >> 24, type_name[t]); + return 0; + } + handle &= 0xffffff; + if (!handle) { + error(h->lineno, "zero handle"); + return 0; + } + if (handle > max_handle[t]) max_handle[t] = handle; + handle |= t << 24; + handle = apply_handle_offset(handle, unique_table_offset); + if (!(rv = by_handle[handle])) { + if (!n || !CHECKTYPE(*n, tSTR) || !by_name[t].count(n->s) || + (rv = by_name[t][n->s])->handle != (unsigned)t << 24) + rv = by_handle[handle] = new P4Table; + rv->handle = handle; + } + } else if (n) { + if (!CHECKTYPE(*n, tSTR)) return 0; + if (!(rv = by_name[t][n->s])) { + rv = by_name[t][n->s] = new P4Table; + rv->name = n->s; + rv->handle = apply_handle_offset(++max_handle[t] | (t << 24), unique_table_offset); + } + } else { + error(data.size ? 
data[0].key.lineno : 0, "no handle or name in p4 info"); + return 0; + } + for (auto &kv : MapIterChecked(data)) { + if (rv->lineno <= 0 || rv->lineno > kv.key.lineno) rv->lineno = kv.key.lineno; + if (kv.key == "handle") { + } else if (kv.key == "name") { + if (CHECKTYPE(kv.value, tSTR)) { + if (!rv->name.empty() && rv->name != kv.value.s) { + error(kv.value.lineno, "Inconsistent P4 name for handle 0x%x", rv->handle); + warning(rv->lineno, "Previously set here"); + } else if (rv->name.empty()) { + rv->name = kv.value.s; + if (!by_name[t].count(rv->name)) by_name[t][rv->name] = rv; + } + } + } else if (kv.key == "size") { + if (CHECKTYPE(kv.value, tINT)) { + if (rv->explicit_size && rv->size != (unsigned)kv.value.i) { + error(kv.value.lineno, "Inconsistent size for P4 handle 0x%x", rv->handle); + warning(rv->lineno, "Previously set here"); + } else { + rv->size = kv.value.i; + rv->explicit_size = true; + } + } + } else if (kv.key == "action_profile") { + if (CHECKTYPE(kv.value, tSTR)) rv->action_profile = kv.value.s; + } else if (kv.key == "match_type") { + if (CHECKTYPE(kv.value, tSTR)) rv->match_type = kv.value.s; + } else if (kv.key == "preferred_match_type") { + if (CHECKTYPE(kv.value, tSTR)) rv->preferred_match_type = kv.value.s; + } else if (kv.key == "disable_atomic_modify") { + if (CHECKTYPE(kv.value, tSTR)) + if (strncmp(kv.value.s, "true", 4) == 0) rv->disable_atomic_modify = true; + } else if (kv.key == "stage_table_type") { + if (CHECKTYPE(kv.value, tSTR)) rv->stage_table_type = kv.value.s; + } else if (kv.key == "how_referenced") { + if (CHECKTYPE(kv.value, tSTR)) { + if (strcmp(kv.value.s, "direct") != 0 && strcmp(kv.value.s, "indirect") != 0) + error(kv.value.lineno, "how_referenced must be either direct or indirect"); + else + rv->how_referenced = kv.value.s; + } + } else if (kv.key == "hidden") { + rv->hidden = get_bool(kv.value); + } else { + warning(kv.key.lineno, "ignoring unknown item %s in p4 info", value_desc(kv.key)); + } + } + return rv; +} 
+ +P4Table *P4Table::alloc(P4Table::type t, Table *tbl) { + unsigned handle = apply_handle_offset(++max_handle[t] | (t << 24), unique_table_offset); + P4Table *rv = by_handle[handle] = new P4Table; + rv->handle = handle; + rv->name = tbl->name(); + return rv; +} + +void P4Table::check(Table *tbl) { + if (name.empty()) name = tbl->name(); + if (!(handle & 0xffffff)) { + auto table_type = (handle >> 24) & 0x3f; + handle += ++max_handle[table_type]; + } +} + +json::map *P4Table::base_tbl_cfg(json::vector &out, int size, const Table *table) const { + json::map *tbl_ptr = nullptr; + for (auto &_table_o : out) { + auto &_table = _table_o->to(); + if (_table["name"] == name) { + if (_table["handle"] && _table["handle"] != handle) continue; + tbl_ptr = &_table; + break; + } + } + if (!tbl_ptr) { + tbl_ptr = new json::map(); + out.emplace_back(tbl_ptr); + } + json::map &tbl = *tbl_ptr; + tbl["direction"] = direction_name(table->gress); + if (handle) tbl["handle"] = handle; + auto table_type = (handle >> 24) & 0x3f; + BUG_CHECK(table_type < NUM_TABLE_TYPES); + tbl["name"] = p4_name(); + tbl["table_type"] = type_name[table_type]; + if (!explicit_size && tbl["size"]) + tbl["size"]->as_number()->val += size; + else + tbl["size"] = explicit_size ? this->size : size; + if (hidden) tbl["p4_hidden"] = true; + return &tbl; +} + +void P4Table::base_alpm_tbl_cfg(json::map &out, int size, const Table *table, + P4Table::alpm_type atype) const { + if (is_alpm()) { + json::map **alpm_cfg = nullptr; + unsigned *alpm_table_handle = nullptr; + auto *alpm = &alpms[this]; + if (alpm) { + auto p4Name = p4_name(); + if (!p4Name) { + error(table->lineno, "No p4 table found for alpm table : %s", table->name()); + return; + } + std::string name = p4Name; + if (atype == P4Table::PreClassifier) { + alpm_cfg = &alpm->alpm_pre_classifier_table_cfg; + alpm_table_handle = &alpm->alpm_pre_classifier_table_handle; + // Both alpm pre-classifier and atcam tables share the same + // table name. 
For driver to uniquely distinguish a + // pre-classifier from the atcam table during snapshot, we add a + // suffix to the p4 name + name += "_pre_classifier"; + } else if (atype == P4Table::Atcam) { + alpm_cfg = &alpm->alpm_atcam_table_cfg; + alpm_table_handle = &alpm->alpm_atcam_table_handle; + } + *alpm_cfg = &out; + json::map &tbl = out; + tbl["direction"] = direction_name(table->gress); + auto table_type = (handle >> 24) & 0x3f; + BUG_CHECK(table_type < NUM_TABLE_TYPES); + if (!(*alpm_table_handle & 0xffffff)) + *alpm_table_handle = apply_handle_offset( + (P4Table::MatchEntry << 24) + (++max_handle[table_type]), unique_table_offset); + if (*alpm_table_handle) tbl["handle"] = *alpm_table_handle; + tbl["name"] = name; + tbl["table_type"] = type_name[table_type]; + tbl["size"] = explicit_size ? this->size : size; + } + } +} + +void P4Table::set_partition_action_handle(unsigned handle) { + alpms[this].set_partition_action_handle.insert(handle); +} + +void P4Table::set_partition_field_name(std::string name) { + alpms[this].partition_field_name = name; +} + +std::string P4Table::get_partition_field_name() const { + if (alpms.count(this)) return alpms[this].partition_field_name; + return ""; +} + +std::set P4Table::get_partition_action_handle() const { + if (alpms.count(this)) { + return alpms[this].set_partition_action_handle; + } + return {}; +} + +unsigned P4Table::get_alpm_atcam_table_handle() const { + if (alpms.count(this)) return alpms[this].alpm_atcam_table_handle; + return 0; +} + +std::string P4Table::direction_name(gress_t gress) { + switch (gress) { + case INGRESS: + return "ingress"; + break; + case EGRESS: + return "egress"; + break; + case GHOST: + return "ghost"; + break; + default: + BUG(); + } + return ""; +} diff --git a/backends/tofino/bf-asm/p4_table.h b/backends/tofino/bf-asm/p4_table.h new file mode 100644 index 00000000000..6b937c7539d --- /dev/null +++ b/backends/tofino/bf-asm/p4_table.h @@ -0,0 +1,94 @@ +/** + * Copyright (C) 2024 Intel 
Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_P4_TABLE_H_ +#define BACKENDS_TOFINO_BF_ASM_P4_TABLE_H_ + +#include +#include + +#include "asm-types.h" +#include "backends/tofino/bf-asm/json.h" + +class Table; +class P4Table; + +struct alpm_t { + std::string partition_field_name = ""; + unsigned alpm_atcam_table_handle = 0; + unsigned alpm_pre_classifier_table_handle = 0; + std::set set_partition_action_handle; + json::map *alpm_atcam_table_cfg = 0; // handle to cjson alpm table + json::map *alpm_pre_classifier_table_cfg = 0; // handle to cjson ternary pre classifier table +}; + +class P4Table { + int lineno = -1; + std::string name, preferred_match_type; + std::string stage_table_type; + unsigned handle = 0; + bool explicit_size = false; + bool hidden = false; + json::map *config = 0; + P4Table() {} + + public: + bool disable_atomic_modify = false; + unsigned size = 0; + std::string match_type, action_profile, how_referenced; + enum type { + None = 0, + MatchEntry = 1, + ActionData = 2, + Selection = 3, + Statistics = 4, + Meter = 5, + Stateful = 6, + NUM_TABLE_TYPES = 7 + }; + enum alpm_type { PreClassifier = 1, Atcam = 2 }; + static const char *type_name[]; + + private: + static std::map by_handle; + static std::map> by_name; + static unsigned max_handle[]; + + public: + static P4Table *get(type t, VECTOR(pair_t) & d); + static P4Table *alloc(type t, Table *tbl); + 
void check(Table *tbl); + const char *p4_name() const { return name.empty() ? nullptr : name.c_str(); } + unsigned get_handle() { return handle; } + unsigned p4_size() { return size; } + std::string p4_stage_table_type() { return stage_table_type; } + json::map *base_tbl_cfg(json::vector &out, int size, const Table *table) const; + void base_alpm_tbl_cfg(json::map &out, int size, const Table *table, + P4Table::alpm_type atype) const; + bool is_alpm() const { + if (match_type == "alpm") return true; + return false; + } + void set_partition_action_handle(unsigned handle); + void set_partition_field_name(std::string name); + std::string get_partition_field_name() const; + std::set get_partition_action_handle() const; + unsigned get_alpm_atcam_table_handle() const; + static std::string direction_name(gress_t); +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_P4_TABLE_H_ */ diff --git a/backends/tofino/bf-asm/parser-tofino-jbay.cpp b/backends/tofino/bf-asm/parser-tofino-jbay.cpp new file mode 100644 index 00000000000..fbe83db78a4 --- /dev/null +++ b/backends/tofino/bf-asm/parser-tofino-jbay.cpp @@ -0,0 +1,2026 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "parser-tofino-jbay.h" + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/target.h" +#include "constants.h" +#include "lib/algorithm.h" +#include "lib/ordered_set.h" +#include "lib/range.h" +#include "misc.h" +#include "phv.h" +#include "top_level.h" +#include "vector.h" + +/* Dummy specializations so that all specializations are covered */ + +void AsmParser::init_port_use(bitvec &port_use, const value_t &arg) { + if (arg.type == tVEC) { + for (int i = 0; i < arg.vec.size; i++) { + init_port_use(port_use, arg[i]); + } + } else if (arg.type == tRANGE) { + if (arg.range.hi > arg.range.lo) + error(arg.lineno, "port range hi index %d cannot be smaller than lo index %d", + arg.range.hi, arg.range.lo); + port_use.setrange(arg.range.lo, arg.range.hi - arg.range.lo + 1); + } else if (arg.type == tINT) { + port_use.setbit(arg.i); + } +} + +void AsmParser::start(int lineno, VECTOR(value_t) args) { + if (args.size != 0 && args[0] != "ingress" && args[0] != "egress" && + (args[0] != "ghost" || options.target < JBAY)) + error(lineno, "parser must specify ingress%s or egress", + options.target >= JBAY ? 
", ghost" : ""); +} + +void AsmParser::input(VECTOR(value_t) args, value_t data) { + if (args.size > 0 && args[0] == "ghost") { + // Backward compatibility for old ghost parser syntax + // ghost parser : W0 + if (data.type == tVEC) { + for (int i = 0; i < data.vec.size; i++) { + ghost_parser.push_back(Phv::Ref(GHOST, 0, data[i])); + } + // New ghost parser syntax + // parser ghost: + // ghost_md: W0 + // pipe_mask: 0 + } else if (data.type == tMAP) { + for (auto &kv : MapIterChecked(data.map, true)) { + if (kv.key == "ghost_md") { + if (kv.value.type == tVEC) { + for (int i = 0; i < kv.value.vec.size; i++) { + ghost_parser.push_back(Phv::Ref(GHOST, 0, data[i])); + } + } else { + ghost_parser.push_back(Phv::Ref(GHOST, 0, kv.value)); + } + } else if (kv.key == "pipe_mask") { + if (!CHECKTYPE(kv.value, tINT)) continue; + ghost_pipe_mask = kv.value.i; + } + } + } else { + ghost_parser.push_back(Phv::Ref(GHOST, 0, data)); + } + return; + } + + gress_t gress = (args.size > 0 && args[0] == "egress") ? 
EGRESS : INGRESS; + auto *p = new Parser(phv_use, gress, parser[gress].size()); + parser[gress].push_back(p); + if (args.size == 1) { + p->port_use.setrange(0, Target::NUM_PARSERS()); + } else if (args.size == 2) { + init_port_use(p->port_use, args[1]); + } + p->input(args, data); +} + +void AsmParser::process() { + for (auto gress : Range(INGRESS, EGRESS)) { + for (auto p : parser[gress]) { + p->ghost_parser = ghost_parser; + p->ghost_pipe_mask = ghost_pipe_mask; + p->process(); + } + } + + bitvec phv_allow_bitwise_or; + for (auto p : parser[INGRESS]) { + phv_allow_bitwise_or |= p->phv_allow_bitwise_or; + } + for (auto p : parser[EGRESS]) { + phv_allow_bitwise_or |= p->phv_allow_bitwise_or; + } + for (auto p : parser[INGRESS]) { + p->phv_allow_bitwise_or = phv_allow_bitwise_or; + } + for (auto p : parser[EGRESS]) { + p->phv_allow_bitwise_or = phv_allow_bitwise_or; + } + + bitvec phv_allow_clear_on_write; + for (auto p : parser[INGRESS]) { + phv_allow_clear_on_write |= p->phv_allow_clear_on_write; + } + for (auto p : parser[EGRESS]) { + phv_allow_clear_on_write |= p->phv_allow_clear_on_write; + } + for (auto p : parser[INGRESS]) { + p->phv_allow_clear_on_write = phv_allow_clear_on_write; + } + for (auto p : parser[EGRESS]) { + p->phv_allow_clear_on_write = phv_allow_clear_on_write; + } + + bitvec phv_init_valid; + for (auto p : parser[INGRESS]) { + phv_init_valid |= p->phv_init_valid; + } + for (auto p : parser[EGRESS]) { + phv_init_valid |= p->phv_init_valid; + } + for (auto p : parser[INGRESS]) { + p->phv_init_valid = phv_init_valid; + } + for (auto p : parser[EGRESS]) { + p->phv_init_valid = phv_init_valid; + } +} + +void AsmParser::output(json::map &ctxt_json) { + ctxt_json["parser"]["ingress"] = json::vector(); + ctxt_json["parser"]["egress"] = json::vector(); + + bool use_multiple_parser_impl = false; + + for (auto gress : Range(INGRESS, EGRESS)) { + if (parser[gress].size() > 1) use_multiple_parser_impl = true; + } + /// We use the 'parsers' node in ctxt 
json to implement + /// multiple parser instances support. + /// We use the 'parser' node for all single parser + /// instance support. + for (auto gress : Range(INGRESS, EGRESS)) { + /// remove after multi-parser support is fully-tested. + if (use_multiple_parser_impl) { + for (auto p : parser[gress]) { + p->output(ctxt_json); + } + } else { + if (!parser[gress].empty() && parser[gress][0] != nullptr) + parser[gress][0]->output_legacy(ctxt_json); + } + } +} + +std::vector AsmParser::test_get_parser(gress_t gress) { + if ((gress == INGRESS) || (gress == EGRESS)) return parser[gress]; + return std::vector(); +} + +std::map>> Parser::clots; +std::array, PARSER_MAX_CLOTS> Parser::clot_use; +unsigned Parser::max_handle = 0; + +static void collect_phv_vector(value_t value, gress_t gress, bitvec &bv) { + for (auto &el : value.vec) { + Phv::Ref reg(gress, 0, el); + if (reg.check()) { + int id = reg->reg.uid; + bv[id] = 1; + } + } +} + +void Parser::input(VECTOR(value_t) args, value_t data) { + lineno = data.lineno; + if (!CHECKTYPE(data, tMAP)) return; + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (args.size > 0) { + if (args[0] == "ingress" && gress != INGRESS) continue; + if (args[0] == "egress" && gress != EGRESS) continue; + } else if (error_count > 0) { + break; + } + for (auto &kv : MapIterChecked(data.map, true)) { + if (kv.key == "name" && (kv.value.type == tSTR)) { + name = kv.value.s; + continue; + } + if (kv.key == "start" && (kv.value.type == tVEC || kv.value.type == tSTR)) { + if (kv.value.type == tVEC) { + for (int i = 0; i < 4 && i < kv.value.vec.size; i++) + start_state[i] = kv.value[i]; + } else { + for (int i = 0; i < 4; i++) start_state[i] = kv.value; + } + continue; + } + if (kv.key == "priority" && (kv.value.type == tVEC || kv.value.type == tINT)) { + if (kv.value.type == tVEC) { + for (int i = 0; i < 4 && i < kv.value.vec.size; i++) + if (CHECKTYPE(kv.value[i], tINT)) priority[i] = kv.value[i].i; + } else { + for (int i = 0; i < 4; i++) 
priority[i] = kv.value.i; + } + continue; + } + if (kv.key == "priority_threshold" && + (kv.value.type == tVEC || kv.value.type == tINT)) { + if (kv.value.type == tVEC) { + for (int i = 0; i < 4 && i < kv.value.vec.size; i++) + if (CHECKTYPE(kv.value[i], tINT)) pri_thresh[i] = kv.value[i].i; + } else { + for (int i = 0; i < 4; i++) pri_thresh[i] = kv.value.i; + } + continue; + } + if (kv.key == "parser_error") { + if (parser_error.lineno >= 0) { + error(kv.key.lineno, "Multiple parser_error declarations"); + warning(parser_error.lineno, "Previous was here"); + } else { + parser_error = Phv::Ref(gress, 0, kv.value); + } + continue; + } + if (kv.key == "bitwise_or") { + if (CHECKTYPE(kv.value, tVEC)) + collect_phv_vector(kv.value, gress, phv_allow_bitwise_or); + + continue; + } + if (kv.key == "clear_on_write") { + if (options.target == TOFINO) + error(kv.key.lineno, "Tofino parser does not support clear-on-write semantic"); + + if (CHECKTYPE(kv.value, tVEC)) + collect_phv_vector(kv.value, gress, phv_allow_clear_on_write); + + continue; + } + if (kv.key == "init_zero") { + if (CHECKTYPE(kv.value, tVEC)) { + collect_phv_vector(kv.value, gress, phv_init_valid); + collect_phv_vector(kv.value, gress, phv_use[gress]); + } + + continue; + } + if (kv.key == "hdr_len_adj") { + if (CHECKTYPE(kv.value, tINT)) hdr_len_adj = kv.value.i; + continue; + } + if (kv.key == "states") { + if (CHECKTYPE(kv.value, tMAP)) + for (auto &st : kv.value.map) define_state(gress, st); + continue; + } + if (kv.key == "bubble") { // obfuscated name for reverse engineering + if (CHECKTYPE(kv.value, tMAP)) { + rate_limit.lineno = kv.key.lineno; + rate_limit.parse(kv.value.map); + } + continue; + } + if (gress == EGRESS && kv.key == "meta_opt") { + if (CHECKTYPE(kv.value, tINT)) meta_opt = kv.value.i; + continue; + } + if (kv.key == "parse_depth_checks_disabled") { + if (options.target == TOFINO) + options.tof1_egr_parse_depth_checks_disabled = get_bool(kv.value); + else + warning(kv.key.lineno, + 
"parse_depth_checks_disabled unexpected: supported only by Tofino"); + continue; + } + define_state(gress, kv); + } + + // process the CLOTs immediately rather than in Parser::process() so that it + // happens before Deparser::process() + for (auto &map : Values(clots)) { + for (auto &vec : Values(map)) { + State::Match::Clot *maxlen = 0; + for (auto *cl : vec) { + if (cl->tag >= 0) clot_use[cl->tag].push_back(cl); + if (!maxlen || cl->max_length > maxlen->max_length) maxlen = cl; + } + for (auto *cl : vec) cl->max_length = maxlen->max_length; + } + } + + for (auto &map : Values(clots)) { + std::map clot_alloc; + unsigned free_clot_tag = 0; + while (free_clot_tag < PARSER_MAX_CLOTS && !clot_use[free_clot_tag].empty()) + ++free_clot_tag; + + for (auto &vec : Values(map)) { + for (auto *cl : vec) { + if (cl->tag >= 0) continue; + if (clot_alloc.count(cl->name)) { + cl->tag = clot_alloc.at(cl->name); + clot_use[cl->tag].push_back(cl); + } else if (free_clot_tag >= PARSER_MAX_CLOTS) { + error(cl->lineno, "Too many CLOTs (%d max)", PARSER_MAX_CLOTS); + } else { + clot_alloc[cl->name] = cl->tag = free_clot_tag++; + clot_use[cl->tag].push_back(cl); + while (free_clot_tag < PARSER_MAX_CLOTS && !clot_use[free_clot_tag].empty()) + ++free_clot_tag; + } + } + } + } + } +} + +void Parser::define_state(gress_t gress, pair_t &kv) { + if (!CHECKTYPE2M(kv.key, tSTR, tCMD, "state declaration")) return; + const char *name = kv.key.s; + match_t stateno = {0, 0}; + if (kv.key.type == tCMD) { + name = kv.key[0].s; + if (!CHECKTYPE2(kv.key[1], tINT, tMATCH)) return; + if (kv.key[1].type == tINT) { + if (kv.key[1].i > PARSER_STATE_MASK) + error(kv.key.lineno, "Explicit state out of range"); + stateno.word1 = kv.key[1].i; + stateno.word0 = (~kv.key[1].i) & PARSER_STATE_MASK; + } else { + stateno = kv.key[1].m; + if ((stateno.word0 | stateno.word1) > PARSER_STATE_MASK) + error(kv.key.lineno, "Explicit state out of range"); + stateno.word0 |= ~(stateno.word0 | stateno.word1) & 
PARSER_STATE_MASK; + } + } + if (!CHECKTYPE(kv.value, tMAP)) return; + auto n = states.emplace(name, new State(kv.key.lineno, name, gress, stateno, kv.value.map)); + if (n.second) { + all.push_back(n.first->second); + } else { + error(kv.key.lineno, "State %s already defined in %sgress", name, gress ? "e" : "in"); + warning(n.first->second->lineno, "previously defined here"); + } +} + +void Parser::process() { + if (all.empty()) return; + for (auto st : all) st->pass1(this); + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (states.empty()) continue; + if (start_state[0].lineno < 0) { + State *start = get_start_state(); + if (!start) { + error(lineno, "No %sgress parser start state", gress ? "e" : "in"); + continue; + } else { + for (int i = 0; i < 4; i++) { + start_state[i].name = start->name; + start_state[i].lineno = start->lineno; + start_state[i].ptr.push_back(start); + } + } + } else { + for (int i = 0; i < 4; i++) start_state[i].check(gress, this, 0); + } + for (int i = 0; i < 4 && !start_state[i]; i++) + if (!start_state[i]->can_be_start()) { + std::string name = std::string("'; + LOG1("Creating new " << gress << " " << name << " state"); + auto n = states.emplace(name, new State(lineno, name.c_str(), gress, match_t{0, 0}, + VECTOR(pair_t){0, 0, 0})); + BUG_CHECK(n.second); + State *state = n.first->second; + state->def = new State::Match(lineno, gress, *start_state[i]); + for (int j = 3; j >= i; j--) + if (start_state[j] == start_state[i]) { + start_state[j].name = name; + start_state[j].ptr[0] = state; + } + all.insert(all.begin(), state); + } + if (parser_error.lineno >= 0) + if (parser_error.check() && parser_error.gress() == gress) + phv_use[gress][parser_error->reg.uid] = 1; + } + if (ghost_parser.size()) { + int total_size = 0; + int curr_parser_id = -1; + std::sort(ghost_parser.begin(), ghost_parser.end()); + for (Phv::Ref &r : ghost_parser) { + r.check(); + total_size += r.size(); + if (curr_parser_id >= 0) { + if ((curr_parser_id + 1) != 
r->reg.parser_id()) + error(ghost_parser[0].lineno, "ghost thread input must be 32 consecutive bits"); + } + curr_parser_id = r->reg.parser_id(); + } + if (total_size != 32) error(ghost_parser[0].lineno, "ghost thread input must be 32 bits"); + } + if (error_count > 0) return; + int all_index = 0; + for (auto st : all) st->all_idx = all_index++; + bitvec unreach(0, all_index); + for (int i = 0; i < 4; i++) + if (!states.empty()) start_state[i]->unmark_reachable(this, unreach); + for (auto u : unreach) + warning(all[u]->lineno, "%sgress state %s unreachable", all[u]->gress ? "E" : "In", + all[u]->name.c_str()); + if (phv_use[INGRESS].intersects(phv_use[EGRESS])) { + bitvec tmp = phv_use[INGRESS]; + tmp &= phv_use[EGRESS]; + for (int reg : tmp) + error(lineno, "Phv register %s(R%d) used by both ingress and egress", + Phv::reg(reg)->name, reg); + } + if (options.match_compiler || 1) { /* FIXME -- need proper liveness analysis */ + Phv::setuse(INGRESS, phv_use[INGRESS]); + Phv::setuse(EGRESS, phv_use[EGRESS]); + } +} + +int Parser::get_header_stack_size_from_valid_bits(std::vector sets) { + // Find Set operation that holds the stack valid bits, then + // find the largest value of "$.$valid". + for (const auto *set : sets) { + auto reg = Phv::reg(set->where.name()); + if (reg) { + auto aliases = Phv::aliases(reg, 0); + if (std::find_if(aliases.begin(), aliases.end(), [](const std::string &s) { + return s.find(".$stkvalid") != std::string::npos; + }) != aliases.end()) { + int stack_size = 0; + while (std::find_if( + aliases.begin(), aliases.end(), [&stack_size](const std::string &s) { + return s.find("$" + std::to_string(stack_size) + ".$valid") != + std::string::npos; + }) != aliases.end()) + stack_size++; + return stack_size; + } + } + } + return 0; +} + +/** + * @brief Returns the deepest parser depth, starting from state s. + * Returned value in bits. 
+ */ +int Parser::state_prsr_dph_max(const State *s) { + std::map> visited; // pair: first=curr_dph_bits + // second=recurse count + return state_prsr_dph_max(s, visited, -hdr_len_adj * 8); +} + +/** + * @brief Returns the deepest parser depth for state s, considering the depth + * is already at curr_dph_bits at the time it's being called. + * Returned value in bits. + */ +int Parser::state_prsr_dph_max(const State *s, + std::map> &visited, + int curr_dph_bits) { + int parser_depth_max_bits = parser_depth_max_bytes * 8; + int parser_depth_min_bits = parser_depth_min_bytes * 8; + if (!s) return 0; + if (s->ignore_max_depth && curr_dph_bits >= parser_depth_min_bits) return curr_dph_bits; + // Keep track of states visited along with the parser depth at time of visit + // and the number of times the state was called recursively. Return 0 if current + // curr_dph_bits value is smaller or equal to the largest value seen so far, + // or if the state was called enough times to fill the header stack if one + // is used. + if (visited.count(s) && (visited.at(s).first >= curr_dph_bits)) { + LOG5(" State : " << s->name << " --> largest depth : " << visited[s].first + << " >= current depth : " << curr_dph_bits << " --> Ignore."); + return 0; + } + visited[s].first = curr_dph_bits; + visited[s].second++; + int curr_state_prsr_dph_max = 0; + for (const auto *m : s->match) { + auto local_bits_shifted = curr_dph_bits + (m->shift * 8); + std::string next_name = m->next ? m->next->name : std::string("END"); + LOG5(" State : " << s->name << " --> " << m->match << " --> " << next_name + << " | Bits: " << curr_dph_bits << ", shift : " << m->shift * 8 + << ", intr_md_bits : " << m->intr_md_bits + << ", Total Bits : " << local_bits_shifted); + // Look for non-unrolled loops that save in header stacks. In that case, use + // header stack size to limit parser depth calculation. + if (m->offset_inc) { + // One of the Set operations will set the header stack entries $valid bits. 
+ // Get stack size information from these valid bits. + int stack_size = get_header_stack_size_from_valid_bits(m->set); + LOG5(" State : stack_size = " << stack_size + << ", visited count = " << visited[s].second); + // Do not go beyond header stack size to find parser depth. + if (visited[s].second > stack_size) { + LOG5(" State : reached end of header stack, size = " << stack_size); + continue; + } + } + + if (local_bits_shifted < parser_depth_max_bits) { + if (m->next) { + for (auto n : m->next.ptr) { + int prsr_dph = state_prsr_dph_max(n, visited, local_bits_shifted); + curr_state_prsr_dph_max = std::max(curr_state_prsr_dph_max, prsr_dph); + } + } else { + curr_state_prsr_dph_max = std::max(curr_state_prsr_dph_max, local_bits_shifted); + } + } else { + LOG5(" State : " << s->name << " --> " << m->match << " --> " << next_name + << " | Reached " << parser_depth_max_bits + << " bits, maximum supported by target."); + curr_state_prsr_dph_max = parser_depth_max_bits; + } + + // No point in going any further with the other matches + // if we reached the maximum allowed by the target. + if (curr_state_prsr_dph_max >= parser_depth_max_bits) break; + + // If the current match is a default or catch-all transition, then + // break out of the loop as any following transitions will never + // be taken. + uint64_t mask = bitMask(s->key.width); + if ((m->match.word0 & m->match.word1 & mask) == mask) { + LOG5(" State : catch-all transition, break out of loop."); + break; + } + } + visited[s].second--; + return curr_state_prsr_dph_max; +} + +int Parser::get_prsr_max_dph() { + // Look for the longest parser depth from all configured start states. + // Return the longest one found. + // + // Note: at this point start_state[] contains the start states either + // read from the bfa file, or deduced from the standard/typical + // start state names returned from get_start_state() during + // Parser::process. 
+ // + int prsr_dph_max = 0; + std::set visited; + for (auto &state : start_state) { + if (state) { + BUG_CHECK(states[state.name], "Start state %s not found in states table.", + state.name.c_str()); + if (visited.count(state.name)) continue; + visited.insert(state.name); + int prsr_dph = state_prsr_dph_max(states[state.name]); + LOG4("state " << state.name << " dph=" << prsr_dph); + prsr_dph_max = std::max(prsr_dph_max, prsr_dph); + } + } + prsr_dph_max = (prsr_dph_max + 0x7) & ~0x7; + prsr_dph_max /= 8; + prsr_dph_max = (prsr_dph_max + 0xf) & ~0xf; + prsr_dph_max /= 16; + // P4C-5341/5342: For Tofino EPB, the one additional word is sent beyond prsr_dph_max. + if (options.target == TOFINO && gress == EGRESS) prsr_dph_max -= 1; + return std::max(prsr_dph_max, 4); +} + +void Parser::output_default_ports(json::vector &vec, bitvec port_use) { + while (!port_use.empty()) { + auto idx = port_use.ffs(0); + vec.push_back(idx); + port_use.clrbit(idx); + } +} + +std::map Parser::parser_handles; + +void Parser::write_config(RegisterSetBase ®s, json::map &json, bool legacy) { + if (auto *tofino_regs = dynamic_cast(®s)) + write_config(*tofino_regs, json, legacy); + else if (auto *jbay_regs = dynamic_cast(®s)) + write_config(*jbay_regs, json, legacy); +} + +// output context.json format with multiple parser support +void Parser::output(json::map &ctxt_json) { + json::vector &cjson = ctxt_json["parsers"][gress ? "egress" : "ingress"]; + if (all.empty()) return; + for (auto st : all) st->pass2(this); + if (error_count > 0) return; + tcam_row_use = PARSER_TCAM_DEPTH; + SWITCH_FOREACH_TARGET( + options.target, auto *regs = new TARGET::parser_regs; declare_registers(regs); + json::map parser_ctxt_json; + // Parser Handles are generated in the assembler. Since the assembler + // has no idea about multipipe program (since assembler is separately + // invoked for each pipe bfa) the parser handles generated can be same + // across multiple pipes. 
Here, we rely on the driver to prefix a pipe id + // (profile id) to make the handles unique. The upper 2 bits are + // reserved for this id. + parser_handle = next_handle(); + parser_handles[name] = parser_handle; // store parser handles + parser_ctxt_json["name"] = name; parser_ctxt_json["handle"] = parser_handle; + json::vector default_ports; output_default_ports(default_ports, port_use); + parser_ctxt_json["default_parser_id"] = std::move(default_ports); + write_config(dynamic_cast(*regs), parser_ctxt_json, false); + // FIXME -- rate limit config regs are per-pipe, not per parser, so if more than + // one parser wants to set different rate limits, there will be a problem + if (rate_limit) rate_limit.write_config(TopLevel::regs()->reg_pipe, gress); + cjson.push_back(std::move(parser_ctxt_json)); + gen_configuration_cache(*regs, ctxt_json["configuration_cache"]);) +} + +// output context.json format prior to multiple parser support +// TODO: remove after multi-parser support is fully-tested. +void Parser::output_legacy(json::map &ctxt_json) { + if (all.empty()) return; + for (auto st : all) st->pass2(this); + if (error_count > 0) return; + tcam_row_use = PARSER_TCAM_DEPTH; + SWITCH_FOREACH_TARGET( + options.target, auto *regs = new TARGET::parser_regs; declare_registers(regs); + parser_handle = next_handle(); + write_config(dynamic_cast(*regs), ctxt_json["parser"], true); + if (rate_limit) rate_limit.write_config(TopLevel::regs()->reg_pipe, gress); + gen_configuration_cache(*regs, ctxt_json["configuration_cache"]);) +} + +Parser::Checksum::Checksum(gress_t gress, pair_t data) : lineno(data.key.lineno), gress(gress) { + if (!CHECKTYPE2(data.key, tSTR, tCMD)) return; + if (!CHECKTYPE(data.value, tMAP)) return; + if (data.key.vec.size == 2) { + if ((unit = data.key[1].i) >= Target::PARSER_CHECKSUM_UNITS()) + error(lineno, "Ran out of %sgress parser checksum units (%d available)", + gress ? 
"e" : "in", Target::PARSER_CHECKSUM_UNITS()); + } else { + error(data.key.lineno, "Syntax error"); + } + for (auto &kv : MapIterChecked(data.value.map, true)) { + if (kv.key == "type") { + if (CHECKTYPE(kv.value, tSTR)) { + if (kv.value == "VERIFY") + type = 0; + else if (kv.value == "RESIDUAL") + type = 1; + else if (kv.value == "CLOT") + type = 2; + else + error(kv.value.lineno, "Unknown parser checksum type"); + } + if (kv.value == "clot") { + if (unit < 2 || unit > 4) + error(kv.value.lineno, "CLOT can only use checksum engine 2-4"); + } + } else if (kv.key == "start") { + if (CHECKTYPE(kv.value, tINT)) start = kv.value.i; + } else if (kv.key == "end") { + if (CHECKTYPE(kv.value, tINT)) end = kv.value.i; + } else if (kv.key == "addr") { + if (CHECKTYPE(kv.value, tINT)) addr = kv.value.i; + } else if (kv.key == "add") { + if (CHECKTYPE(kv.value, tINT)) add = kv.value.i; + } else if (kv.key == "dest") { + if (kv.value.type == tCMD && kv.value == "clot" && kv.value.vec.size == 2) + tag = kv.value[1].i; + else + dest = Phv::Ref(gress, 0, kv.value); + } else if (kv.key == "end_pos") { + if (CHECKTYPE(kv.value, tINT)) { + if (kv.value.i > PARSER_INPUT_BUFFER_SIZE) + error(kv.value.lineno, "Header end position is out of input buffer"); + if (kv.value.i < 0) error(kv.value.lineno, "Header end postion cannot be negative"); + dst_bit_hdr_end_pos = kv.value.i; + } + } else if (kv.key == "mask") { + if (CHECKTYPE(kv.value, tVEC)) { + for (int i = 0; i < kv.value.vec.size; i++) { + auto range = kv.value[i]; + unsigned lo = 0, hi = 0; + if (range.type == tRANGE) { + lo = range.range.lo; + hi = range.range.hi; + } else if (range.type == tINT) { + lo = hi = range.i; + } else { + error(kv.value.lineno, "Syntax error, expecting range or int"); + } + + if (lo > hi) error(kv.value.lineno, "Invalid parser checksum input"); + if ((hi + 1) > PARSER_INPUT_BUFFER_SIZE) + error(kv.value.lineno, "Parser checksum out of input buffer?"); + + for (unsigned byte = lo; byte <= hi; ++byte) { + 
if (kv.key == "mask") mask |= (1 << byte); + } + } + } + } else if (kv.key == "swap") { + if (CHECKTYPE(kv.value, tINT)) swap = kv.value.i; + } else if (kv.key == "mul_2") { + if (options.target == TOFINO) { + error(kv.value.lineno, "multiply by 2 feature is available for Tofino2 and higher"); + } + if (CHECKTYPE(kv.value, tINT)) mul_2 = kv.value.i; + } else if (kv.key == "shift") { + shift = get_bool(kv.value); + } else { + warning(kv.key.lineno, "ignoring unknown item %s in checksum", value_desc(kv.key)); + } + } +} + +bool Parser::Checksum::equiv(const Checksum &a) const { + if (unit != a.unit) return false; + if (tag != a.tag) return false; + if (dest && a.dest) { + if (dest != a.dest) return false; + } else if (dest || a.dest) { + return false; + } + return add == a.add && mask == a.mask && swap == a.swap && mul_2 == a.mul_2 && + dst_bit_hdr_end_pos == a.dst_bit_hdr_end_pos && start == a.start && end == a.end && + shift == a.shift && type == a.type; +} + +void Parser::Checksum::pass1(Parser *parser) { + if (parser->checksum_use.empty()) + parser->checksum_use.resize(Target::PARSER_CHECKSUM_UNITS(), {}); + if (addr >= 0) { + if (addr >= PARSER_CHECKSUM_ROWS) { + error(lineno, "invalid %sgress parser checksum address %d", gress ? "e" : "in", addr); + } else if (parser->checksum_use[unit][addr]) { + if (!equiv(*parser->checksum_use[unit][addr])) { + error(lineno, "incompatible %sgress parser checksum use at address %d", + gress ? 
"e" : "in", addr); + warning(parser->checksum_use[unit][addr]->lineno, "previous use"); + } + } else { + parser->checksum_use[unit][addr] = this; + } + } + if (dest.check() && dest->reg.parser_id() < 0) + error(dest.lineno, "%s is not accessable in the parser", dest->reg.name); + if (dest && dest->reg.size == 32) + error(dest.lineno, "checksum unit cannot write to 32-bit container"); + if (type == 0 && dest) { + if (dest->lo != dest->hi) + error(dest.lineno, "checksum verification destination must be single bit"); + else + dst_bit_hdr_end_pos = dest->lo; + if (options.target == JBAY && dest->reg.size == 8 && dest->reg.deparser_id() % 2) + dst_bit_hdr_end_pos += 8; + } else if (type == 1 && dest && dest.size() != dest->reg.size) { + error(dest.lineno, "residual checksum must write whole container"); + } +} + +void Parser::Checksum::pass2(Parser *parser) { + if (addr < 0) { + int avail = -1; + for (int i = 0; i < PARSER_CHECKSUM_ROWS; ++i) { + if (parser->checksum_use[unit][i]) { + if (equiv(*parser->checksum_use[unit][i])) { + addr = i; + break; + } + } else if (avail < 0) { + avail = i; + } + } + if (addr < 0) { + if (avail >= 0) { + parser->checksum_use[unit][addr = avail] = this; + } else { + error(lineno, + "Ran out of room in parser checksum control RAM of" + " %sgress unit %d (%d rows available)", + gress ? 
// (continuation of Parser::Checksum::pass2 — resumes inside its RAM-exhausted
//  error() call, whose opening lies in the preceding chunk)
"e" : "in", unit, PARSER_CHECKSUM_ROWS);
            }
        }
    }
}

// Parse a parser-counter initializer spec from the .bfa map:
//   add/max    -- 8-bit values (0-255)
//   rotate     -- 0-7
//   mask       -- 0-7 on Tofino, 0-255 on JBay (Tofino defaults to 7)
//   src        -- counter load source; the legal names differ per target
//   push / update_with_top -- accepted but handled elsewhere
Parser::CounterInit::CounterInit(gress_t gress, pair_t data)
    : gress(gress), lineno(data.key.lineno) {
    if (!CHECKTYPE2(data.key, tSTR, tCMD)) return;
    if (!CHECKTYPE(data.value, tMAP)) return;

    // Tofino's counter mask is only 3 bits wide, so default it to all-ones.
    if (options.target == TOFINO) mask = 7;

    for (auto &kv : MapIterChecked(data.value.map, true)) {
        if (kv.key == "add" && CHECKTYPE(kv.value, tINT)) {
            add = kv.value.i;
            if (add > 255) error(lineno, "Parser counter add value out of range (0-255)");
        } else if (kv.key == "max" && CHECKTYPE(kv.value, tINT)) {
            max = kv.value.i;
            if (max > 255) error(lineno, "Parser counter max value out of range (0-255)");
        } else if (kv.key == "rotate" && CHECKTYPE(kv.value, tINT)) {
            rot = kv.value.i;
            if (rot > 7) error(lineno, "Parser counter rotate value out of range (0-7)");
        } else if (kv.key == "mask" && CHECKTYPE(kv.value, tINT)) {
            mask = kv.value.i;
            if (options.target == TOFINO && mask > 7) {
                error(lineno, "Parser counter mask value out of range (0-7)");
            } else if (options.target == JBAY && mask > 255) {
                error(lineno, "Parser counter mask value out of range (0-255)");
            }
        } else if (kv.key == "src") {
            if (CHECKTYPE(kv.value, tSTR)) {
                // Source encodings differ between Tofino (half/byte names)
                // and later targets (byte0-byte3).
                if (options.target == TOFINO) {
                    if (kv.value == "half_lo")
                        src = 0;
                    else if (kv.value == "half_hi")
                        src = 1;
                    else if (kv.value == "byte0")
                        src = 2;
                    else if (kv.value == "byte1")
                        src = 3;
                    else
                        error(lineno, "Unexpected counter load source");
                } else if (options.target != TOFINO) {
                    if (kv.value == "byte0")
                        src = 0;
                    else if (kv.value == "byte1")
                        src = 1;
                    else if (kv.value == "byte2")
                        src = 2;
                    else if (kv.value == "byte3")
                        src = 3;
                    else
                        error(lineno, "Unexpected counter load source");
                }
            }
        } else if (kv.key != "push" && kv.key != "update_with_top") {
            error(lineno, "Syntax error in parser counter init expression");
        }
    }
}

// Two counter-init entries may share a RAM row iff every field matches.
bool Parser::CounterInit::equiv(const CounterInit &a) const {
    return add == a.add && mask == a.mask && rot == a.rot && max == a.max && src == a.src;
}

// pass2: allocate a counter-init RAM row — reuse an equivalent row when one
// exists, else take the first free one; error when the RAM is exhausted.
void Parser::CounterInit::pass2(Parser *parser) {
    if (addr < 0) {
        int avail = -1;
        for (int i = 0; i < PARSER_CTRINIT_ROWS; ++i) {
            if (parser->counter_init[i]) {
                if (equiv(*parser->counter_init[i])) {
                    addr = i;
                    break;
                }
            } else if (avail < 0) {
                avail = i;
            }
        }
        if (addr < 0) {
            if (avail >= 0)
                parser->counter_init[addr = avail] = this;
            else
                error(lineno,
                      "Ran out of room in parser counter init RAM of"
                      " %sgress (%d rows available)",
                      gress ? "e" : "in", PARSER_CTRINIT_ROWS);
        }
    }
}

Parser::PriorityUpdate::PriorityUpdate(const value_t &exp) {
    lineno = exp.lineno;
    if (!parse(exp)) error(lineno, "Syntax error in priority expression");
}

// Recursive-descent parse of a priority expression of the form
//   @<offset> [& <mask>] [>> <shift>]
// `what` tracks which sub-position is being parsed (START/MASK/SHIFT/LOAD);
// returns false on any syntax or range violation.
bool Parser::PriorityUpdate::parse(const value_t &exp, int what) {
    enum { START, MASK, SHIFT, LOAD };
    if (exp.type == tCMD) {
        if (exp[0] == ">>") {
            return what < SHIFT && parse(exp[1], LOAD) && parse(exp[2], SHIFT);
        } else if (exp[0] == "&") {
            return what < SHIFT && parse(exp[1], MASK) && parse(exp[2], MASK);
        }
    } else if (exp.type == tINT) {
        switch (what) {
            case START:
            case MASK:
                // mask may only be set once and must fit in 3 bits (0-7).
                if (mask >= 0) return false;
                if ((mask = exp.i) < 0 || mask > 7) {
                    error(exp.lineno, "priority mask %d out of range", mask);
                    return false;
                }
                return true;
            case SHIFT:
                // shift may only be set once and must fit in 4 bits (0-15).
                if (shift >= 0) return false;
                if ((shift = exp.i) < 0 || shift > 15) {
                    error(exp.lineno, "priority shift %d out of range", shift);
                    return false;
                }
                return true;
            default:
                return false;
        }
    } else if (exp.type == tSTR && exp.s[0] == '@' && isdigit(exp.s[1])) {
        // "@N" names the input-buffer offset to load from; settable only once
        // and not valid in shift position.
        char *end;
        if (what == SHIFT || offset >= 0 || (offset = strtol(exp.s + 1, &end, 10)) < 0 || *end)
            return false;
        return true;
    }
    return false;
}

// Parse the parser rate-limit ("bubble") spec: inc/dec default to 1;
// max and interval come from the map; a missing max is an error.
void Parser::RateLimit::parse(const VECTOR(pair_t) & data) {
    inc = dec = 1;
    for (auto &kv : MapIterChecked(data)) {
        if (kv.key == "inc") {
            if (CHECKTYPE(kv.value, tINT)) inc = kv.value.i;
        } else if (kv.key
== "dec") { + if (CHECKTYPE(kv.value, tINT)) dec = kv.value.i; + } else if (kv.key == "max") { + if (CHECKTYPE(kv.value, tINT)) max = kv.value.i; + } else if (kv.key == "interval") { + if (CHECKTYPE(kv.value, tINT)) interval = kv.value.i; + } else { + warning(kv.key.lineno, "ignoring unknown item %s in bubble spec", value_desc(kv.key)); + } + } + if (max < 0) error(lineno, "no max limit in bubble spec"); +} + +Parser::State::Ref &Parser::State::Ref::operator=(const value_t &v) { + lineno = v.lineno; + ptr.clear(); + if (v.type == tSTR) { + name = v.s; + pattern.word0 = pattern.word1 = 0; + } else if (CHECKTYPE2M(v, tINT, tMATCH, "state reference")) { + name.clear(); + if (v.type == tINT) { + pattern.word0 = ~v.i; + pattern.word1 = v.i; + } else { + pattern = v.m; + } + if ((pattern.word0 | pattern.word1) > PARSER_STATE_MASK) { + error(lineno, "Parser state out of range"); + pattern.word0 &= PARSER_STATE_MASK; + pattern.word1 &= PARSER_STATE_MASK; + } else { + pattern.word1 |= ~(pattern.word0 | pattern.word1) & PARSER_STATE_MASK; + } + } + return *this; +} + +void Parser::State::Ref::check(gress_t gress, Parser *pa, State *state) { + if (ptr.empty()) { + if (name.size()) { + auto it = pa->states.find(name); + if (it != pa->states.end()) + ptr.push_back(it->second); + else if (name != "END" && name != "end") + error(lineno, "No state named %s in %sgress parser", name.c_str(), + gress ? 
"e" : "in"); + } else if (pattern) { + match_t tmp = pattern; + unsigned wc = tmp.word0 & tmp.word1; + if (wc && !state->stateno) { + warning(lineno, + "Using next state pattern in state without an explicit " + "state number"); + wc = 0; + } + tmp.word0 &= ~wc | state->stateno.word0; + tmp.word1 &= ~wc | state->stateno.word1; + for (auto *st : pa->all) { + if (st->gress != state->gress) continue; + if (st == state) continue; + if (tmp.matches(st->stateno)) ptr.push_back(st); + } + } + } +} + +const char *Parser::match_key_loc_name(int loc) { + if (options.target == TOFINO) { + if (loc == 0 || loc == 1) return "half"; + if (loc == 2) return "byte0"; + if (loc == 3) return "byte1"; + } else { + if (loc == 0) return "byte0"; + if (loc == 1) return "byte1"; + if (loc == 2) return "byte2"; + if (loc == 3) return "byte3"; + } + + error(-1, "Invalid match key loc"); + return nullptr; +} + +int Parser::match_key_loc(value_t &key, bool errchk) { + if (errchk && !CHECKTYPE(key, tSTR)) return -1; + int loc = Parser::match_key_loc(key.s); + if (loc < 0) error(key.lineno, "Invalid matcher location %s", key.s); + return loc; +} + +int Parser::match_key_loc(const char *key) { + if (options.target == TOFINO) { + if (!strcmp(key, "half") || !strcmp(key, "half0")) return 0; + if (!strcmp(key, "byte0")) return 2; + if (!strcmp(key, "byte1")) return 3; + } else { + if (!strcmp(key, "byte0")) return 0; + if (!strcmp(key, "byte1")) return 1; + if (!strcmp(key, "byte2")) return 2; + if (!strcmp(key, "byte3")) return 3; + } + + error(-1, "Invalid match key %s", key); + return -1; +} + +int Parser::match_key_size(const char *key) { + if (!strncmp(key, "half", 4)) return 16; + if (!strncmp(key, "byte", 4)) return 8; + + error(-1, "Invalid match key %s", key); + return -1; +} + +int Parser::State::MatchKey::move_down(int loc) { + int to = loc; + while (to >= 0 && ((specified >> to) & 1)) to--; + if (to < 0) return -1; + if (data[to].bit >= 0 && move_down(to) < 0) return -1; + data[to] = 
data[loc]; + data[loc].bit = -1; + return 0; +} + +int Parser::State::MatchKey::add_byte(int loc, int byte, bool use_saved) { + // FIXME: Parameter "byte" is an offset in the input packet buffer. + // It seems strange to specify a negative value when checking + // for the lower range (i.e. -64): when bytes are shifted + // out of the input buffer, they can't be read anymore. + // Should the lower range value be 0 instead? + if (options.target == TOFINO) { + if (byte <= -64 || byte >= 32) { + error(lineno, "Match key index out of range"); + return -1; + } + } else { + // Valid offset ranges: + // -63..31 : Input packet + // 60..63 : Scratch registers + if ((byte <= -64) || ((byte > 31) && (byte < 60)) || (byte > 63)) { + error(lineno, "Match key index out of range"); + return -1; + } + } + + if (loc >= 0) { + if ((specified >> loc) & 1) + error(lineno, "Multiple matches in %s matcher", Parser::match_key_loc_name(loc)); + specified |= (1 << loc); + if (data[loc].bit >= 0 && move_down(loc) < 0) return -1; + } else { + for (int i = 3; i >= 0; i--) + if (data[i].bit < 0) { + loc = i; + break; + } + if (loc < 0) { + error(lineno, "Too much data for parse matcher"); + return -1; + } + } + data[loc].bit = width; + data[loc].byte = use_saved ? 
USE_SAVED : byte; + width += 8; + return 0; +} + +int Parser::State::MatchKey::setup_match_el(int at, value_t &spec) { + switch (spec.type) { + case tINT: + return add_byte(at, spec.i); + case tRANGE: + if (spec.range.lo >= spec.range.hi) { + error(spec.lineno, "Invalid match range"); + return -1; + } + if (at >= 0) at += spec.range.hi - spec.range.lo; + for (int i = spec.range.hi; i >= spec.range.lo; i--) { + if (add_byte(at, i) < 0) return -1; + if (at >= 0) at--; + } + return 0; + case tMAP: + if (at >= 0) goto error; + for (int i = spec.map.size - 1; i >= 0; i--) + if (setup_match_el(Parser::match_key_loc(spec.map[i].key), spec.map[i].value) < 0) + return -1; + return 0; + case tSTR: + if (spec == "ctr_zero") { + if (ctr_zero >= 0) { + error(spec.lineno, "'ctr_zero' specified twice"); + return -1; + } + ctr_zero = width++; + return 0; + } else if (spec == "ctr_neg") { + if (ctr_neg >= 0) { + error(spec.lineno, "'ctr_neg' specified twice"); + return -1; + } + ctr_neg = width++; + return 0; + } else if (!strncmp(spec.s, "save_byte", 9)) { + if (options.target == TOFINO) + error(spec.lineno, "Tofino does not have scratch registers in the parser"); + + int i = spec.s[9] - '0'; + if (i < 0 || i > 4) error(spec.lineno, "Invalid parser save source %s", spec.s); + save = 1 << i; + width += 8; + return 0; + } else if (at < 0 && (at = Parser::match_key_loc(spec, false)) >= 0) { + if (options.target == TOFINO && at == 0 && add_byte(1, 0, true) < 0) return -1; + return add_byte(at, 0, true); + } + /* fall through */ + default: + error: + error(spec.lineno, "Syntax error in match spec"); + return -1; + } +} + +void Parser::State::MatchKey::setup(value_t &spec) { + lineno = spec.lineno; + if (spec.type == tVEC) { + /* allocate the keys bits for the least significant match bits first... 
*/ + for (int i = spec.vec.size - 1; i >= 0; i--) + if (setup_match_el(-1, spec[i]) < 0) return; + } else { + setup_match_el(-1, spec); + } + + // For TOFINO, the first match byte pair must be an adjacent 16 bit pair. We + // check and re-arrange the bytes for a 16 bit extractor. In JBAY this check + // is not necessary as we can have independent byte extractors + if (Target::MATCH_BYTE_16BIT_PAIRS() && (data[0].byte & data[1].byte) != USE_SAVED) { + if (data[0].bit >= 0 && data[1].bit >= 0 && data[0].byte + 1 != data[1].byte) { + BUG_CHECK((data[0].byte | data[1].byte) != USE_SAVED); + int unused = -1; // unused slot + for (int i = 0; i < 4; i++) { + if (data[i].bit < 0) { + if (unused < 0) unused = i; + continue; + } + for (int j = 0; j < 4; j++) { + if (data[j].bit >= 0 && data[i].byte + 1 == data[j].byte) { + if (i == 1 && j == 0) { + std::swap(data[i], data[j]); + } else { + std::swap(data[0], data[i]); + std::swap(data[1], data[j]); + } + return; + } + } + } + if (unused >= 0) { + BUG_CHECK(unused > 1); + std::swap(data[1], data[unused]); + } else { + error(spec.lineno, "Must have a 16-bit pair in match bytes"); + } + } + if (data[0].bit < 0 && data[1].bit >= 0) { + /* if we're using half of the 16-bit match, use the upper (first) half */ + std::swap(data[0], data[1]); + } + } +} + +Parser::State::Match::Match(int l, gress_t gress, State *s, match_t m, VECTOR(pair_t) & data) + : lineno(l), state(s), match(m) { + for (auto &kv : data) { + if (kv.key == "counter") { + if (kv.value.type == tMAP) { + ctr_load = 1; + + bool from_ctr_init_ram = false; + + for (auto &kkv : MapIterChecked(kv.value.map, true)) { + if (kkv.key == "src") { + from_ctr_init_ram = true; + } else if (kkv.key == "push" && CHECKTYPE(kkv.value, tINT)) { + if (options.target == TOFINO) + error(kkv.key.lineno, "Tofino does not have counter stack"); + ctr_stack_push = kkv.value.i; + } else if (kkv.key == "update_with_top" && CHECKTYPE(kkv.value, tINT)) { + if (options.target == TOFINO) + 
error(kkv.key.lineno, "Tofino does not have counter stack"); + ctr_stack_upd_w_top = kkv.value.i; + } + } + + if (from_ctr_init_ram) { + ctr_ld_src = 1; + if (ctr_instr) { + error(kv.key.lineno, "Tofino does not allow multiple counters on a match"); + continue; + } + ctr_instr = new CounterInit(gress, kv); + } else { // load from immediate + for (auto &kkv : MapIterChecked(kv.value.map, true)) { + if (kkv.key == "imm" && CHECKTYPE(kkv.value, tINT)) + ctr_imm_amt = kkv.value.i; + else if (kkv.key != "push" && kkv.key != "update_with_top") + error(kkv.value.lineno, "Unknown parser counter init command"); + } + } + } else if (kv.value.type == tCMD) { + if (kv.value[0] == "inc" || kv.value[0] == "increment") { + if (CHECKTYPE(kv.value[1], tINT)) ctr_imm_amt = kv.value[1].i; + } else if (kv.value[0] == "dec" || kv.value[0] == "decrement") { + if (CHECKTYPE(kv.value[1], tINT)) ctr_imm_amt = ~kv.value[1].i + 1; + } else { + error(kv.value.lineno, "Unknown parser counter command"); + } + } else if (kv.value.type == tSTR) { + if (kv.value == "pop") { + if (options.target == TOFINO) + error(kv.key.lineno, "Tofino does not have counter stack"); + ctr_stack_pop = true; + } else { + error(kv.value.lineno, "Unknown parser counter command"); + } + } else { + error(kv.value.lineno, "Syntax error for parser counter"); + } + } else if (kv.key == "hdr_len_inc_stop") { + if (options.target == TOFINO) + error(kv.key.lineno, "Tofino does not support hdr_len_inc_stop"); + else if (hdr_len_inc_stop) + error(kv.key.lineno, "Mulitple hdr_len_inc_stop in match"); + hdr_len_inc_stop = HdrLenIncStop(kv.value); + } else if (kv.key == "priority") { + if (priority) + error(kv.key.lineno, "Mulitple priority updates in match"); + else + priority = PriorityUpdate(kv.value); + } else if (kv.key == "shift") { + if (shift) error(kv.key.lineno, "Multiple shift settings in match"); + if (!CHECKTYPE(kv.value, tINT)) continue; + if ((shift = kv.value.i) < 0 || shift > PARSER_INPUT_BUFFER_SIZE) + 
error(kv.value.lineno, "shift value %d out of range", shift); + } else if (kv.key == "intr_md") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if ((intr_md_bits = kv.value.i) < 0) + error(kv.value.lineno, "intr_md value %d is -ve", intr_md_bits); + } else if (kv.key == "offset_inc") { + if (offset_inc) error(kv.key.lineno, "Multiple offset_inc settings in match"); + if (!CHECKTYPE(kv.value, tINT)) continue; + offset_inc = kv.value.i; + } else if (kv.key == "buf_req") { + if (buf_req >= 0) error(kv.key.lineno, "Multiple buf_req settings in match"); + if (!CHECKTYPE(kv.value, tINT)) continue; + if ((buf_req = kv.value.i) < 0 || shift > PARSER_INPUT_BUFFER_SIZE) + error(kv.value.lineno, "buf_req value %d out of range", shift); + } else if (kv.key == "next") { + if (next.lineno >= 0) { + error(kv.key.lineno, "Multiple next settings in match"); + error(next.lineno, "previously set here"); + } + next = kv.value; + } else if (kv.key == "load") { + if (load.lineno) { + error(kv.value.lineno, "Multiple load entries in match"); + error(load.lineno, "previous specified here"); + } else { + load.setup(kv.value); + } + } else if (kv.key == "save") { + if (options.target == TOFINO) + error(kv.key.lineno, "Tofino does not have scratch registers in the parser"); + + if (load.save) error(kv.value.lineno, "Multiple save entries in match"); + + if (CHECKTYPE(kv.value, tVEC)) { + for (int i = 0; i < kv.value.vec.size; i++) { + if (CHECKTYPE(kv.value[i], tSTR)) { + if (kv.value[i] == "byte0") + load.save |= 1 << 0; + else if (kv.value[i] == "byte1") + load.save |= 1 << 1; + else if (kv.value[i] == "byte2") + load.save |= 1 << 2; + else if (kv.value[i] == "byte3") + load.save |= 1 << 3; + else + error(lineno, "Unexpected parser save source"); + } + } + } + } else if (kv.key == "checksum") { + csum.emplace_back(gress, kv); + } else if (kv.key == "field_mapping") { + if (CHECKTYPE(kv.value, tMAP)) { + for (auto map : kv.value.map) { + auto ref = Phv::Ref(gress, 0, map.key); + auto fm = 
FieldMapping(ref, map.value); + field_mapping.emplace_back(fm); + } + } + } else if (kv.key == "handle") { + if (CHECKTYPE(kv.value, tINT)) value_set_handle = kv.value.i; + } else if (kv.key == "disable_partial_hdr_err") { + if (!CHECKTYPE(kv.value, tINT)) continue; + if (options.target != TOFINO2) + error(kv.key.lineno, "disable_partial_hdr_err only available for Tofino2"); + + if (disable_partial_hdr_err != -1) + error(kv.key.lineno, "Multiple disable_partial_hdr_err settings in match"); + if (kv.value.i < 0 || kv.value.i > 1) + error(kv.value.lineno, "disable_partial_hdr_err value %ld out of range", + kv.value.i); + disable_partial_hdr_err = kv.value.i; + } else if (kv.key == "partial_hdr_err_proc") { + if (!CHECKTYPE(kv.value, tINT)) continue; + error(kv.key.lineno, "partial_hdr_err_proc is unsupported"); + if (partial_hdr_err_proc != -1) + error(kv.key.lineno, "Multiple partial_hdr_err_proc settings in match"); + if (kv.value.i < 0 || kv.value.i > 1) + error(kv.value.lineno, "partial_hdr_err_proc value %ld out of range", kv.value.i); + partial_hdr_err_proc = kv.value.i; + } else if (kv.key.type == tCMD && kv.key == "clot" && kv.key.vec.size == 2) { + clots.push_back(new Clot(gress, kv.key.vec[1], kv.value)); + } else if (kv.key.type == tINT) { + save.push_back(new Save(gress, this, kv.key.i, kv.key.i, kv.value)); + } else if (kv.key.type == tRANGE) { + save.push_back(new Save(gress, this, kv.key.range.lo, kv.key.range.hi, kv.value)); + } else if (kv.value.type == tINT) { + set.push_back(new Set(gress, this, kv.key, kv.value.i)); + } else if (kv.value.type == tCMD && kv.value[0] == "rotate") { + if (CHECKTYPE(kv.value[1], tINT)) + set.push_back(new Set(gress, this, kv.key, kv.value[1].i, ROTATE)); + } else { + error(kv.key.lineno, "Syntax error"); + } + } + + for (auto c : csum) { + if (c.type == 1 && c.end) { + if (c.dst_bit_hdr_end_pos >= shift) // see MODEL-542 + error(c.lineno, "Residual checksum end_pos must be less than state shift amount"); + } + } +} + 
+Parser::State::Match::Match(int l, gress_t gress, State *n) : lineno(l) { + /* build a default match for a synthetic start state */ + offset_inc = shift = 0; + offset_rst = true; + next.name = n->name; + next.ptr.push_back(n); +} + +static value_t &extract_save_phv(value_t &data) { + if (data.type == tVEC) return data[0]; + if (data.type == tCMD && (data[0] == "offset" || data[0] == "rotate")) return data[1]; + return data; +} + +Parser::State::Match::Save::Save(gress_t gress, Match *m, int l, int h, value_t &data, int flgs) + : match(m), lo(l), hi(h), where(gress, 0, extract_save_phv(data)), flags(flgs) { + if (hi < lo || hi - lo > 3 || (hi - lo == 2 && !Target::PARSER_EXTRACT_BYTES())) + error(data.lineno, "Invalid parser extraction size"); + if (data.type == tVEC) { + if (data.vec.size > 2 || data.vec.size < 1) + error(data.lineno, "Can only extract into single or pair"); + if (data.vec.size == 2) second = Phv::Ref(gress, 0, data[1]); + } + if (data.type == tCMD) { + if (data[0] == "offset") + flags |= OFFSET; + else if (data[0] == "rotate") + flags |= ROTATE; + } +} + +Parser::State::Match::Set::Set(gress_t gress, Match *m, value_t &data, int v, int flgs) + : match(m), where(gress, 0, extract_save_phv(data)), what(v), flags(flgs) { + if (data.type == tCMD) { + if (data[0] == "offset") + flags |= OFFSET; + else if (data[0] == "rotate") + flags |= ROTATE; + } +} + +bool Parser::State::Match::Clot::parse_length(const value_t &exp, int what) { + enum { START, MASK, SHIFT, LOAD }; + if (exp.type == tCMD) { + if (exp[0] == ">>") { + return what < SHIFT && parse_length(exp[1], LOAD) && parse_length(exp[2], SHIFT); + } else if (exp[0] == "&") { + return what < SHIFT && parse_length(exp[1], MASK) && parse_length(exp[2], MASK); + } + } else if (exp.type == tINT) { + switch (what) { + case START: + case MASK: + if (length_mask >= 0) return false; + if ((length_mask = exp.i) < 0 || length_mask > 0x3f) { + error(exp.lineno, "length mask %d out of range", length_mask); + 
return false; + } + return true; + case SHIFT: + if (length_shift >= 0) return false; + if ((length_shift = exp.i) < 0 || length_shift > 15) { + error(exp.lineno, "length shift %d out of range", length_shift); + return false; + } + return true; + default: + return false; + } + } else if (exp.type == tSTR && exp.s[0] == '@' && isdigit(exp.s[1])) { + char *end; + if (what == SHIFT || length >= 0 || (length = strtol(exp.s + 1, &end, 10)) < 0 || *end) + return false; + load_length = true; + return true; + } + return false; +} + +Parser::State::Match::Clot::Clot(gress_t gress, const value_t &tag, const value_t &data) + : lineno(tag.lineno) { + if (CHECKTYPE2(tag, tINT, tSTR)) { + if (tag.type == tINT) { + this->tag = tag.i; + name = std::to_string(tag.i); + } else { + this->tag = -1; + name = tag.s; + } + } + Parser::clots[gress][name].push_back(this); + if (!CHECKTYPE3(data, tINT, tRANGE, tMAP)) return; + if (data.type == tINT) { + start = data.i; + length = 1; + } else if (data.type == tRANGE) { + start = data.range.lo; + length = data.range.hi - data.range.lo + 1; + } else { + for (auto &kv : data.map) { + if (kv.key == "start") { + if (CHECKTYPE(kv.value, tINT)) start = kv.value.i; + } else if (kv.key == "length") { + if (kv.value.type == tINT) { + length = kv.value.i; + } else if (!parse_length(kv.value) || !load_length) { + error(kv.value.lineno, "Syntax error"); + } + if (length_mask < 0) length_mask = 0x3f; + if (length_shift < 0) length_shift = 0; + } else if (kv.key == "max_length") { + if (CHECKTYPE(kv.value, tINT)) max_length = kv.value.i; + } else if (kv.key == "checksum") { + if (CHECKTYPE(kv.value, tINT)) csum_unit = kv.value.i; + } else if (kv.key == "stack_depth") { + if (CHECKTYPE(kv.value, tINT)) stack_depth = kv.value.i; + } else if (kv.key == "stack_inc") { + if (CHECKTYPE(kv.value, tINT)) stack_inc = kv.value.i; + } else { + error(kv.key.lineno, "Unknown CLOT key %s", value_desc(kv.key)); + } + } + } + if (start < 0) error(data.lineno, "No start in 
clot %s", name.c_str()); + if (length < 0) error(data.lineno, "No length in clot %s", name.c_str()); + if (max_length < 0) { + if (load_length) + max_length = 64; + else + max_length = length; + } else if (!load_length && max_length != length) { + error(data.lineno, "Inconsistent constant length and max_length in clot"); + } + // Create objects for each element in the stack. Only the first element + // creates the additional stack elements, and this should only be done + // for clot instances in parser loops. + for (int i = stack_inc; i < stack_depth; i += stack_inc) new Clot(gress, *this, i); +} + +/// Clone a clot to create a new stack instance. Should only be used +/// for clot extrcts in non-unrolled parser loops. +Parser::State::Match::Clot::Clot(gress_t gress, const Clot &src, int instance) { + if (src.tag >= 0) { + this->tag = src.tag + instance; + name = std::to_string(this->tag); + } else { + this->tag = -1; + name = src.name + "." + std::to_string(instance); + } + Parser::clots[gress][name].push_back(this); + lineno = src.lineno; + load_length = src.load_length; + start = src.start; + length = src.length; + length_shift = src.length_shift; + length_mask = src.length_mask; + max_length = src.max_length; + csum_unit = src.csum_unit; + stack_depth = src.stack_depth; +} + +Parser::State::Match::FieldMapping::FieldMapping(Phv::Ref &ref, const value_t &a) { + if (CHECKTYPE(a, tCMD)) { + where = ref; + container_id = a.vec[0].s; + lo = a.vec[1].range.lo; + hi = a.vec[1].range.hi; + } else { + error(a.lineno, "Syntax error"); + } +} + +Parser::State::Match::HdrLenIncStop::HdrLenIncStop(const value_t &data) { + if (CHECKTYPE(data, tINT)) { + if (data.i < 0 || data.i > PARSER_INPUT_BUFFER_SIZE) + error(data.lineno, "hdr_len_inc_stop %" PRId64 " out of range", data.i); + lineno = data.lineno; + final_amt = data.i; + } +} + +Parser::State::State(int l, const char *n, gress_t gr, match_t sno, const VECTOR(pair_t) & data) + : name(n), gress(gr), stateno(sno), def(0), 
lineno(l) { + VECTOR(pair_t) default_data = EMPTY_VECTOR_INIT; + bool have_default = data["default"] != 0; + for (auto &kv : data) { + if (kv.key.type == tINT && kv.value.type == tMAP) { + match_t m = {~(unsigned)kv.key.i, (unsigned)kv.key.i}; + match.push_back(new Match(kv.key.lineno, gress, this, m, kv.value.map)); + } else if (kv.key.type == tBIGINT && kv.value.type == tMAP) { + match_t m = {~(unsigned)kv.key.bigi.data[0], (unsigned)kv.key.bigi.data[0]}; + match.push_back(new Match(kv.key.lineno, gress, this, m, kv.value.map)); + } else if (kv.key == "value_set" && kv.value.type == tMAP) { + match_t m = {0, 0}; + match.push_back(new Match(kv.key.lineno, gress, this, m, kv.value.map)); + if (kv.key.type == tCMD) { + if (CHECKTYPE(kv.key[1], tSTR)) match.back()->value_set_name = kv.key[1].s; + if (kv.key.vec.size > 2 && CHECKTYPE(kv.key[2], tINT)) + match.back()->value_set_size = kv.key[2].i; + else + match.back()->value_set_size = 1; + } else { + match.back()->value_set_size = 1; + } + } else if (kv.key.type == tMATCH) { + if (!CHECKTYPE(kv.value, tMAP)) continue; + match.push_back(new Match(kv.key.lineno, gress, this, kv.key.m, kv.value.map)); + } else if (kv.key == "match") { + if (key.lineno) { + error(kv.value.lineno, "Multiple match entries in state %s", n); + error(key.lineno, "previous specified here"); + } else { + key.setup(kv.value); + } + } else if (kv.key == "option") { + if (kv.value == "ignore_max_depth") + ignore_max_depth = true; + else + error(kv.value.lineno, "Unknown state option %s", value_desc(kv.value)); + } else if (kv.key == "default") { + if (!CHECKTYPE(kv.value, tMAP)) continue; + if (def) { + error(kv.key.lineno, "Multiple defaults in state %s", n); + error(def->lineno, "previous specified here"); + } else { + match_t m = {0, 0}; + def = new Match(kv.key.lineno, gress, this, m, kv.value.map); + } + } else if (!have_default) { + VECTOR_add(default_data, kv); + } else { + error(kv.key.lineno, "Syntax error"); + } + } + if 
(default_data.size) { + BUG_CHECK(!def); + match_t m = {0, 0}; + def = new Match(default_data[0].key.lineno, gress, this, m, default_data); + } + VECTOR_fini(default_data); +} + +bool Parser::State::can_be_start() { + if (match.size()) return false; + if (!def) return true; + // if (def->counter || def->offset || def->shift) return false; + // if (def->counter_reset || def->offset_reset) return false; + // if (def->save.size() || def->set.size()) return false; + return true; +} + +void Parser::State::unmark_reachable(Parser *pa, bitvec &unreach) { + if (!unreach[all_idx]) return; + unreach[all_idx] = 0; + for (auto m : match) m->unmark_reachable(pa, this, unreach); + if (def) def->unmark_reachable(pa, this, unreach); +} + +void Parser::State::Match::unmark_reachable(Parser *pa, Parser::State *state, bitvec &unreach) { + for (auto succ : next) succ->unmark_reachable(pa, unreach); +} + +/********* pass 1 *********/ + +void Parser::State::Match::pass1(Parser *pa, State *state) { + next.check(state->gress, pa, state); + for (auto s : save) { + if (!s->where.check()) continue; + if (s->where->reg.parser_id() < 0) + error(s->where.lineno, "%s is not accessable in the parser", s->where->reg.name); + if (options.target == TOFINO && s->lo >= 32 && s->lo < 54) + error(s->where.lineno, "byte 32-53 of input buffer cannot be used for output"); + if (options.target == JBAY && s->lo >= 32 && s->lo < 48) + error(s->where.lineno, "byte 32-47 of input buffer cannot be used for output"); + pa->phv_use[state->gress][s->where->reg.uid] = 1; + int size = s->where.size(); + if (s->second) { + if (!s->second.check()) continue; + if (s->second->reg.parser_id() < 0) + error(s->second.lineno, "%s is not accessable in the parser", s->second->reg.name); + else if (s->second->lo >= 32 && s->second->lo < 54) + error(s->where.lineno, "byte 32-53 of input buffer cannot be used for output"); + else if (s->second->reg.parser_id() != s->where->reg.parser_id() + 1 || + (s->where->reg.parser_id() & 1)) 
+ error(s->second.lineno, "Can only write into even/odd register pair"); + else if (s->second->lo || s->second->hi != size - 1) + error(s->second.lineno, "Can only write data into whole phv registers in parser"); + else + size *= 2; + } + if (!Target::PARSER_EXTRACT_BYTES() && s->where.size() != s->where->reg.size) + error(s->where.lineno, "Can only write data into whole phv registers in parser"); + else if ((s->hi - s->lo + 1) * 8 != size) + error(s->where.lineno, "Data to write doesn't match phv register size"); + } + for (auto s : set) { + if (!s->where.check()) continue; + if (s->where->reg.parser_id() < 0) + error(s->where.lineno, "%s is not accessable in the parser", s->where->reg.name); + pa->phv_use[state->gress][s->where->reg.uid] = 1; + } + if (value_set_size == 0) { + uint64_t match_mask = bitMask(state->key.width); + uint64_t not_covered = match_mask & ~(match.word0 | match.word1); + if (not_covered != 0) { + warning(lineno, + "Match pattern does not cover all bits of match key, " + "assuming the rest are don't care"); + match.word0 |= not_covered; + match.word1 |= not_covered; + } + if ((match.word1 & ~match.word0 & ~match_mask) != 0) + error(lineno, "Matching on bits not in the match of state %s", state->name.c_str()); + for (auto m : state->match) { + if (m == this) break; + if (m->match == match) { + warning(lineno, "Can't match parser state due to previous match"); + warning(m->lineno, "here"); + break; + } + } + } + for (auto &c : csum) c.pass1(pa); +} + +bool Parser::State::Match::Set::merge(gress_t gress, const Set &a) { + auto orig = where; + if (where->reg != a.where->reg) return false; + if (!(where->hi < a.where->lo || a.where->hi < where->lo)) { + warning(where.lineno, "Phv slices %s and %s overlapping", where.name(), a.where.name()); + } + what = ((what << where->lo) | (a.what << a.where->lo)) >> (std::min(where->lo, a.where->lo)); + where = Phv::Ref(where->reg, gress, std::min(where->lo, a.where->lo), + std::max(where->hi, a.where->hi)); 
+ LOG1("Merging phv slices " << orig << " + " << a.where << " = " << where); + return true; +} + +void Parser::State::pass1(Parser *pa) { + for (auto m : match) m->pass1(pa, this); + if (def) def->pass1(pa, this); + for (auto code : MatchIter(stateno)) { + if (pa->state_use[code]) { + error(lineno, "%sgress state %s uses state code %d, already in use", gress ? "E" : "In", + name.c_str(), code); + for (auto *state : pa->all) { + if (state != this && state->gress == gress && state->stateno.matches(code)) + error(state->lineno, "also used by state %s", state->name.c_str()); + } + } + pa->state_use[code] = 1; + } + + for (auto m : match) + for (auto succ : m->next) succ->pred.insert(m); + + if (def) + for (auto succ : def->next) succ->pred.insert(def); +} + +/********* pass 2 *********/ + +void Parser::State::MatchKey::preserve_saved(unsigned saved) { + for (int i = 3; i >= 0; i--) { + if (!((saved >> i) & 1)) continue; + if (data[i].bit < 0 || data[i].byte == USE_SAVED) continue; + if ((specified >> i) & 1) { + error(lineno, + "match in %s matcher conflicts with previous state save " + "action", + Parser::match_key_loc_name(i)); + } else if (move_down(i) < 0) { + error(lineno, + "Ran out of matching space due to preserved values from " + "previous states"); + break; + } + } +} + +void Parser::State::Match::pass2(Parser *pa, State *state) { + for (auto &c : csum) c.pass2(pa); + + if (ctr_instr) ctr_instr->pass2(pa); + + if (clots.size() > 0) { + if (options.target == TOFINO) + error(clots[0]->lineno, "clots not supported on tofino"); + else if (clots.size() > 2) + error(clots[2]->lineno, "no more than two clots per state"); + } +} + +void Parser::State::pass2(Parser *pa) { + if (!stateno) { + unsigned s; + for (s = 0; pa->state_use[s]; s++) { + } + if (s > PARSER_STATE_MASK) { + error(lineno, "Can't allocate state number for %sgress state %s", gress ? 
"e" : "in", + name.c_str()); + } else { + stateno.word0 = s ^ PARSER_STATE_MASK; + stateno.word1 = s; + pa->state_use[s] = 1; + } + } + unsigned def_saved = 0; + if (def && def->load.lineno >= 0) { + for (int i = 0; i < 4; i++) + if (def->load.data[i].bit >= 0) def_saved |= 1 << i; + if (def_saved && def->next) def->next->key.preserve_saved(def_saved); + } + for (auto m : match) { + m->pass2(pa, this); + unsigned saved = def_saved; + if (m->load.lineno) { + for (int i = 0; i < 4; i++) + if (m->load.data[i].bit >= 0) + saved |= 1 << i; + else if (def && def->load.lineno && def->load.data[i].bit >= 0) + m->load.data[i] = def->load.data[i]; + } + if (saved) { + if (m->next) + m->next->key.preserve_saved(saved); + else if (def && def->next) + def->next->key.preserve_saved(saved); + } + } +} + +/********* output *********/ + +/// Extractor config tracking and register config code +/// Different tofino models have very different ways in which their parser extractors are +/// managed, but all are common in that there are multiple extractions that can happen in +/// parallel in a single parser match tcam row. We manage this by having a target-specific +/// 'output_map' object passed via a void * to target-sepcific write_output_config methods +/// along with an `unsigned used` mask that tracks which or how many extractors have been +/// used, so as to issue errors for conflicting uses. +/// +/// The `setup_phv_output_map` method creates the target specific output_map object that +/// will be passed to subsequent `write_output_config` calls to deal with each individual +/// extract. Finally, `mark_unused_output_map` is called to deal with any register setup +/// needed for unused extractors. They're called 'outputs' as the are concerned with +/// outputting PHV values from the parser. +/// +/// PHV outputs are split into 'saves' and 'sets' which come from different syntax in the +/// asm source. 
'saves' copy data from the input buffer into PHVs, while 'sets' write +/// constants into the PHVs. Different targets have different constraints on how flexible +/// they are for saves vs sets, so some want to do saves first and other sets +/// - tofino1: do saves first (why? sets seem more constrained, but there's an issue +/// with ganging smaller extractors to write larger PHVs) +/// - tofino2: do sets first as some extractors can only do saves +/// +/// FIXME -- should probably refactor this into a more C++ style base class pointer with +/// derived classes for each target. Should move the 'used' mask into that object as well. +/// Alternately, could move the entire `setup` to `mark_unused` process into a target specific +/// method. + +std::set Parser::State::Match::get_all_preds() { + std::set visited; + return get_all_preds_impl(visited); +} + +std::set Parser::State::Match::get_all_preds_impl( + std::set &visited) { + if (visited.count(this)) return {}; + + visited.insert(this); + + std::set rv; + + for (auto p : this->state->pred) { + rv.insert(p); + auto pred = p->get_all_preds_impl(visited); + rv.insert(pred.begin(), pred.end()); + } + + return rv; +} + +/* If the bitvec contains one of a pair of 8-bit PHVs, add the other, as they need + * to be owened together in the parser ingress/egress ownership */ +bitvec expand_parser_groups(bitvec phvs) { + for (int i : phvs) + if (Phv::reg(i)->size == 8) phvs[i ^ 1] = 1; + return phvs; +} + +/* remove PHVs from the bitvec which are not accessable in the parser + * FIXME -- should just have a static const bitvec of the valid ones and & with it */ +bitvec remove_nonparser(bitvec phvs) { + for (int i : phvs) + if (Phv::reg(i)->parser_id() < 0) phvs[i] = 0; + return phvs; +} + +void setup_jbay_ownership(bitvec phv_use[2], checked_array<128, ubits<1>> &left, + checked_array<128, ubits<1>> &right, checked_array<256, ubits<1>> &main_i, + checked_array<256, ubits<1>> &main_e) { + for (int i : phv_use[EGRESS]) { + if 
(Phv::reg(i)->size == 8) { + if (phv_use[INGRESS][i ^ 1]) + error(0, "Can't use %s in ingress and %s in egress in Tofino2 parser", + Phv::reg(i ^ 1)->name, Phv::reg(i)->name); + } + } + + std::set left_egress_owner_ids, right_egress_owner_ids; + std::set all_egress_owner_ids; + + for (int i : phv_use[EGRESS]) { + auto id = Phv::reg(i)->parser_id(); + if (id < 0) + error(0, "Can't access %s in parser", Phv::reg(i)->name); + else if (id < 128) + left_egress_owner_ids.insert(id); + else + right_egress_owner_ids.insert(id - 128); + + all_egress_owner_ids.insert(id); + + if (Phv::reg(i)->size == 32) { + if (++id < 128) + left_egress_owner_ids.insert(id); + else + right_egress_owner_ids.insert(id - 128); + + all_egress_owner_ids.insert(id); + } + } + + for (auto id : left_egress_owner_ids) left[id] = 1; + for (auto id : right_egress_owner_ids) right[id] = 1; + for (auto id : all_egress_owner_ids) main_i[id] = main_e[id] = 1; +} + +void setup_jbay_clear_on_write(bitvec phv_allow_clear_on_write, checked_array<128, ubits<1>> &left, + checked_array<128, ubits<1>> &right, + checked_array<256, ubits<1>> &main_i, + checked_array<256, ubits<1>> &main_e) { + for (int i : phv_allow_clear_on_write) { + auto id = Phv::reg(i)->parser_id(); + + if (id < 0) + error(0, "Can't access %s in parser", Phv::reg(i)->name); + else if (id < 128) + left[id] = 1; + else + right[id - 128] = 1; + + main_i[id] = main_e[id] = 1; + + if (Phv::reg(i)->size == 32) { + if (++id < 128) + left[id] = 1; + else + right[id - 128] = 1; + + main_i[id] = main_e[id] = 1; + } + } +} + +void setup_jbay_no_multi_write(bitvec phv_allow_bitwise_or, bitvec phv_allow_clear_on_write, + checked_array<256, ubits<1>> &nmw_i, + checked_array<256, ubits<1>> &nmw_e) { + std::set allow_multi_write_ids; + + for (int i : phv_allow_bitwise_or) { + auto id = Phv::reg(i)->parser_id(); + allow_multi_write_ids.insert(id); + + if (Phv::reg(i)->size == 32) allow_multi_write_ids.insert(++id); + } + + for (int i : 
phv_allow_clear_on_write) { + auto id = Phv::reg(i)->parser_id(); + allow_multi_write_ids.insert(id); + + if (Phv::reg(i)->size == 32) allow_multi_write_ids.insert(++id); + } + + for (int i = 0; i < 256; i++) { + if (!allow_multi_write_ids.count(i)) nmw_i[i] = nmw_e[i] = 1; + } +} + +// WARNING: This function will print all parser paths. In some programs based on +// the complexity of parser graphs, this can result in a path explosion as it +// visits all possible paths and can lead to the function taking an unreasonably +// large amount of time to execute. +// +// The intention for this function is for DEBUG purposes only and should not be +// checked in with it being called from anywhere for logging due to above +// potential worst case issue. +// +// Function also checks for cycles in the parser graph. +// For debug, call function on a parser object and run assembler with -Tparser:1 +void Parser::print_all_paths() { + // Check for cycles in states + ordered_set vstates; + int count = 0; + std::function visit_states = [&](State *s, std::string sstr) { + count++; + // To limit execution uncomment and set variable + // if (count > COUNT_STATE_PATHS) exit(1); + if (s == nullptr) { + LOG1("State Path : " << sstr << " => END"); + return; + } + // Check for previously visited states to show cycles in parser state + // graph + if (vstates.count(s->name)) { + LOG1("****Revisiting " << s->name << " through path : " << sstr + << ". 
Parser graph has a cycle"); + return; + } + if (!sstr.empty()) sstr += " => "; + sstr += s->name; + vstates.insert(s->name); + + LOG1("State Path (" << count << ") : depth (" << vstates.size() << ") :" << sstr); + + for (auto m : s->match) { + std::stringstream ss; + ss << m->match; + std::string sstr2 = sstr + ("(" + ss.str() + ")"); + for (auto ns : m->next) { + visit_states(ns, sstr2); + } + } + vstates.erase(s->name); + }; + if (states.size() > 0) visit_states(states.begin()->second, ""); +} diff --git a/backends/tofino/bf-asm/parser-tofino-jbay.h b/backends/tofino/bf-asm/parser-tofino-jbay.h new file mode 100644 index 00000000000..74dd41aab77 --- /dev/null +++ b/backends/tofino/bf-asm/parser-tofino-jbay.h @@ -0,0 +1,722 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef PARSER_TOFINO_JBAY_H_ +#define PARSER_TOFINO_JBAY_H_ + +#include +#include +#include + +#include "backends/tofino/bf-asm/target.h" +#include "lib/bitvec.h" +#include "parser.h" +#include "phv.h" +#include "sections.h" +#include "ubits.h" + +enum { + /* global constants related to parser */ + PARSER_STATE_MASK = 0xff, + PARSER_TCAM_DEPTH = 256, + PARSER_CHECKSUM_ROWS = 32, + PARSER_CTRINIT_ROWS = 16, + PARSER_INPUT_BUFFER_SIZE = 32, + PARSER_SRC_MAX_IDX = 63, + PARSER_MAX_CLOTS = 64, + PARSER_MAX_CLOT_LENGTH = 64, +}; + +/** + * @brief Representation of the Tofino 1/2 parser in assembler + * @ingroup parde + */ +class Parser : public BaseParser, public Contextable { + void write_config(RegisterSetBase ®s, json::map &json, bool legacy = true) override; + template + void write_config(REGS &, json::map &, bool legacy = true); + struct CounterInit { + gress_t gress; + int lineno = -1, addr = -1; + int add = 0, mask = 255, rot = 0, max = 255, src = -1; + CounterInit(gress_t, pair_t); + void pass1(Parser *) {} + void pass2(Parser *); + template + void write_config(REGS &, gress_t, int); + bool equiv(const CounterInit &) const; + }; + struct PriorityUpdate { + int lineno = -1, offset = -1, shift = -1, mask = -1; + PriorityUpdate() {} + explicit PriorityUpdate(const value_t &data); + bool parse(const value_t &exp, int what = 0); + explicit operator bool() const { return lineno >= 0; } + template + void write_config(REGS &); + }; + struct RateLimit { + int lineno = -1; + int inc = -1, dec = -1, max = -1, interval = -1; + void parse(const VECTOR(pair_t) &); + explicit operator bool() const { return lineno >= 0; } + template + void write_config(REGS &, gress_t); + }; + + public: + struct Checksum; + + struct State { + struct Ref { + int lineno; + std::string name; + match_t pattern; + std::vector ptr; + Ref() : lineno(-1) { pattern.word0 = pattern.word1 = 0; } + Ref &operator=(const value_t &); + explicit 
Ref(value_t &v) { *this = v; } + operator bool() const { return ptr.size() > 0; } + State *operator->() const { + BUG_CHECK(ptr.size() == 1); + return ptr[0]; + } + State *operator*() const { + BUG_CHECK(ptr.size() == 1); + return ptr[0]; + } + bool operator==(const Ref &a) const { return name == a.name && pattern == a.pattern; } + void check(gress_t, Parser *, State *); + std::vector::const_iterator begin() const { return ptr.begin(); } + std::vector::const_iterator end() const { return ptr.end(); } + }; + struct MatchKey { + int lineno; + struct { + short bit, byte; + } data[4]; + enum { USE_SAVED = 0x7fff }; /* magic number can be stored in 'byte' field */ + short specified; + short ctr_zero, ctr_neg; + short width; + short save = 0; + MatchKey() : lineno(0), specified(0), ctr_zero(-1), ctr_neg(-1), width(0) { + for (auto &a : data) a.bit = a.byte = -1; + } + void setup(value_t &); + int setup_match_el(int, value_t &); + void preserve_saved(unsigned mask); + template + void write_config(REGS &, json::vector &); + + private: + int add_byte(int, int, bool use_saved = false); + int move_down(int); + }; + struct OutputUse { + unsigned b8 = 0, b16 = 0, b32 = 0; + OutputUse &operator+=(const OutputUse &a) { + b8 += a.b8; + b16 += a.b16; + b32 += a.b32; + return *this; + } + }; + struct Match { + int lineno; + State *state = nullptr; + match_t match; + std::string value_set_name; + int value_set_size = 0; + int value_set_handle = -1; + int offset_inc = 0, shift = 0, buf_req = -1; + int disable_partial_hdr_err = -1, partial_hdr_err_proc = -1; + bool offset_rst = false; + int intr_md_bits = 0; + + int ctr_imm_amt = 0, ctr_ld_src = 0, ctr_load = 0; + bool ctr_stack_push = false, ctr_stack_upd_w_top = false, ctr_stack_pop = false; + + CounterInit *ctr_instr = nullptr; + + PriorityUpdate priority; + + Ref next; + MatchKey load; + + int row = -1; + /// Data for narrow to wide extraction analysis, flag and + /// vector of affected PHV locations + bool 
has_narrow_to_wide_extract = false; + // 32b narrow to wide extractions using 2x16 extractions + std::vector narrow_to_wide_32b_16; + // 32b narrow to wide extractions using 4x8 extractions + std::vector narrow_to_wide_32b_8; + // 16b narrow to wide extractions using 2x8 extractions + std::vector narrow_to_wide_16b_8; + + enum flags_t { OFFSET = 1, ROTATE = 2 }; + + struct Save { + Match *match; + int lo, hi; + Phv::Ref where, second; + int flags; + Save(gress_t, Match *m, int l, int h, value_t &data, int flgs = 0); + template + int write_output_config(REGS &, void *, unsigned &, int, int) const; + }; + std::vector save; + + struct Set { + Match *match = nullptr; + Phv::Ref where; + unsigned what; + int flags; + Set(gress_t gress, Match *m, value_t &data, int v, int flgs = 0); + template + void write_output_config(REGS &, void *, unsigned &, int, int) const; + bool merge(gress_t, const Set &a); + bool operator==(const Set &a) const { + return where == a.where && what == a.what && flags == a.flags; + } + }; + std::vector set; + + struct Clot { + int lineno, tag; + std::string name; + bool load_length = false; + int start = -1, length = -1, length_shift = -1, length_mask = -1; + int max_length = -1; + int csum_unit = -1; + int stack_depth = 1; + int stack_inc = 1; + Clot(gress_t gress, const value_t &tag, const value_t &data); + Clot(const Clot &) = delete; + Clot(Clot &&) = delete; + bool parse_length(const value_t &exp, int what = 0); + template + void write_config(PO_ROW &, int, bool) const; + + private: + Clot(gress_t, const Clot &, int); + }; + std::vector clots; + std::vector csum; + + struct FieldMapping { + Phv::Ref where; + std::string container_id; + int lo = -1; + int hi = -1; + FieldMapping(Phv::Ref &ref, const value_t &a); + }; + std::vector field_mapping; + + struct HdrLenIncStop { + int lineno = -1; + unsigned final_amt = 0; + HdrLenIncStop() {} + explicit HdrLenIncStop(const value_t &data); + explicit operator bool() const { return lineno >= 0; } + 
template + void write_config(PO_ROW &) const; + } hdr_len_inc_stop; + + Match(int lineno, gress_t, State *s, match_t m, VECTOR(pair_t) & data); + Match(int lineno, gress_t, State *n); + ~Match() { + if (ctr_instr) delete ctr_instr; + } + void unmark_reachable(Parser *, State *state, bitvec &unreach); + void pass1(Parser *pa, State *state); + void pass2(Parser *pa, State *state); + template + int write_load_config(REGS &, Parser *, State *, int) const; + template + void write_lookup_config(REGS &, State *, int) const; + template + void write_counter_config(EA_REGS &) const; + template + void write_common_row_config(REGS &, Parser *, State *, int, Match *, json::map &); + template + void write_row_config(REGS &, Parser *, State *, int, Match *, json::map &); + template + void write_config(REGS &, Parser *, State *, Match *, json::map &); + template + void write_config(REGS &, json::vector &); + + template + void write_saves(REGS ®s, Match *def, void *output_map, int &max_off, unsigned &used, + int csum_8b, int csum_16b); + template + void write_sets(REGS ®s, Match *def, void *output_map, unsigned &used, int csum_8b, + int csum_16b); + + std::set get_all_preds(); + std::set get_all_preds_impl(std::set &visited); + }; + + std::string name; + gress_t gress; + match_t stateno; + MatchKey key; + std::vector match; + Match *def; + std::set pred; + bool ignore_max_depth = false; + int lineno = -1; + int all_idx = -1; + + State(State &&) = default; + State(int lineno, const char *name, gress_t, match_t stateno, const VECTOR(pair_t) & data); + bool can_be_start(); + void unmark_reachable(Parser *, bitvec &); + void pass1(Parser *); + void pass2(Parser *); + template + int write_lookup_config(REGS &, Parser *, State *, int, const std::vector &); + template + void write_config(REGS &, Parser *, json::vector &); + }; + + struct Checksum { + int lineno = -1, addr = -1, unit = -1; + gress_t gress; + Phv::Ref dest; + int tag = -1; + unsigned add = 0, mask = 0, swap = 0, mul_2 = 0; 
+ unsigned dst_bit_hdr_end_pos = 0; + bool start = false, end = false, shift = false; + unsigned type = 0; // 0 = verify, 1 = residual, 2 = clot + Checksum(gress_t, pair_t); + bool equiv(const Checksum &) const; + void pass1(Parser *); + void pass2(Parser *); + template + void write_config(REGS &, Parser *); + template + void write_output_config(REGS &, Parser *, State::Match *, void *, unsigned &) const; + + private: + template + void write_tofino_row_config(ROW &row); + template + void write_row_config(ROW &row); + }; + + public: + void input(VECTOR(value_t) args, value_t data); + void process(); + void output(json::map &) override; + void output_legacy(json::map &); + gress_t gress; + std::string name; + std::map states; + std::vector all; + std::map match_to_row; + bitvec port_use; + int parser_no; // used to print cfg.json + bitvec state_use; + State::Ref start_state[4]; + int priority[4] = {0}; + int pri_thresh[4] = {0, 0, 0, 0}; + int tcam_row_use = 0; + Phv::Ref parser_error; + // the ghost "parser" extracts 32-bit value + // this information is first extracted in AsmParser and passed to + // individual Parser, because currently parse_merge register is programmed + // in Parser class. + // FIXME -- should move all merge reg handling into AsmParser. 
+ std::vector ghost_parser; + unsigned ghost_pipe_mask = 0xf; // only set for JBAY + bitvec (&phv_use)[2]; + bitvec phv_allow_bitwise_or, phv_allow_clear_on_write; + bitvec phv_init_valid; + int hdr_len_adj = 0, meta_opt = 0; + std::vector> checksum_use; + std::array counter_init = {}; + static std::map>> clots; + static std::array, PARSER_MAX_CLOTS> clot_use; + static unsigned max_handle; + int parser_handle = -1; + RateLimit rate_limit; + + Parser(bitvec (&phv_use)[2], gress_t gr, int idx) + : gress(gr), parser_no(idx), phv_use(phv_use) { + if (gress == INGRESS) { + parser_depth_max_bytes = Target::PARSER_DEPTH_MAX_BYTES_INGRESS(); + parser_depth_min_bytes = Target::PARSER_DEPTH_MIN_BYTES_INGRESS(); + } else { + parser_depth_max_bytes = Target::PARSER_DEPTH_MAX_BYTES_EGRESS(); + parser_depth_min_bytes = Target::PARSER_DEPTH_MIN_BYTES_EGRESS(); + } + } + + template + void gen_configuration_cache(REGS &, json::vector &cfg_cache); + static int clot_maxlen(gress_t gress, unsigned tag) { + auto &vec = clot_use[tag]; + return vec.empty() ? 
-1 : vec.at(0)->max_length; + } + static int clot_maxlen(gress_t gress, std::string tag) { + if (clots.count(gress) && clots.at(gress).count(tag)) + return clots.at(gress).at(tag).at(0)->max_length; + return -1; + } + static int clot_tag(gress_t gress, std::string tag) { + if (clots.count(gress) && clots.at(gress).count(tag)) + return clots.at(gress).at(tag).at(0)->tag; + return -1; + } + + static const char *match_key_loc_name(int loc); + static int match_key_loc(const char *key); + static int match_key_loc(value_t &key, bool errchk = true); + static int match_key_size(const char *key); + + // Parser Handle Setup + // ____________________________________________________ + // | Table Type | Pipe Id | Parser Handle | PVS Handle | + // 31 24 20 12 0 + // PVS Handle = 12 bits + // Parser Handle = 8 bits + // Pipe ID = 4 bits + // Table Type = 8 bits (Parser type is 15) + static unsigned next_handle() { + // unique_table_offset is to support multiple pipe. + // assume parser type is 15, table type used 0 - 6 + return max_handle++ << 12 | unique_table_offset << 20 | 15 << 24; + } + // Store parser names to their handles. 
Used by phase0 match tables to link + // parser handle + static std::map parser_handles; + static unsigned get_parser_handle(std::string phase0Table) { + for (auto p : Parser::parser_handles) { + auto parser_name = p.first; + if (phase0Table.find(parser_name) != std::string::npos) return p.second; + } + return 0; + } + + template + void *setup_phv_output_map(REGS &, gress_t, int); + + State *get_start_state() { + std::vector startNames = {"start", "START", "$entry_point.start", + "$entry_point"}; + for (auto n : startNames) { + if (states.count(n)) return states.at(n); + } + return nullptr; + } + + int get_prsr_max_dph(); + int get_header_stack_size_from_valid_bits(std::vector sets); + + // Debug + void print_all_paths(); + + private: + template + void mark_unused_output_map(REGS &, void *, unsigned); + void define_state(gress_t gress, pair_t &kv); + void output_default_ports(json::vector &vec, bitvec port_use); + int state_prsr_dph_max(const State *s); + int state_prsr_dph_max(const State *s, std::map> &visited, + int curr_dph_bits); + int parser_depth_max_bytes, parser_depth_min_bytes; +}; + +class AsmParser : public BaseAsmParser { + std::vector parser[2]; // INGRESS, EGRESS + bitvec phv_use[2]; // ingress/egress only + std::vector ghost_parser; // the ghost "parser" extracts 32-bit value. This 32-bit + // can be from a single 32-bit container or multiple + // smaller one. 
+ unsigned ghost_pipe_mask = 0xf; // only set for JBAY + void start(int lineno, VECTOR(value_t) args) override; + void input(VECTOR(value_t) args, value_t data) override; + void process() override; + void output(json::map &) override; + void init_port_use(bitvec &port_use, const value_t &arg); + + public: + AsmParser() : BaseAsmParser("parser"){}; + ~AsmParser() {} + + // For gtest + std::vector test_get_parser(gress_t gress); +}; + +template +void Parser::PriorityUpdate::write_config(REGS &action_row) { + if (offset >= 0) { + action_row.pri_upd_type = 1; + action_row.pri_upd_src = offset; + action_row.pri_upd_en_shr = shift; + action_row.pri_upd_val_mask = mask; + } else { + action_row.pri_upd_type = 0; + action_row.pri_upd_en_shr = 1; + action_row.pri_upd_val_mask = mask; + } +} + +// for jbay (tofino1 is specialized) +template <> +void Parser::RateLimit::write_config(::Tofino::regs_pipe ®s, gress_t gress); +template +void Parser::RateLimit::write_config(REGS ®s, gress_t gress) { + if (gress == INGRESS) { + auto &ctrl = regs.pardereg.pgstnreg.parbreg.left.i_phv_rate_ctrl; + ctrl.inc = inc; + ctrl.interval = interval; + ctrl.max = max; + } else if (gress == EGRESS) { + auto &ctrl = regs.pardereg.pgstnreg.parbreg.right.e_phv_rate_ctrl; + ctrl.inc = inc; + ctrl.interval = interval; + ctrl.max = max; + } +} + +template +void Parser::State::MatchKey::write_config(REGS &, json::vector &) { + // FIXME -- TBD -- probably needs to be different for tofino/jbay, so there will be + // FIXME -- template specializations for this in those files +} + +template +void Parser::State::Match::write_saves(REGS ®s, Match *def, void *output_map, int &max_off, + unsigned &used, int csum_8b, int csum_16b) { + if (offset_inc) + for (auto s : save) s->flags |= OFFSET; + for (auto s : save) + max_off = + std::max(max_off, s->write_output_config(regs, output_map, used, csum_8b, csum_16b)); + if (def) + for (auto &s : def->save) + max_off = std::max(max_off, + s->write_output_config(regs, 
output_map, used, csum_8b, csum_16b)); +} + +template +void Parser::State::Match::write_sets(REGS ®s, Match *def, void *output_map, unsigned &used, + int csum_8b, int csum_16b) { + if (offset_inc) + for (auto s : set) s->flags |= ROTATE; + for (auto s : set) s->write_output_config(regs, output_map, used, csum_8b, csum_16b); + if (def) + for (auto s : def->set) s->write_output_config(regs, output_map, used, csum_8b, csum_16b); +} + +template +void Parser::State::Match::write_common_row_config(REGS ®s, Parser *pa, State *state, int row, + Match *def, json::map &ctxt_json) { + int max_off = -1; + write_lookup_config(regs, state, row); + + auto &ea_row = regs.memory[state->gress].ml_ea_row[row]; + if (ctr_instr || ctr_load || ctr_imm_amt || ctr_stack_pop) { + write_counter_config(ea_row); + } else if (def) { + def->write_counter_config(ea_row); + } + if (shift) + max_off = std::max(max_off, int(ea_row.shift_amt = shift) - 1); + else if (def) + max_off = std::max(max_off, int(ea_row.shift_amt = def->shift) - 1); + max_off = std::max(max_off, write_load_config(regs, pa, state, row)); + if (auto &next = (!this->next && def) ? def->next : this->next) { + std::vector prev; + for (auto n : next) { + max_off = std::max(max_off, n->write_lookup_config(regs, pa, state, row, prev)); + prev.push_back(n); + } + const match_t &n = next.pattern ? 
next.pattern : next->stateno; + ea_row.nxt_state = n.word1; + ea_row.nxt_state_mask = ~(n.word0 & n.word1) & PARSER_STATE_MASK; + } else { + ea_row.done = 1; + } + + auto &action_row = regs.memory[state->gress].po_action_row[row]; + for (auto &c : csum) { + action_row.csum_en[c.unit] = 1; + action_row.csum_addr[c.unit] = c.addr; + } + if (offset_inc || offset_rst) { + action_row.dst_offset_inc = offset_inc; + action_row.dst_offset_rst = offset_rst; + } else if (def) { + action_row.dst_offset_inc = def->offset_inc; + action_row.dst_offset_rst = def->offset_rst; + } + if (priority) priority.write_config(action_row); + if (hdr_len_inc_stop) hdr_len_inc_stop.write_config(action_row); + + void *output_map = pa->setup_phv_output_map(regs, state->gress, row); + unsigned used = 0; + int csum_8b = 0; + int csum_16b = 0; + for (auto &c : csum) { + c.write_output_config(regs, pa, this, output_map, used); + if (c.type == 0 && c.dest) { + if (c.dest->reg.size == 8) + ++csum_8b; + else if (c.dest->reg.size == 16) + ++csum_16b; + } + } + + if (options.target == TOFINO) { + write_sets(regs, def, output_map, used, csum_8b, csum_16b); + write_saves(regs, def, output_map, max_off, used, csum_8b, csum_16b); + } else { + write_sets(regs, def, output_map, used, 0, 0); + write_saves(regs, def, output_map, max_off, used, 0, 0); + } + + int clot_unit = 0; + for (auto *c : clots) c->write_config(action_row, clot_unit++, offset_inc > 0); + if (def) + for (auto *c : def->clots) c->write_config(action_row, clot_unit++, offset_inc > 0); + pa->mark_unused_output_map(regs, output_map, used); + + if (buf_req < 0) { + buf_req = max_off + 1; + BUG_CHECK(buf_req <= 32); + } + ea_row.buf_req = buf_req; +} + +template +void Parser::State::Match::write_row_config(REGS ®s, Parser *pa, State *state, int row, + Match *def, json::map &ctxt_json) { + write_common_row_config(regs, pa, state, row, def, ctxt_json); +} + +template +void Parser::State::Match::write_config(REGS ®s, Parser *pa, State *state, Match 
*def, + json::map &ctxt_json) { + int row, count = 0; + do { + if ((row = --pa->tcam_row_use) < 0) { + if (row == -1) + error(state->lineno, "Ran out of tcam space in %sgress parser", + state->gress ? "e" : "in"); + return; + } + ctxt_json["tcam_rows"].to().push_back(row); + write_row_config(regs, pa, state, row, def, ctxt_json); + pa->match_to_row[this] = row; + } while (++count < value_set_size); +} + +template +void Parser::State::Match::write_config(REGS ®s, json::vector &vec) { + int select_statement_bit = 0; + for (auto f : field_mapping) { + json::map container_cjson; + container_cjson["container_width"] = Parser::match_key_size(f.container_id.c_str()); + + int container_hardware_id = Parser::match_key_loc(f.container_id.c_str()); + container_cjson["container_hardware_id"] = container_hardware_id; + + container_cjson["mask"] = (1 << (f.hi - f.lo + 1)) - 1; + json::vector field_mapping_cjson; + for (auto i = f.lo; i <= f.hi; i++) { + json::map field_map; + field_map["register_bit"] = i; + field_map["field_name"] = f.where.name(); + field_map["start_bit"] = i; + field_map["select_statement_bit"] = select_statement_bit++; + field_mapping_cjson.push_back(field_map.clone()); + } + container_cjson["field_mapping"] = field_mapping_cjson.clone(); + vec.push_back(container_cjson.clone()); + } +} + +template +void Parser::State::write_config(REGS ®s, Parser *pa, json::vector &ctxt_json) { + LOG2(gress << " state " << name << " (" << stateno << ')'); + for (auto i : match) { + bool uses_pvs = false; + json::map state_cjson; + state_cjson["parser_name"] = name; + i->write_config(regs, state_cjson["match_registers"]); + if (i->value_set_size > 0) uses_pvs = true; + i->write_config(regs, pa, this, def, state_cjson); + state_cjson["uses_pvs"] = uses_pvs; + if (def) def->write_config(regs, pa, this, 0, state_cjson); + if (uses_pvs) { + state_cjson["pvs_name"] = i->value_set_name; + if (i->value_set_handle < 0) + error(lineno, "Invalid handle for parser value set %s", 
i->value_set_name.c_str()); + auto pvs_handle_full = i->value_set_handle; + state_cjson["pvs_handle"] = pvs_handle_full; + } + for (auto idx : MatchIter(stateno)) { + state_cjson["parser_state_id"] = idx; + ctxt_json.push_back(state_cjson.clone()); + } + } +} + +template +void Parser::Checksum::write_tofino_row_config(ROW &row) { + row.add = add; + if (dest) + row.dst = dest->reg.parser_id(); + else if (tag >= 0) + row.dst = tag; + row.dst_bit_hdr_end_pos = dst_bit_hdr_end_pos; + row.hdr_end = end; + int rsh = 0; + for (auto &el : row.mask) el = (mask >> rsh++) & 1; + row.shr = shift; + row.start = start; + rsh = 0; + for (auto &el : row.swap) el = (swap >> rsh++) & 1; + row.type = type; +} + +template +void Parser::Checksum::write_row_config(ROW &row) { + write_tofino_row_config(row); + int rsh = 0; + for (auto &el : row.mul_2) el = (mul_2 >> rsh++) & 1; +} + +// Used with JBay +bitvec expand_parser_groups(bitvec phvs); +bitvec remove_nonparser(bitvec phvs); +void setup_jbay_ownership(bitvec phv_use[2], checked_array<128, ubits<1>> &left, + checked_array<128, ubits<1>> &right, checked_array<256, ubits<1>> &main_i, + checked_array<256, ubits<1>> &main_e); +void setup_jbay_no_multi_write(bitvec phv_allow_bitwise_or, bitvec phv_allow_clear_on_write, + checked_array<256, ubits<1>> &nmw_i, + checked_array<256, ubits<1>> &nmw_e); +void setup_jbay_clear_on_write(bitvec phv_allow_clear_on_write, checked_array<128, ubits<1>> &left, + checked_array<128, ubits<1>> &right, + checked_array<256, ubits<1>> &main_i, + checked_array<256, ubits<1>> &main_e); + +#endif /* PARSER_TOFINO_JBAY_H_ */ diff --git a/backends/tofino/bf-asm/parser.h b/backends/tofino/bf-asm/parser.h new file mode 100644 index 00000000000..e49e79025b4 --- /dev/null +++ b/backends/tofino/bf-asm/parser.h @@ -0,0 +1,45 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_PARSER_H_ +#define BACKENDS_TOFINO_BF_ASM_PARSER_H_ + +#include "asm-types.h" +#include "backends/tofino/bf-asm/json.h" +#include "backends/tofino/bf-asm/target.h" +#include "sections.h" +#include "vector.h" + +/** + * @brief Base class of Tofino parser in assembler + * + * For Tofino 1/2, the class Parser is derived. + */ +class BaseParser : virtual public Configurable { + protected: + int lineno = -1; +}; + +/** + * @brief Base class of parser assembly section + */ +class BaseAsmParser : public Section { + public: + explicit BaseAsmParser(const char *name_) : Section(name_) {} +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_PARSER_H_ */ diff --git a/backends/tofino/bf-asm/phase0.cpp b/backends/tofino/bf-asm/phase0.cpp new file mode 100644 index 00000000000..4e182ae28c1 --- /dev/null +++ b/backends/tofino/bf-asm/phase0.cpp @@ -0,0 +1,93 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "parser-tofino-jbay.h" + +DEFINE_TABLE_TYPE(Phase0MatchTable) + +void Phase0MatchTable::setup(VECTOR(pair_t) & data) { + for (auto &kv : MapIterChecked(data)) { + if (common_setup(kv, data, P4Table::MatchEntry)) { + } else if (auto *fmt = get(data, "format")) { + if (CHECKTYPEPM(*fmt, tMAP, fmt->map.size > 0, "non-empty map")) + format.reset(new Format(this, fmt->map)); + } else if (kv.key == "size") { + if (CHECKTYPE(kv.value, tINT)) size = kv.value.i; + } else if (kv.key == "constant_value") { + if (CHECKTYPE(kv.value, tINT)) constant_value = kv.value.i; + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (gress != INGRESS || stage->stageno != 0) + error(lineno, "Phase 0 match table can only be in stage 0 ingress"); +} + +void Phase0MatchTable::pass1() { + LOG1("### Phase 0 match table " << name() << " pass1 " << loc()); + MatchTable::pass1(); + if (actions) actions->pass1(this); +} + +void Phase0MatchTable::pass2() { LOG1("### Phase 0 match table " << name() << " pass2 " << loc()); } + +void Phase0MatchTable::pass3() { LOG1("### Phase 0 match table " << name() << " pass3 " << loc()); } + +template +void Phase0MatchTable::write_regs_vt(REGS &) { + LOG1("### Phase 0 match table " << name() << " write_regs " << loc()); +} + +void Phase0MatchTable::gen_tbl_cfg(json::vector &out) const { + json::map &tbl = *base_tbl_cfg(out, "match_entry", p4_table ? 
p4_table->size : size); + common_tbl_cfg(tbl); + tbl["statistics_table_refs"] = json::vector(); + tbl["meter_table_refs"] = json::vector(); + tbl["selection_table_refs"] = json::vector(); + tbl["stateful_table_refs"] = json::vector(); + tbl["action_data_table_refs"] = json::vector(); + json::map &match_attributes = tbl["match_attributes"] = json::map(); + json::map &stage_tbl = *add_stage_tbl_cfg(match_attributes, "phase_0_match", size); + match_attributes["match_type"] = "phase_0_match"; + stage_tbl["stage_number"] = -1; + // Associate the phase0 table with corresponding parser. This is used in a + // multi parser scenario which has multiple phase0 tables + // and the handle is used by the driver to link the phase0 table to the + // parser. + auto parser_handle = Parser::get_parser_handle(name()); + if (parser_handle > 0) stage_tbl["parser_handle"] = parser_handle; + stage_tbl.erase("logical_table_id"); + stage_tbl.erase("default_next_table"); + stage_tbl.erase("has_attached_gateway"); + auto &mra = stage_tbl["memory_resource_allocation"] = json::map(); + mra["memory_type"] = "ingress_buffer"; + json::map tmp; + (tmp["vpns"] = json::vector()).push_back(INT64_C(0)); + (tmp["memory_units"] = json::vector()).push_back(INT64_C(0)); + (mra["memory_units_and_vpns"] = json::vector()).push_back(std::move(tmp)); + // Driver looks at the pack format to determine the fields and their + // positions. Since phase0 is only mimicking a table, the driver expects to + // have a single entry within the pack format. 
+ bool pad_zeros = false; + bool print_fields = true; + add_pack_format(stage_tbl, format.get(), pad_zeros, print_fields); + if (actions) actions->gen_tbl_cfg(tbl["actions"]); + if (context_json) stage_tbl.merge(*context_json); +} diff --git a/backends/tofino/bf-asm/phv.cpp b/backends/tofino/bf-asm/phv.cpp new file mode 100644 index 00000000000..b7fcf49d0a9 --- /dev/null +++ b/backends/tofino/bf-asm/phv.cpp @@ -0,0 +1,496 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "phv.h" + +#include +#include + +#include "lib/log.h" +#include "misc.h" + +Phv Phv::phv; +const Phv::Register Phv::Slice::invalid("", Phv::Register::NORMAL, 0, ~0, 0); + +void Phv::init_phv(target_t target_type) { + if (target) { + BUG_CHECK(target->type() == target_type); // sanity check + return; + } + switch (target_type) { +#define INIT_FOR_TARGET(TARGET) \ + case Target::TARGET::tag: \ + target = new Target::TARGET::Phv; \ + break; + FOR_ALL_TARGETS(INIT_FOR_TARGET) + default: + BUG(); + } +#undef INIT_FOR_TARGET + target->init_regs(*this); +} + +void Phv::start(int lineno, VECTOR(value_t) args) { + if (options.target == NO_TARGET) { + error(lineno, "No target specified prior to PHV section"); + return; + } + init_phv(options.target); + // The only argument to phv is the thread. We allow phv section with no thread argument + // which defines aliases for all threads. 
Does that really make sense when threads can't + // share registers? We never use this capability in the compiler. + if (args.size > 1 || (args.size == 1 && args[0] != "ingress" && args[0] != "egress" && + (args[0] != "ghost" || options.target < JBAY))) + error(lineno, "phv can only be ingress%s or egress", + (options.target >= JBAY ? ", ghost" : 0)); +} + +int Phv::addreg(gress_t gress, const char *name, const value_t &what, int stage, int max_stage) { + std::string phv_name = name; + remove_name_tail_range(phv_name); + if (stage == -1 && what.type == tMAP) { + int rv = 0; + for (auto &kv : what.map) { + auto &key = kv.key.type == tCMD && kv.key.vec.size > 1 && kv.key == "stage" ? kv.key[1] + : kv.key; + if (CHECKTYPE2(key, tINT, tRANGE)) { + if (key.type == tINT) + rv |= addreg(gress, name, kv.value, key.i); + else + rv |= addreg(gress, name, kv.value, key.range.lo, key.range.hi); + } + } + int size = -1; + PerStageInfo *prev = 0; + for (auto &ch : names[gress].at(name)) { + if (prev) { + if (prev->max_stage >= ch.first) { + if (prev->max_stage != INT_MAX) + error(what.lineno, "Overlapping assignments in stages %d..%d for %s", + ch.first, prev->max_stage, name); + prev->max_stage = ch.first - 1; + } + } + prev = &ch.second; + if (size < 0) { + size = ch.second.slice->size(); + } else if (size != ch.second.slice->size() && size > 0) { + error(what.lineno, "Inconsitent sizes for %s", name); + size = 0; + } + } + if (prev && prev->max_stage >= Target::NUM_MAU_STAGES()) prev->max_stage = INT_MAX; + add_phv_field_sizes(gress, phv_name, size); + return rv; + } + if (!CHECKTYPE2M(what, tSTR, tCMD, "register or slice")) return -1; + auto reg = what.type == tSTR ? 
what.s : what[0].s; + if (const Slice *sl = get(gress, stage, reg)) { + if (sl->valid) { + phv_use[gress][sl->reg.uid] = true; + user_defined[&sl->reg].first = gress; + if (max_stage != INT_MAX) { + /* a name that spans across stages - add it to all stages */ + for (int i = stage; i <= max_stage; i++) { + user_defined[&sl->reg].second[i].insert(name); + } + } else { + for (int i = 0; i <= Target::NUM_MAU_STAGES(); i++) { + user_defined[&sl->reg].second[i].insert(name); + } + } + LOG5(" Adding " << name << " to user_defined"); + } + auto ® = names[gress][name]; + if (what.type == tSTR) { + reg[stage].slice = *sl; + } else if (what.vec.size != 2) { + error(what.lineno, "Syntax error, expecting bit or slice"); + return -1; + } else if (!CHECKTYPE2M(what[1], tINT, tRANGE, "bit or slice")) { + return -1; + } else if (what[1].type == tINT) { + reg[stage].slice = Slice(*sl, what[1].i, what[1].i); + } else { + reg[stage].slice = Slice(*sl, what[1].range.lo, what[1].range.hi); + } + reg[stage].max_stage = max_stage; + if (!reg[stage].slice.valid) { + auto slice = reg[stage].slice; + error(what.lineno, "Invalid register slice - %s[%d:%d]", slice.reg.name, slice.hi, + slice.lo); + return -1; + } + if (stage == -1) { + add_phv_field_sizes(gress, phv_name, reg[stage].slice->size()); + if (is_pov(phv_name)) { + phv_pov_names[sl->reg.mau_id()][reg[stage].slice.lo] = phv_name; + } + } + return 0; + } else { + error(what.lineno, "No register named %s", reg); + return -1; + } +} + +void Phv::input(VECTOR(value_t) args, value_t data) { + if (!CHECKTYPE(data, tMAP)) return; + gress_t gress = + args[0] == "ingress" ? INGRESS + : args[0] == "egress" ? EGRESS + : args[0] == "ghost" && options.target >= JBAY + ? 
GHOST + : (error(args[1].lineno, "Invalid thread %s", value_desc(args[1])), INGRESS); + for (auto &kv : data.map) { + if (!CHECKTYPE(kv.key, tSTR)) continue; + if (kv.key == "context_json") { + if (!CHECKTYPE(kv.value, tMAP)) continue; + field_context_json.merge(*toJson(kv.value.map)); + } else { + if (get(gress, INT_MAX, kv.key.s) || (!args.size && get(EGRESS, INT_MAX, kv.key.s)) || + (!args.size && get(GHOST, INT_MAX, kv.key.s))) { + error(kv.key.lineno, "Duplicate phv name '%s'", kv.key.s); + continue; + } + if (!addreg(gress, kv.key.s, kv.value) && args.size == 0) { + addreg(EGRESS, kv.key.s, kv.value); + if (options.target >= JBAY) addreg(GHOST, kv.key.s, kv.value); + } + } + } +} + +Phv::Ref::Ref(gress_t g, int stage, const value_t &n) + : gress_(g), stage(stage), lo(-1), hi(-1), lineno(n.lineno) { + if (CHECKTYPE2M(n, tSTR, tCMD, "phv or register reference or slice")) { + if (n.type == tSTR) { + name_ = n.s; + } else { + name_ = n[0].s; + if (PCHECKTYPE2M(n.vec.size == 2, n[1], tINT, tRANGE, "register slice")) { + if (n[1].type == tINT) { + lo = hi = n[1].i; + } else { + lo = n[1].range.lo; + hi = n[1].range.hi; + if (lo > hi) { + lo = n[1].range.hi; + hi = n[1].range.lo; + } + } + } + } + } +} + +Phv::Ref::Ref(const Phv::Register &r, gress_t gr, int l, int h) + : gress_(gr), name_(r.name), stage(0), lo(l), hi(h < 0 ? 
l : h), lineno(-1) {} + +bool Phv::Ref::merge(const Phv::Ref &r) { + if (r.name_ != name_ || r.gress_ != gress_) return false; + if (lo < 0) return true; + if (r.lo < 0) { + lo = hi = -1; + return true; + } + if (r.hi + 1 < lo || hi + 1 < r.lo) return false; + if (r.lo < lo) lo = r.lo; + if (r.hi > hi) { + lineno = r.lineno; + hi = r.hi; + } + return true; +} + +void merge_phv_vec(std::vector &vec, const Phv::Ref &r) { + int merged = -1; + for (int i = 0; (unsigned)i < vec.size(); i++) { + if (merged >= 0) { + if (vec[merged].merge(vec[i])) { + vec.erase(vec.begin() + i); + --i; + } + } else if (vec[i].merge(r)) { + merged = i; + } + } + if (merged < 0) vec.push_back(r); +} + +void merge_phv_vec(std::vector &v1, const std::vector &v2) { + for (auto &r : v2) merge_phv_vec(v1, r); +} + +std::vector split_phv_bytes(const Phv::Ref &r) { + std::vector rv; + const auto &sl = *r; + for (unsigned byte = sl.lo / 8U; byte <= sl.hi / 8U; byte++) { + int lo = byte * 8 - sl.lo; + int hi = lo + 7; + if (lo < 0) lo = 0; + if (hi >= static_cast(sl.size())) hi = sl.size() - 1; + rv.emplace_back(r, lo, hi); + } + return rv; +} + +std::vector split_phv_bytes(const std::vector &v) { + std::vector rv; + for (auto &r : v) append(rv, split_phv_bytes(r)); + return rv; +} + +std::string Phv::Ref::toString() const { + std::stringstream str; + str << *this; + return str.str(); +} + +void Phv::Ref::dbprint(std::ostream &out) const { + out << name_; + if (lo >= 0) { + out << '[' << hi; + if (hi != lo) out << ":" << lo; + out << ']'; + } + Slice sl(**this); + if (sl.valid) { + out << '['; + sl.dbprint(out); + out << ']'; + } +} + +std::string Phv::Ref::desc() const { return toString(); } + +std::string Phv::Slice::toString() const { + std::stringstream str; + str << *this; + return str.str(); +} + +void Phv::Slice::dbprint(std::ostream &out) const { + if (valid) { + out << reg.name; + if (lo != 0 || hi != reg.size - 1) { + out << '[' << hi; + if (hi != lo) out << ":" << lo; + out << ']'; + } + 
} else { + out << ""; + } +} + +std::string Phv::db_regset(const bitvec &s) { + std::string rv; + for (int reg : s) { + if (!rv.empty()) rv += ", "; + rv += Phv::reg(reg)->name; + } + return rv; +} + +// For snapshot, the driver (generate pd script) generates a buffer of all phv +// fields and indexes through the buffer with a position offset to determine its +// location. It assumes the phv fields are arranged with the pov fields at the +// end. To maintain this ordering while generating the position offsets for each +// phv field, we initially generate 2 separate maps for normal and pov phv +// fields. We loop through the normap phv map first and then the pov phv map +// adding field sizes. The fields are byte aligned and put into 8/16/32 bit +// containers. +int Phv::get_position_offset(gress_t gress, std::string name) { + int position_offset = 0; + for (auto f : phv_field_sizes[gress]) { + if (f.first == name) return position_offset; + auto bytes_to_add = (f.second + 7) / 8U; + if (bytes_to_add == 3) bytes_to_add++; + position_offset += bytes_to_add; + } + for (auto f : phv_pov_field_sizes[gress]) { + if (f.first == name) return position_offset; + // POV should be single bit + BUG_CHECK(f.second == 1); + position_offset += 1; + } + return 0; +} + +// Output function sets the 'phv_allocation' node in context json Contains info +// on phv containers per gress (INGRESS/EGRESS) per stage Currently the phv +// containers are assumed to be present in all stages hence are replicated in +// each stage. Support for liveness indication for each container must be added +// (in assembly syntax/compiler) to set per stage phv containers correctly. 
+void Phv::output(json::map &ctxt_json) { + bool warn_once = false; + json::vector &phv_alloc = ctxt_json["phv_allocation"]; + for (int i = 0; i <= Target::NUM_MAU_STAGES(); i++) { + json::map phv_alloc_stage; + json::vector &phv_alloc_stage_ingress = phv_alloc_stage["ingress"] = json::vector(); + json::vector &phv_alloc_stage_egress = phv_alloc_stage["egress"] = json::vector(); + for (auto &slot : phv.user_defined) { + unsigned phv_number = slot.first->uid; + unsigned phv_container_size = slot.first->size; + gress_t gress = slot.second.first; + auto stage_usernames = slot.second.second[i]; + json::map phv_container; + phv_container["phv_number"] = phv_number; + phv_container["container_type"] = slot.first->type_to_string(); + json::vector &phv_records = phv_container["records"] = json::vector(); + for (auto field_name : stage_usernames) { + LOG5("Output phv record for field : " << field_name); + unsigned phv_lsb = 0, phv_msb = 0; + unsigned field_lo = 0; + int field_size = 0; + json::map phv_record; + auto sl = get(gress, i, field_name); + if (!sl) continue; + phv_lsb = sl->lo; + phv_msb = sl->hi; + field_lo = remove_name_tail_range(field_name, &field_size); + auto field_width = get_phv_field_size(gress, field_name); + if (field_size == 0) field_size = field_width; + phv_record["position_offset"] = get_position_offset(gress, field_name); + phv_record["field_name"] = field_name; + phv_record["field_msb"] = field_lo + field_size - 1; + phv_record["field_lsb"] = field_lo; + auto field_width_bytes = (field_width + 7) / 8U; + phv_record["field_width"] = field_width_bytes; + phv_record["phv_msb"] = phv_msb; + phv_record["phv_lsb"] = phv_lsb; + // FIXME-P4C: 'is_compiler_generated' is set to false for all + // fields except POV as there is no sure way of knowing from + // current assembly syntax whether the field is in the header or + // generated by the compiler. This will require additional + // assembly syntax to convey the same. 
Driver does not use + // is_compiler_generated (other than requiring it). p4i does + // use it for display purposes. + phv_record["is_compiler_generated"] = false; + phv_record["is_pov"] = false; + if (is_pov(field_name)) { + phv_record["is_pov"] = true; + phv_record["is_compiler_generated"] = true; + phv_record["field_width"] = 0; + phv_record["position_offset"] = 0; + /* Now that we know that this record is representing a POV, overwrite the + * phv_record to call it "POV" and get rid of "$valid" */ + phv_record["field_name"] = "POV"; + json::vector &pov_headers = phv_record["pov_headers"] = json::vector(); + json::map pov_header; + pov_header["bit_index"] = phv_lsb; + pov_header["position_offset"] = get_position_offset(gress, field_name); + pov_header["header_name"] = field_name; + // FIXME: Checks for reserved POV bits, not supported? + pov_header["hidden"] = false; + ; + pov_headers.push_back(std::move(pov_header)); + } + // Pass through per-field context_json information from the compiler. + if (field_context_json.count(slot.first->name)) { + auto add_phv_record_items = [&](int live_stage, std::string live_string) { + if (live_stage == -1) { + phv_record[live_string] = "parser"; + return; + } + if (live_stage == Target::NUM_MAU_STAGES()) { + phv_record[live_string] = "deparser"; + return; + } + phv_record[live_string] = live_stage; + }; + auto container_json = field_context_json[slot.first->name]; + BUG_CHECK(container_json); + bool field_added = false; + if (!container_json->as_vector()) { + // FIXME -- should be flexible about parsing context_json -- continue + // to accept a map instead of a vector here. + if (!warn_once) { + // FIXME -- would be nice to have the bfa lineno here. 
+ warning(-1, "Invalid/obsolete phv context_json:, ignoring"); + warn_once = true; + } + continue; + } + for (auto &field_json : *container_json->as_vector()) { + auto live_start = -1, live_end = Target::NUM_MAU_STAGES(); + auto container_field_json = field_json->as_map(); + if (container_field_json->count("name")) { + if ((*container_field_json)["name"] != field_name) continue; + } else { + continue; + } + if (container_field_json->count("live_start")) { + auto live_start_json = (*container_field_json)["live_start"]; + if (auto n = live_start_json->as_number()) live_start = n->val; + } + if (container_field_json->count("live_end")) { + auto live_end_json = (*container_field_json)["live_end"]; + if (auto n = live_end_json->as_number()) live_end = n->val; + } + if (i >= live_start && i <= live_end) { + add_phv_record_items(live_start, "live_start"); + add_phv_record_items(live_end, "live_end"); + phv_record["mutually_exclusive_with"] = json::vector(); + if (container_field_json->count("mutually_exclusive_with")) { + auto mutex_json = + (*container_field_json)["mutually_exclusive_with"]; + if (json::vector *mutex_json_vec = mutex_json->as_vector()) + phv_record["mutually_exclusive_with"] = + std::move(*mutex_json_vec); + } + field_added = true; + // Skip duplicates + if (!std::any_of(phv_records.begin(), phv_records.end(), + [&phv_record](std::unique_ptr &r) { + return *r == phv_record; + })) + phv_records.push_back(phv_record.clone()); + } + } + if (!field_added) { + auto live_start = -1, live_end = Target::NUM_MAU_STAGES(); + add_phv_record_items(live_start, "live_start"); + add_phv_record_items(live_end, "live_end"); + phv_record["mutually_exclusive_with"] = json::vector(); + phv_records.push_back(phv_record.clone()); + } + } else { + phv_records.push_back(std::move(phv_record)); + } + } + phv_container["word_bit_width"] = phv_container_size; + // Ghost phv's are considered as ingress phv's + if (phv_records.size() > 0) { + if ((gress == INGRESS) || (gress == 
GHOST)) { + phv_alloc_stage_ingress.push_back(std::move(phv_container)); + } else if (gress == EGRESS) { + phv_alloc_stage_egress.push_back(std::move(phv_container)); + } + } + } + phv_alloc_stage["stage_number"] = i; + phv_alloc.push_back(std::move(phv_alloc_stage)); + } + // FIXME: Fix json clone method to do above loops more efficiently + // for (int i = 0; i < Target::NUM_MAU_STAGES(); i++) { + // phv_alloc_stage["stage_number"] = i; + // phv_alloc.push_back(std::move(phv_alloc_stage.clone())); } +} + +#include "jbay/phv.cpp" // NOLINT(build/include) +#include "tofino/phv.cpp" // NOLINT(build/include) diff --git a/backends/tofino/bf-asm/phv.h b/backends/tofino/bf-asm/phv.h new file mode 100644 index 00000000000..133a72365c6 --- /dev/null +++ b/backends/tofino/bf-asm/phv.h @@ -0,0 +1,327 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_PHV_H_ +#define BACKENDS_TOFINO_BF_ASM_PHV_H_ + +#include +#include + +#include "backends/tofino/bf-asm/json.h" +#include "backends/tofino/bf-asm/target.h" +#include "bfas.h" +#include "lib/bitvec.h" +#include "match_source.h" +#include "misc.h" +#include "sections.h" + +class Phv : public Section { + void start(int lineno, VECTOR(value_t) args) override; + void input(VECTOR(value_t) args, value_t data) override; + void output(json::map &) override; + Phv() : Section("phv") {} + Phv(const Phv &) = delete; + Phv &operator=(const Phv &) = delete; + ~Phv() {} + static Phv phv; // singleton class + Target::Phv *target = nullptr; + FOR_ALL_TARGETS(FRIEND_TARGET_CLASS, ::Phv) + + public: + struct Register { + char name[8]; + enum type_t { NORMAL, TAGALONG, CHECKSUM, MOCHA, DARK } type; + // uid is used for "phv_number" in the context.json, but otherwise is just + // a unique id for the register, encoded differently for different targets + unsigned short index = 0, uid = 0, size = 0; + Register() { type = NORMAL; } + Register(const Register &) = delete; + Register &operator=(const Register &) = delete; + Register(const char *n, type_t t, unsigned i, unsigned u, unsigned s) + : type(t), index(i), uid(u), size(s) { + strncpy(name, n, sizeof(name)); + name[7] = 0; + } + bool operator==(const Register &a) const { return uid == a.uid; } + bool operator!=(const Register &a) const { return uid != a.uid; } + bool operator<(const Register &a) const { return uid < a.uid; } + virtual int parser_id() const { return -1; } + virtual int mau_id() const { return -1; } + virtual int ixbar_id() const { return -1; } + virtual int deparser_id() const { return -1; } + /// return a string representation based on the container type + const char *type_to_string() const { + switch (type) { + case NORMAL: + return "normal"; + case TAGALONG: + return "tagalong"; + case CHECKSUM: + return "checksum"; + case MOCHA: 
+ return "mocha"; + case DARK: + return "dark"; + } + return ""; + } + }; + class Slice : public IHasDbPrint { + static const Register invalid; + + public: + const Register ® + int lo = -1, hi = -1; + bool valid; + Slice() : reg(invalid), valid(false) {} + Slice(const Register &r, int l, int h) : reg(r), lo(l), hi(h) { + valid = lo >= 0 && hi >= lo && hi < reg.size; + } + Slice(const Register &r, int b) : reg(r), lo(b), hi(b) { + valid = lo >= 0 && hi >= lo && hi < reg.size; + } + Slice(const Slice &s, int l, int h) : reg(s.reg), lo(s.lo + l), hi(s.lo + h) { + valid = lo >= 0 && hi >= lo && hi <= s.hi && hi < reg.size; + } + Slice(const Slice &) = default; + explicit operator bool() const { return valid; } + Slice &operator=(const Slice &a) { + new (this) Slice(a.reg, a.lo, a.hi); + return *this; + } + const Slice *operator->() const { return this; } + bool operator==(const Slice &s) const { + return valid && s.valid && reg.uid == s.reg.uid && lo == s.lo && hi == s.hi; + } + bool operator<(const Slice &a) const { + if (reg.uid < a.reg.uid) return true; + if (reg.uid > a.reg.uid) return false; + if (lo < a.lo) return true; + if (lo > a.lo) return false; + return (hi < a.hi); + } + bool overlaps(const Slice &a) const { + return valid && a.valid && reg.uid == a.reg.uid && lo <= a.hi && a.lo <= hi; + } + unsigned size() const { return valid ? hi - lo + 1 : 0; } + std::string toString() const; + void dbprint(std::ostream &out) const; + }; + + protected: + // registers indexed according to MAU id + std::vector regs; + std::map> phv_pov_names; + struct PerStageInfo { + int max_stage = INT_MAX; + Slice slice; + }; + std::map> names[3]; + + private: + typedef std::map> user_stagenames_t; + std::map, ptrless> + user_defined; + bitvec phv_use[3]; + std::map phv_field_sizes[3]; + std::map phv_pov_field_sizes[3]; + + // Maps P4-level field names (i.e. returned by stack_asm_name_to_p4()) to a + // map to be embedded in the field's context_json "records" node. 
+ json::map field_context_json; + + void init_phv(target_t); + bool is_pov(std::string name) { + // There are 2 types of POV bits we are interested in + // Either ending with .$valid or .$deparse... + return (name.find(".$valid") != std::string::npos || + name.find(".$deparse") != std::string::npos); + } + void gen_phv_field_size_map(); + int addreg(gress_t gress, const char *name, const value_t &what, int stage = -1, + int max_stage = INT_MAX); + int get_position_offset(gress_t gress, std::string name); + void add_phv_field_sizes(gress_t gress, std::string name, int size) { + auto &phv_field_map = is_pov(name) ? phv_pov_field_sizes : phv_field_sizes; + phv_field_map[gress][name] += size; + } + int get_phv_field_size(gress_t gress, std::string name) { + if (phv_field_sizes[gress].count(name) > 0) return phv_field_sizes[gress][name]; + if (phv_pov_field_sizes[gress].count(name) > 0) return phv_pov_field_sizes[gress][name]; + return 0; + } + + public: + static const Slice *get(gress_t gress, int stage, const std::string &name) { + phv.init_phv(options.target); + auto phvIt = phv.names[gress].find(name); + if (phvIt == phv.names[gress].end()) return 0; + auto &per_stage = phvIt->second; + auto it = per_stage.upper_bound(stage); + if (it == per_stage.begin()) { + if (it == per_stage.end() || stage != -1) return 0; + } else { + --it; + } + if (stage > it->second.max_stage) return 0; + return &it->second.slice; + } + static const Slice *get(gress_t gress, int stg, const char *name) { + return get(gress, stg, std::string(name)); + } + class Ref : public MatchSource { + protected: + gress_t gress_; + std::string name_; + int stage = -1; + int lo = -1, hi = -1; + + public: + int lineno; + Ref() : gress_(INGRESS), lineno(-1) {} + Ref(gress_t g, int stage, const value_t &n); + Ref(gress_t g, int stage, int line, const std::string &n, int l, int h) + : gress_(g), name_(n), stage(stage), lo(l), hi(h), lineno(line) {} + Ref(const Ref &r, int l, int h) + : gress_(r.gress_), + 
name_(r.name_), + stage(r.stage), + lo(r.lo < 0 ? l : r.lo + l), + hi(r.lo < 0 ? h : r.lo + h), + lineno(r.lineno) { + BUG_CHECK(r.hi < 0 || hi <= r.hi); + } + Ref(const Register &r, gress_t gr, int lo = -1, int hi = -1); + explicit operator bool() const { return lineno >= 0; } + Slice operator*() const { + if (auto *s = phv.get(gress_, stage, name_)) { + if (hi >= 0) return Slice(*s, lo, hi); + return *s; + } else { + error(lineno, "No phv record %s (%s, stage %d)", name_.c_str(), + gress_ == INGRESS ? "INGRESS" : "EGRESS", stage); + phv.get(gress_, stage, name_); + return Slice(); + } + } + bool operator<(const Ref &r) const { + return (**this).reg.parser_id() < (*r).reg.parser_id(); + } + Slice operator->() const { return **this; } + bool operator==(const Ref &a) const { + if (name_ == a.name_ && lo == a.lo && hi == a.hi) return true; + return **this == *a; + } + bool check(bool err = true) const { + if (auto *s = phv.get(gress_, stage, name_)) { + if (hi >= 0 && !Slice(*s, lo, hi).valid) { + error(lineno, "Invalid slice of %s", name_.c_str()); + return false; + } + return true; + } else if (lineno >= 0 && err) { + error(lineno, "No phv record %s", name_.c_str()); + } + return false; + } + gress_t gress() const { return gress_; } + const char *name() const override { return name_.c_str(); } + std::string desc() const; + int lobit() const { return lo < 0 ? 0 : lo; } + int hibit() const { return hi < 0 ? 
(**this).size() - 1 : hi; } + unsigned size() const override { + if (lo >= 0) return hi - lo + 1; + if (auto *s = phv.get(gress_, stage, name_)) return s->size(); + return 0; + } + bool merge(const Ref &r); + std::string toString() const override; + void dbprint(std::ostream &out) const; + + int get_lineno() const override { return lineno; } + int fieldlobit() const override { return lobit(); } + int fieldhibit() const override { return hibit(); } + int slicelobit() const override { return (**this).lo; } + int slicehibit() const override { return (**this).hi; } + }; + // Return register using mau_id as @arg index + static const Register *reg(int idx) { + BUG_CHECK(idx >= 0 && size_t(idx) < phv.regs.size()); + return phv.regs[idx]; + } + + static const Register *reg(std::string name) { + for (auto ® : phv.regs) + if (reg->name == name) return reg; + return nullptr; + } + + // Return the number registers + static int num_regs() { return phv.regs.size(); } + + // Return POV name allocated in @arg reg at @arg index + static const std::string get_pov_name(int reg, int index) { + if (phv.phv_pov_names.count(reg) && phv.phv_pov_names.at(reg).count(index)) + return phv.phv_pov_names[reg][index]; + return " "; + } + + static const bitvec &use(gress_t gress) { return phv.phv_use[gress]; } + static void setuse(gress_t gress, const bitvec &u) { phv.phv_use[gress] |= u; } + static void unsetuse(gress_t gress, const bitvec &u) { phv.phv_use[gress] -= u; } + static std::string db_regset(const bitvec &s); + static unsigned mau_groupsize(); + + // Return all field names in @arg reg at @arg stage + static const std::set &aliases(const Register *reg, int stage) { + static std::set empty; + if (!phv.user_defined.count(reg)) return empty; + auto &m = phv.user_defined.at(reg).second; + auto it = m.upper_bound(stage); + if (it == m.begin()) return empty; + return (--it)->second; + } + + // For use by gtests + static void test_clear() { + phv.target = nullptr; + phv.regs.clear(); + 
phv.phv_pov_names.clear(); + phv.names[INGRESS].clear(); + phv.names[EGRESS].clear(); + phv.names[GHOST].clear(); + } +}; + +extern void merge_phv_vec(std::vector &vec, const Phv::Ref &r); +extern void merge_phv_vec(std::vector &v1, const std::vector &v2); +extern std::vector split_phv_bytes(const Phv::Ref &r); +extern std::vector split_phv_bytes(const std::vector &v); + +class Target::Phv { + friend class ::Phv; + virtual void init_regs(::Phv &) = 0; + virtual target_t type() const = 0; + virtual unsigned mau_groupsize() const = 0; +}; + +inline unsigned Phv::mau_groupsize() { return phv.target->mau_groupsize(); } + +#include "jbay/phv.h" +#include "tofino/phv.h" + +#endif /* BACKENDS_TOFINO_BF_ASM_PHV_H_ */ diff --git a/backends/tofino/bf-asm/power_ctl.h b/backends/tofino/bf-asm/power_ctl.h new file mode 100644 index 00000000000..cbae87a075b --- /dev/null +++ b/backends/tofino/bf-asm/power_ctl.h @@ -0,0 +1,65 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_POWER_CTL_H_ +#define BACKENDS_TOFINO_BF_ASM_POWER_CTL_H_ + +#include "misc.h" + +/* power_ctl is weirdly encoded! + * As far as I can tell, it actually walks like this: + * -[1:0] dimension controls hi-lo for each 8/16/32b type. In other words, + * [0] = 8b[31~0], 16b[47~0], 32b[31~0] and [1] = 8b[63~32], 16b[95~48], 32[63~32]. 
+ * -Within the wider dimension, [13:0] = 112b vector, where [31:0] = control for + * 32b section (array slice 3~0), [63:32] = control for 8b section (array slice 7~4), + * [111:64] = control for 16b section (array slice 13~8) + * + * Yes, Jay's decription of how the [1~0][13~0] translates to 224b is correct. + * The [1~0] index discriminates phv words going to the left side alu's [0] + * vs the right side ones [1]. Within each container size, the bottom 32 + * (or 48 for 16b) are on the left and the top half ones are on the right. + * Pat + * + * CSR DESCRIPTION IS WRONG!!! + */ + +template +void set_power_ctl_reg(checked_array<2, checked_array<16, ubits>> &power_ctl, int reg) { + int side = 0; + switch (reg / (I * 8)) { + case 1: // 8 bit + reg -= I * 8; + side = reg / (I * 4); + reg = (reg % (I * 4)) + (I * 4); + break; + case 2: + case 3: // 16 bit + reg -= I * 16; + side = reg / (I * 6); + reg = (reg % (I * 6)) + (I * 8); + break; + case 0: // 32 bit + side = reg / (I * 4); + reg = (reg % (I * 4)); + break; + default: + BUG(); + } + power_ctl[side][reg / I] |= 1U << reg % I; +} + +#endif /* BACKENDS_TOFINO_BF_ASM_POWER_CTL_H_ */ diff --git a/backends/tofino/bf-asm/primitives.cpp b/backends/tofino/bf-asm/primitives.cpp new file mode 100644 index 00000000000..d9ee4a885a7 --- /dev/null +++ b/backends/tofino/bf-asm/primitives.cpp @@ -0,0 +1,165 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include "backends/tofino/bf-asm/json.h" +#include "bfas.h" +#include "lib/log.h" +#include "sections.h" + +class Primitives : public Section { + int lineno = -1; + std::unique_ptr _primitives = nullptr; + std::string _primitivesFileName; + + Primitives() : Section("primitives") {} + + void input(VECTOR(value_t) args, value_t data) { + lineno = data.lineno; + if (!CHECKTYPE(data, tSTR)) return; + _primitivesFileName = data.s; + } + + void process() { + if (_primitivesFileName.empty()) return; + std::ifstream inputFile(_primitivesFileName); + if (!inputFile && _primitivesFileName[0] != '/') + inputFile.open(asmfile_dir + "/" + _primitivesFileName); + if (!inputFile) { + warning(lineno, "%s: can't read file", _primitivesFileName.c_str()); + } else { + inputFile >> _primitives; + if (!inputFile) { + warning(lineno, "%s: not valid primitives json representation", + _primitivesFileName.c_str()); + _primitives.reset(new json::map()); + } + } + } + + bool merge_actions(json::vector &_prim_actions, json::vector &ctxt_actions) { + bool merged = false; + for (auto &_prim_action : _prim_actions) { + for (auto &ctxt_action : ctxt_actions) { + if (*ctxt_action->to()["name"] == + *_prim_action->to()["name"]) { + ctxt_action->to().merge(_prim_action->to()); + merged = true; + auto aname = ctxt_action->to()["name"]->to(); + LOG3("Merged primitive action : " << aname); + break; + } + } + } + return merged; + } + + // If primitives json is present this function will merge the primitives + // nodes in the correct table->actions->action node The 'primitives' section + // is run last so we have already populated the context json tables at this + // stage. 
We check for the following tree structures to merge the action + // nodes + // Structure 1 (Match Tables) + // tables + // | + // |--> table0 + // |--> name + // |--> actions + // | + // |--> action0 + // | + // |--> name + // |--> primitives (merge here) + // Structure 2 (ALPM Tables) + // tables + // | + // |--> table0 + // |--> name + // |--> match_attributes + // | + // |--> pre_classifier + // | + // |--> actions + // | + // |--> action0 + // | + // |--> name + // |--> primitives (merge here) + // We can have multiple tables with the same name but one without + // and other with actions node e.g. stateful & its associated match table. + // In this case we want to merge with match table since it has the actions + // node + void output(json::map &ctxtJson) { + if (_primitives) { + json::vector &prim_tables = _primitives->to()["tables"]; + json::vector &ctxt_tables = ctxtJson["tables"]; + for (auto &prim_table : prim_tables) { + json::string prim_table_name = + prim_table->to()["name"]->to(); + bool is_merged = false; + json::string ctxt_table_name; + for (auto &ctxt_table : ctxt_tables) { + ctxt_table_name = ctxt_table->to()["name"]->to(); + if (prim_table_name == ctxt_table_name) { + if ((ctxt_table->to().count("actions") > 0) && + (prim_table->to().count("actions") > 0)) { + json::vector &prim_table_actions = + prim_table->to()["actions"]; + json::vector &ctxt_table_actions = + ctxt_table->to()["actions"]; + is_merged = merge_actions(prim_table_actions, ctxt_table_actions); + break; + } else if ((ctxt_table->to().count("match_attributes") > 0) && + (prim_table->to().count("match_attributes") > 0)) { + json::map &prim_table_ma = + prim_table->to()["match_attributes"]; + json::map &ctxt_table_ma = + ctxt_table->to()["match_attributes"]; + if ((ctxt_table_ma.to().count("pre_classifier") > 0) && + (prim_table_ma.to().count("pre_classifier") > 0)) { + json::map &prim_table_pc = + prim_table_ma.to()["pre_classifier"]; + json::map &ctxt_table_pc = + 
ctxt_table_ma.to()["pre_classifier"]; + if ((ctxt_table_pc.to().count("actions") > 0) && + (prim_table_pc.to().count("actions") > 0)) { + json::vector &prim_table_actions = + prim_table_pc.to()["actions"]; + json::vector &ctxt_table_actions = + ctxt_table_pc.to()["actions"]; + LOG3("Merging primitive actions on table: " << prim_table_name); + is_merged = + merge_actions(prim_table_actions, ctxt_table_actions); + break; + } + } + } + } + } + if (!is_merged) { + warning(lineno, "No table named %s found to merge primitive info", + prim_table_name.c_str()); + } + } + } + } + + static Primitives singleton_primitives; +} Primitives::singleton_primitives; diff --git a/backends/tofino/bf-asm/proxy_hash.cpp b/backends/tofino/bf-asm/proxy_hash.cpp new file mode 100644 index 00000000000..13395de2d5c --- /dev/null +++ b/backends/tofino/bf-asm/proxy_hash.cpp @@ -0,0 +1,188 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" + +void ProxyHashMatchTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::MatchEntry); + for (auto &kv : MapIterChecked(data, {"meter", "stats", "stateful"})) { + if (common_setup(kv, data, P4Table::MatchEntry)) { + } else if (kv.key == "proxy_hash_group") { + if (CHECKTYPE(kv.value, tINT)) { + proxy_hash_group = kv.value.i; + } + } else if (kv.key == "proxy_hash_algorithm") { + if (CHECKTYPE(kv.value, tSTR)) { + proxy_hash_alg = kv.value.s; + } + } else if (kv.key == "search_bus" || kv.key == "result_bus") { + // already dealt with in Table::setup_layout via common_init_setup + } else { + common_sram_setup(kv, data); + } + } +} + +bool ProxyHashMatchTable::verify_match_key() { + for (auto &match_key : match) { + if (!dynamic_cast(match_key)) { + error(match_key->get_lineno(), "A proxy hash table %s has a non hash key", name()); + continue; + } + } + auto match_format = format->field("match"); + if (match_format && match.empty()) BUG_CHECK("Proxy hash table has no match"); + return error_count == 0; +} + +int ProxyHashMatchTable::determine_pre_byteswizzle_loc(MatchSource *ms, int lo, int hi, int word) { + return (ms->slicelobit() + lo) / 8; +} + +void ProxyHashMatchTable::pass1() { + LOG1("### Proxy Hash match table " << name() << " pass1 " << loc()); + SRamMatchTable::pass1(); +} + +void ProxyHashMatchTable::setup_ways() { + SRamMatchTable::setup_ways(); + for (auto &row : layout) { + int first_way = -1; + for (auto &unit : row.memunits) { + int way = way_map.at(unit).way; + if (first_way < 0) { + first_way = way; + } else if (ways[way].group_xme != ways[first_way].group_xme) { + error(row.lineno, + "Ways %d and %d of table %s share address bus on row %d, " + "but use different hash groups", + first_way, way, name(), row.row); + break; + } + } + } +} + +void 
ProxyHashMatchTable::setup_word_ixbar_group() { + word_ixbar_group.resize(match_in_word.size()); + for (size_t i = 0; i < match_in_word.size(); i++) { + // Basically the value per row/bus of rams.row.vh_xbar.exactmatch_row_vh_xbar_ctl, + // based on the diagram in uArch section 6.2.3 Exact Match Row Vertical/Horizontal (VH) + // Xbars + word_ixbar_group[i] = BYTE_XBAR_GROUPS + proxy_hash_group; + } +} + +void ProxyHashMatchTable::pass2() { + LOG1("### Proxy Hash match table " << name() << " pass2 " << loc()); + for (auto &ixb : input_xbar) ixb->pass2(); + setup_word_ixbar_group(); + + if (actions) actions->pass2(this); + if (gateway) gateway->pass2(); + if (idletime) idletime->pass2(); + if (format) format->pass2(this); + for (auto &hd : hash_dist) hd.pass2(this); +} + +void ProxyHashMatchTable::pass3() { + LOG1("### Proxy Hash match table " << name() << " pass3 " << loc()); +} + +template +void ProxyHashMatchTable::write_regs_vt(REGS ®s) { + LOG1("### Proxy Hash match table " << name() << " write_regs " << loc()); + SRamMatchTable::write_regs(regs); + + for (auto &row : layout) { + auto &rams_row = regs.rams.array.row[row.row]; + for (auto &unit : row.memunits) { + auto &way = way_map[unit]; + auto &ram = rams_row.ram[unit.col]; + ram.match_nibble_s0q1_enable = version_nibble_mask.getrange(way.word * 32U, 32); + ram.match_nibble_s1q0_enable = UINT64_C(0xffffffff); + } + } +} + +/** + * The purpose of this function is to add the proxy_hash_function cJSON node. This is used + * by the driver in order to build the match key for the proxy hash table. + * + * By using the group from the proxy hash table, only pull the relevant bits for the proxy + * hash lookup. 
+ */ +void ProxyHashMatchTable::add_proxy_hash_function(json::map &stage_tbl) const { + bitvec hash_matrix_use; + for (auto *match_key : match) { + hash_matrix_use.setrange(match_key->fieldlobit(), match_key->size()); + } + + json::map &proxy_hash_function = stage_tbl["proxy_hash_function"] = json::map(); + json::vector &hash_bits = proxy_hash_function["hash_bits"] = json::vector(); + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + auto *hash_group = input_xbar[0]->get_hash_group(proxy_hash_group); + if (hash_group) { + for (unsigned id : bitvec(hash_group->tables)) { + auto hash_table = input_xbar[0]->get_hash_table(id); + gen_hash_bits(hash_table, InputXbar::HashTable(InputXbar::HashTable::EXACT, id), + hash_bits, proxy_hash_group, hash_matrix_use); + } + proxy_hash_function["hash_function_number"] = proxy_hash_group; + proxy_hash_function["ghost_bit_to_hash_bit"] = json::vector(); + proxy_hash_function["ghost_bit_info"] = json::vector(); + } +} + +void ProxyHashMatchTable::gen_tbl_cfg(json::vector &out) const { + unsigned size = get_number_entries(); + json::map &tbl = *base_tbl_cfg(out, "match", size); + json::map &stage_tbl = *add_common_sram_tbl_cfgs(tbl, "exact", "proxy_hash_match"); + stage_tbl["memory_resource_allocation"] = nullptr; + // FIXME: stash_allocation being null is a placeholder until implemented. 
+ stage_tbl["stash_allocation"] = nullptr; + add_pack_format(stage_tbl, format.get(), true, false); + json::map &match_attributes = tbl["match_attributes"]; + match_attributes["uses_dynamic_key_masks"] = false; + if (ways.size() > 0) { + json::vector &way_stage_tables = stage_tbl["ways"] = json::vector(); + unsigned way_number = 0; + for (auto &way : ways) { + json::map way_tbl; + way_tbl["stage_number"] = stage->stageno; + way_tbl["way_number"] = way_number++; + way_tbl["stage_table_type"] = "hash_way"; + auto fmt_width = get_format_width(); + BUG_CHECK(fmt_width); + way_tbl["size"] = way.rams.size() / fmt_width * format->groups() * 1024; + add_pack_format(way_tbl, format.get(), false); + way_tbl["memory_resource_allocation"] = gen_memory_resource_allocation_tbl_cfg(way); + way_stage_tables.push_back(std::move(way_tbl)); + } + } + add_proxy_hash_function(stage_tbl); + stage_tbl["proxy_hash_algorithm"] = proxy_hash_alg; + int proxy_hash_width = 0; + for (auto m : match) { + proxy_hash_width += m->size(); + } + stage_tbl["proxy_hash_bit_width"] = proxy_hash_width; +} + +DEFINE_TABLE_TYPE(ProxyHashMatchTable) diff --git a/backends/tofino/bf-asm/reflow.cpp b/backends/tofino/bf-asm/reflow.cpp new file mode 100644 index 00000000000..83a7f25d9fb --- /dev/null +++ b/backends/tofino/bf-asm/reflow.cpp @@ -0,0 +1,113 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+#include
+#include
+
+// NOTE(review): #include targets and std::vector element types appear stripped
+// in this patch text (this tool needs iostream/fstream/string/vector, and the
+// vectors below are presumably std::vector<std::string>) -- confirm against
+// the original source.
+
+// Standalone filter that rejoins multi-line { } / [ ] groups in generated
+// output onto a single line when the merged result stays reasonably short.
+
+// Flush the buffered lines to `out` verbatim, one per line, and clear the buffer.
+void output_normal(std::ostream &out, std::vector &lines) {
+    for (auto &l : lines) out << l << '\n';
+    lines.clear();
+}
+
+// Drop trailing whitespace in place (no-op on an all-whitespace string).
+void strip_trail_ws(std::string &s) {
+    auto end = s.find_last_not_of(" \t\r\n");
+    if (end != std::string::npos) s.resize(end + 1);
+}
+// Drop leading whitespace in place (no-op on an all-whitespace string).
+void strip_lead_ws(std::string &s) {
+    auto start = s.find_first_not_of(" \t\r\n");
+    if (start != std::string::npos) s.erase(0, start);
+}
+
+// Join the buffered lines into one output line: the first line keeps its
+// indentation, subsequent lines are trimmed and separated by single spaces.
+// Clears the buffer afterwards.
+void output_1line(std::ostream &out, std::vector &lines) {
+    bool first = true;
+    for (auto &l : lines) {
+        if (first) {
+            strip_trail_ws(l);
+            first = false;
+        } else {
+            strip_trail_ws(l);
+            strip_lead_ws(l);
+            out << ' ';
+        }
+        out << l;
+    }
+    out << '\n';
+    lines.clear();
+}
+
+// Estimate how long the buffered lines would be once joined by output_1line.
+// NOTE(review): find_last_not_of returns an index, not a length, so each line
+// is undercounted by one; the joining space output_1line inserts per line
+// roughly compensates. This is only a heuristic checked against the 100-column
+// limit in reflow() below.
+size_t output_len(std::vector &lines) {
+    size_t rv = 0;
+    for (auto &l : lines) {
+        size_t len = l.find_last_not_of(" \t\r\n"), plen;
+        if (len == std::string::npos) len = l.size();
+        if (rv == 0 && (plen = l.find_first_not_of(" \t\r\n")) != std::string::npos)
+            len -= plen - 1;
+        rv += len;
+    }
+    return rv;
+}
+
+// Copy `in` to `out`, buffering from a line containing an unmatched '{' or '['
+// until the line containing the matching close character, then emitting the
+// group as one line -- unless the estimated joined length exceeds 100, in
+// which case the group is emitted unchanged. Nested groups are not tracked;
+// only the first close character ends the group.
+void reflow(std::istream &in, std::ostream &out) {
+    std::string line;
+    char looking = 0;  // close character we are waiting for, or 0 when idle
+    std::vector save;  // lines buffered for the current group
+    const auto npos = std::string::npos;
+    while (getline(in, line)) {
+        if (line.find('{') != npos && line.find('}') == npos) {
+            output_normal(out, save);
+            looking = '}';
+            save.push_back(line);
+        } else if (line.find('[') != npos && line.find(']') == npos) {
+            output_normal(out, save);
+            looking = ']';
+            save.push_back(line);
+        } else if (looking) {
+            save.push_back(line);
+            if (line.find(looking) != std::string::npos) {
+                output_1line(out, save);
+                looking = 0;
+            } else if (output_len(save) > 100) {
+                // Group already too long to merge -- give up and emit as-is.
+                output_normal(out, save);
+                looking = 0;
+            }
+        } else {
+            out << line << '\n';
+        }
+    }
+    output_normal(out, save);  // flush any unterminated group at EOF
+    out << std::flush;
+}
+
+// Usage: reflow [file] -- reads the named file, or stdin when no argument is
+// given, and writes the reflowed text to stdout.
+int main(int ac, char **av) {
+    if (ac == 2) {
+        std::ifstream in(av[1]);
+        if (in) {
+            reflow(in, std::cout);
+        } else {
+            std::cerr << "Can't open " << av[1] << std::endl;
+            return 1;
+        }
+    } else if (ac == 1) {
+        reflow(std::cin, std::cout);
+    } else {
+        std::cerr << "usage: " << av[0] << " [file]" << std::endl;
+        return 1;
+    }
+    return 0;
+}
diff --git a/backends/tofino/bf-asm/register_reference.h b/backends/tofino/bf-asm/register_reference.h
new file mode 100644
index 00000000000..12536d2dbbd
--- /dev/null
+++ b/backends/tofino/bf-asm/register_reference.h
@@ -0,0 +1,111 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_REGISTER_REFERENCE_H_
+#define BACKENDS_TOFINO_BF_ASM_REGISTER_REFERENCE_H_
+
+#include
+#include
+
+#include "lib/log.h"
+
+/* used by `dump_unread` methods to hold a concatenation of string literals for printing.
+ * Allocated on the stack, the `pfx` chain prints the calling context */
+// Singly-linked list node of string-literal path components; the stream
+// operator below prints the chain outermost-first, '.'-separated.
+struct prefix {
+    const prefix *pfx;
+    const char *str;  // should always be a string literal
+    prefix(const prefix *p, const char *s) : pfx(p), str(s) {}
+};
+
+inline std::ostream &operator<<(std::ostream &out, const prefix *p) {
+    if (p) {
+        if (p->pfx) out << p->pfx << '.';
+        out << p->str;
+    }
+    return out;
+}
+
+/* Class to link register trees together into a larger dag that will expand into a tree
+ * when dumped as binary (so trees that appear in multiple places will be duplicated)
+ * 'name' is the json file name to use when dumping as cfg.json, and the name for logging
+ * 'tree' is the subtree to dump as binary at the appropriate offset
+ */
+// NOTE(review): the template parameter lists in this file appear stripped in
+// this patch text (presumably the REG tree type) -- confirm against the
+// original source.
+template
+class register_reference {
+    REG *tree = nullptr;   // non-owning pointer to the referenced subtree
+    std::string name;      // cfg.json file name / logging name for the subtree
+
+ public:
+    // Bookkeeping flags, mutable so they can be tracked through const access:
+    //   read      -- set by operator-> when the tree is dereferenced
+    //   write     -- set by set() when a reference is installed
+    //   disabled_ -- set by disable(); further writes are logged as errors
+    mutable bool read = false, write = false, disabled_ = false;
+    register_reference() {}
+    register_reference(const register_reference &) = default;
+    register_reference(register_reference &&) = default;
+    register_reference &operator=(const register_reference &) & = default;
+    register_reference &operator=(register_reference &&) & = default;
+    ~register_reference() {}
+
+    // Install a reference: record the name and subtree, log the assignment,
+    // and flag overwrites of an already-written or disabled reference.
+    register_reference &set(const char *a, REG *r) {
+        if (disabled_) LOG1("ERROR: Writing disabled register value in " << this);
+        if (write) LOG1("WARNING: Overwriting \"" << name << "\" with \"" << a << "\" in " << this);
+        name = a;
+        tree = r;
+        log();
+        write = true;
+        return *this;
+    }
+    const char *c_str() const { return name.c_str(); }
+    // Dereference the referenced subtree, marking this reference as read.
+    REG *operator->() const {
+        read = true;
+        return tree;
+    }
+    explicit operator bool() const { return tree != nullptr; }
+    bool modified() const { return write; }
+    void set_modified(bool v = true) { write = v; }
+    void rewrite() { write = false; }
+    // friend std::ostream &operator<<(std::ostream &out, const register_reference &u);
+    void enable() { disabled_ = false; }
+    bool disabled() const { return disabled_; }
+    // The disable_if_* predicates never disable a reference node -- the
+    // linked subtree is always dumped.
+    bool disable_if_unmodified() { return false; }
+    bool disable_if_zero() { return false; }
+    bool disable_if_reset_value() { return false; }
+    // Disable only if never named/written; refuses (and logs) otherwise.
+    bool disable() {
+        if (!name.empty()) {
+            LOG1("ERROR: Disabling modified register in " << this);
+            return false;
+        }
+        tree = nullptr;
+        disabled_ = true;
+        return true;
+    }
+    void log() const { LOG1(this << " = \"" << name << "\""); }
+};
+
+// Printing a pointer logs the register's name via print_regname (declared
+// elsewhere), using [u, u+1) as the address range of the field.
+template
+inline std::ostream &operator<<(std::ostream &out, const register_reference *u) {
+    print_regname(out, u, u + 1);
+    return out;
+}
+// Printing a value emits the quoted name, or 0 when no name was ever set.
+template
+inline std::ostream &operator<<(std::ostream &out, const register_reference &u) {
+    if (!*u.c_str())
+        out << 0;
+    else
+        out << '"' << u.c_str() << '"';
+    return out;
+}
+
+#endif /* BACKENDS_TOFINO_BF_ASM_REGISTER_REFERENCE_H_ */
diff --git a/backends/tofino/bf-asm/rvalue_reference_wrapper.h b/backends/tofino/bf-asm/rvalue_reference_wrapper.h
new file mode 100644
index 00000000000..a86e4e946e3
--- /dev/null
+++ b/backends/tofino/bf-asm/rvalue_reference_wrapper.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_RVALUE_REFERENCE_WRAPPER_H_
+#define BACKENDS_TOFINO_BF_ASM_RVALUE_REFERENCE_WRAPPER_H_
+
+// Analogue of std::reference_wrapper for rvalue references: holds a pointer
+// to a T whose value may later be moved out via get().
+// NOTE(review): the template parameter lists appear stripped in this patch
+// text (a class parameter T and a converting-constructor parameter U) --
+// confirm against the original source.
+// NOTE(review): the wrapper stores only a raw pointer to the bound temporary;
+// it dangles if it outlives the full expression that created the rvalue --
+// callers must consume get() before then.
+template
+class rvalue_reference_wrapper {
+    T *ref;  // non-owning pointer to the wrapped rvalue
+
+ public:
+    typedef T type;
+    rvalue_reference_wrapper(T &&r) : ref(&r) {}  // NOLINT(runtime/explicit)
+    // Converting constructor from any reference-compatible U.
+    template
+    rvalue_reference_wrapper(U &&r) : ref(&r) {}  // NOLINT(runtime/explicit)
+    // Release the wrapped value as an rvalue; the referent is left moved-from.
+    T &&get() { return std::move(*ref); }
+};
+
+#endif /* BACKENDS_TOFINO_BF_ASM_RVALUE_REFERENCE_WRAPPER_H_ */
diff --git a/backends/tofino/bf-asm/salu_inst.cpp b/backends/tofino/bf-asm/salu_inst.cpp
new file mode 100644
index 00000000000..a4e370fb2f8
--- /dev/null
+++ b/backends/tofino/bf-asm/salu_inst.cpp
@@ -0,0 +1,1056 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "instruction.h" +#include "lib/hex.h" +#include "phv.h" + +namespace StatefulAlu { + +struct operand : public IHasDbPrint { + struct Base : public IHasDbPrint { + int lineno; + explicit Base(int line) : lineno(line) {} + Base(const Base &a) : lineno(a.lineno) {} + virtual ~Base() {} + virtual Base *clone() const = 0; + virtual void dbprint(std::ostream &) const = 0; + virtual bool equiv(const Base *) const = 0; + virtual const char *kind() const = 0; + virtual Base *lookup(Base *&) { return this; } + virtual bool phvRead(std::function) { return false; } + virtual void pass1(StatefulTable *) {} + } *op; + struct Const : public Base { + int64_t value; + Const *clone() const override { return new Const(*this); } + Const(int line, int64_t v) : Base(line), value(v) {} + void dbprint(std::ostream &out) const override { out << value; } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return value == a->value; + } else { + return false; + } + } + const char *kind() const override { return "constant"; } + }; + // Operand representing a constant stored in the register file + struct Regfile : public Base { + int index = -1; + Regfile *clone() const override { return new Regfile(*this); } + Regfile(int line, int index) : Base(line), index(index) {} + Regfile(int line, const value_t &n) : Base(line) { + if (PCHECKTYPE2M(n.vec.size == 2, n[1], tINT, tBIGINT, "SALU regfile row reference")) + index = get_int64(n[1], sizeof(index) / 8, "regfile row index out of bounds"); + } + void dbprint(std::ostream &out) const override { out << index; } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return index == a->index; + } else { + return false; + } + } + const char *kind() const override { return 
"register file constant"; } + }; + struct Phv : public Base { + virtual Phv *clone() const = 0; + explicit Phv(int lineno) : Base(lineno) {} + virtual int phv_index(StatefulTable *tbl) = 0; + }; + struct PhvReg : public Phv { + ::Phv::Ref reg; + PhvReg *clone() const override { return new PhvReg(*this); } + PhvReg(gress_t gress, int stage, const value_t &v) : Phv(v.lineno), reg(gress, stage, v) {} + void dbprint(std::ostream &out) const override { out << reg; } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return reg == a->reg; + } else { + return false; + } + } + const char *kind() const override { return "phv_reg"; } + void pass1(StatefulTable *tbl) override { + if (!reg.check()) return; + int size = tbl->format->begin()->second.size / 8; + if (tbl->input_xbar.empty()) { + error(lineno, "No input xbar for salu instruction operand for phv"); + return; + } + BUG_CHECK(tbl->input_xbar.size() == 1, "%s does not have one input xbar", tbl->name()); + int byte = tbl->find_on_ixbar(*reg, tbl->input_xbar[0]->match_group()); + int base = options.target == TOFINO ? 8 : 0; + if (byte < 0) + error(lineno, "Can't find %s on the input xbar", reg.name()); + else if (byte != base && byte != base + size) + error(lineno, "%s must be at %d or %d on ixbar to be used in stateful table %s", + reg.desc().c_str(), base * 8, (base + size) * 8, tbl->name()); + else if (int(reg->size()) > size * 8) + error(lineno, "%s is too big for stateful table %s", reg.desc().c_str(), + tbl->name()); + else + tbl->phv_byte_mask |= ((1U << (reg->size() + 7) / 8U) - 1) << (byte - base); + } + int phv_index(StatefulTable *tbl) override { + int base = options.target == TOFINO ? 
8 : 0; + return tbl->find_on_ixbar(*reg, tbl->input_xbar[0]->match_group()) > base; + } + bool phvRead(std::function fn) override { + fn(*reg); + return true; + } + }; + // Operand which directly accesses phv(hi/lo) from Input Xbar + struct PhvRaw : public Phv { + int pi = -1; + unsigned mask = ~0U; + PhvRaw *clone() const override { return new PhvRaw(*this); } + PhvRaw(gress_t gress, const value_t &v) : Phv(v.lineno) { + if (v == "phv_lo") + pi = 0; + else if (v == "phv_hi") + pi = 1; + else + BUG(); + if (v.type == tCMD && PCHECKTYPE(v.vec.size == 2, v[1], tRANGE)) { + if ((v[1].range.lo & 7) || ((v[1].range.hi + 1) & 7)) + error(lineno, "only byte slices allowed on %s", v[0].s); + mask = (1U << (v[1].range.hi + 1) / 8U) - (1U << (v[1].range.lo / 8U)); + } + } + void dbprint(std::ostream &out) const override { out << (pi ? "phv_hi" : "phv_lo"); } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return pi == a->pi; + } else { + return false; + } + } + const char *kind() const override { return "phv_ixb"; } + void pass1(StatefulTable *tbl) override { + int size = tbl->format->begin()->second.size / 8U; + if (mask == ~0U) + mask = (1U << size) - 1; + else if (mask & ~((1U << size) - 1)) + error(lineno, "slice out of range for %d byte value", size); + tbl->phv_byte_mask |= mask << (size * pi); + } + int phv_index(StatefulTable *tbl) override { return pi; } + bool phvRead(std::function) override { return true; } + }; + struct Memory : public Base { + Table *tbl; + Table::Format::Field *field; + Memory *clone() const override { return new Memory(*this); } + Memory(int line, Table *t, Table::Format::Field *f) : Base(line), tbl(t), field(f) {} + void dbprint(std::ostream &out) const override { out << tbl->format->find_field(field); } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return field == a->field; + } else { + return false; + } + } + const char *kind() const override { return "memory"; } + }; 
+ struct MathFn; + bool neg = false; + uint64_t mask = uint32_t(-1); + operand() : op(0) {} + operand(const operand &a) : op(a.op ? a.op->clone() : 0) {} + operand(operand &&a) : op(a.op) { a.op = 0; } + operand &operator=(const operand &a) { + if (&a != this) { + delete op; + op = a.op ? a.op->clone() : 0; + } + return *this; + } + operand &operator=(operand &&a) { + if (&a != this) { + delete op; + op = a.op; + a.op = 0; + } + return *this; + } + ~operand() { delete op; } + operand(Table *tbl, const Table::Actions::Action *act, const value_t &v, bool can_mask = false); + bool valid() const { return op != 0; } + explicit operator bool() const { return op != 0; } + bool operator==(operand &a) { + return op == a.op || (op && a.op && op->lookup(op)->equiv(a.op->lookup(a.op))); + } + bool phvRead(std::function fn) { + return op ? op->lookup(op)->phvRead(fn) : false; + } + void dbprint(std::ostream &out) const { + if (neg) out << '-'; + if (op) + op->dbprint(out); + else + out << "(null)"; + } + Base *operator->() { return op->lookup(op); } + template + T *to() { + return dynamic_cast(op); + } +}; + +struct operand::MathFn : public Base { + operand of; + MathFn *clone() const override { return new MathFn(*this); } + MathFn(int line, operand of) : Base(line), of(of) {} + void dbprint(std::ostream &out) const override { + out << "math(" << of << ")"; + ; + } + bool equiv(const Base *a_) const override { + if (auto *a = dynamic_cast(a_)) { + return of.op == a->of.op; + } else { + return false; + } + } + const char *kind() const override { return "math fn"; } + bool phvRead(std::function fn) { return of->phvRead(fn); } + void pass1(StatefulTable *tbl) override { of->pass1(tbl); } +}; + +operand::operand(Table *tbl, const Table::Actions::Action *act, const value_t &v_, bool can_mask) + : op(nullptr) { + const value_t *v = &v_; + if (options.target == TOFINO) can_mask = false; + if (can_mask && v->type == tCMD && *v == "&" && v->vec.size == 3) { + if (v->vec[2].type == tINT 
|| v->vec[2].type == tBIGINT) { + mask = get_int64(v->vec[2], 64, "mask too large"); + v = &v->vec[1]; + } else if (v->vec[1].type == tINT || v->vec[1].type == tBIGINT) { + mask = get_int64(v->vec[1], 64, "mask too large"); + v = &v->vec[2]; + } else { + error(v->lineno, "mask must be a constant"); + } + } + if (v->type == tCMD && *v == "-") { + neg = true; + v = &v->vec[1]; + } + if (v->type == tINT || v->type == tBIGINT) { + auto i = get_int64(*v, 64, "Integer too large"); + op = new Const(v->lineno, i); + return; + } + if (v->type == tCMD && *v == "register_param") { + op = new Regfile(v->lineno, *v); + return; + } + if (v->type == tSTR) { + if (auto f = tbl->format->field(v->s)) { + op = new Memory(v->lineno, tbl, f); + return; + } + } + if (v->type == tCMD) { + BUG_CHECK(v->vec.size > 0 && v->vec[0].type == tSTR); + if (auto f = tbl->format->field(v->vec[0].s)) { + if (v->vec.size > 1 && CHECKTYPE(v->vec[1], tRANGE) && v->vec[1].range.lo != 0) + error(v->vec[1].lineno, "Can't slice memory field %s in stateful action", + v->vec[0].s); + op = new Memory(v->lineno, tbl, f); + return; + } + } + if ((v->type == tCMD) && (v->vec[0] == "math_table")) { + // operand *opP = new operand(tbl, act, v->vec[1]); + op = new MathFn(v->lineno, operand(tbl, act, v->vec[1])); + return; + } + if (*v == "phv_lo" || *v == "phv_hi") { + op = new PhvRaw(tbl->gress, *v); + return; + } + if (::Phv::Ref(tbl->gress, tbl->stage->stageno, *v).check(false)) + op = new PhvReg(tbl->gress, tbl->stage->stageno, *v); +} + +enum salu_slot_use { + CMP0, + CMP1, + CMP2, + CMP3, + ALU2LO, + ALU1LO, + ALU2HI, + ALU1HI, + ALUOUT0, + ALUOUT1, + ALUOUT2, + ALUOUT3, + MINMAX, + // aliases + CMPLO = CMP0, + CMPHI = CMP1, + ALUOUT = ALUOUT0, +}; + +// Abstract interface class for SALU Instructions +// SALU Instructions - AluOP, BitOP, CmpOP, OutOP +struct SaluInstruction : public Instruction { + explicit SaluInstruction(int lineno) : Instruction(lineno) {} + // Stateful ALU's dont access PHV's directly + 
static int decode_predicate(const value_t &exp); +}; + +int SaluInstruction::decode_predicate(const value_t &exp) { + if (exp == "cmplo") return Target::STATEFUL_PRED_MASK() & STATEFUL_PREDICATION_ENCODE_CMPLO; + if (exp == "cmphi") return Target::STATEFUL_PRED_MASK() & STATEFUL_PREDICATION_ENCODE_CMPHI; + if (exp == "cmp0") return Target::STATEFUL_PRED_MASK() & STATEFUL_PREDICATION_ENCODE_CMP0; + if (Target::STATEFUL_CMP_UNITS() > 1 && exp == "cmp1") + return Target::STATEFUL_PRED_MASK() & STATEFUL_PREDICATION_ENCODE_CMP1; + if (Target::STATEFUL_CMP_UNITS() > 2 && exp == "cmp2") + return Target::STATEFUL_PRED_MASK() & STATEFUL_PREDICATION_ENCODE_CMP2; + if (Target::STATEFUL_CMP_UNITS() > 3 && exp == "cmp3") + return Target::STATEFUL_PRED_MASK() & STATEFUL_PREDICATION_ENCODE_CMP3; + if (exp == "!") return Target::STATEFUL_PRED_MASK() ^ decode_predicate(exp[1]); + if (exp == "&") { + auto rv = decode_predicate(exp[1]); + for (int i = 2; i < exp.vec.size; ++i) rv &= decode_predicate(exp[i]); + return rv; + } + if (exp == "|") { + auto rv = decode_predicate(exp[1]); + for (int i = 2; i < exp.vec.size; ++i) rv |= decode_predicate(exp[i]); + return rv; + } + if (exp == "^") { + auto rv = decode_predicate(exp[1]); + for (int i = 2; i < exp.vec.size; ++i) rv ^= decode_predicate(exp[i]); + return rv; + } + if (exp.type == tINT && exp.i >= 0 && exp.i <= Target::STATEFUL_PRED_MASK()) return exp.i; + error(exp.lineno, "Unexpected expression %s in predicate", value_desc(&exp)); + return -1; +} + +struct AluOP : public SaluInstruction { + const struct Decode : public Instruction::Decode { + std::string name; + unsigned opcode; + enum operands_t { NONE, A, B, AandB } operands = AandB; + const Decode *swap_args; + Decode(const char *n, int opc, bool assoc = false, const char *alias_name = 0) + : Instruction::Decode(n, STATEFUL_ALU), + name(n), + opcode(opc), + swap_args(assoc ? 
this : 0) { + if (alias_name) alias(alias_name, STATEFUL_ALU); + } + Decode(const char *n, int opc, Decode *sw, const char *alias_name = 0, + operands_t use = AandB) + : Instruction::Decode(n, STATEFUL_ALU), + name(n), + opcode(opc), + operands(use), + swap_args(sw) { + if (sw && !sw->swap_args) sw->swap_args = this; + if (alias_name) alias(alias_name, STATEFUL_ALU); + } + Decode(const char *n, int opc, const char *alias_name) + : Instruction::Decode(n, STATEFUL_ALU), name(n), opcode(opc), swap_args(0) { + if (alias_name) alias(alias_name, STATEFUL_ALU); + } + Decode(const char *n, int opc, bool assoc, operands_t use) + : Instruction::Decode(n, STATEFUL_ALU), + name(n), + opcode(opc), + operands(use), + swap_args(assoc ? this : 0) {} + Decode(const char *n, int opc, const char *alias_name, operands_t use) + : Instruction::Decode(n, STATEFUL_ALU), + name(n), + opcode(opc), + operands(use), + swap_args(0) { + if (alias_name) alias(alias_name, STATEFUL_ALU); + } + Decode(const char *n, int opc, Decode *sw, operands_t use) + : Instruction::Decode(n, STATEFUL_ALU), + name(n), + opcode(opc), + operands(use), + swap_args(sw) { + if (sw && !sw->swap_args) sw->swap_args = this; + } + Decode(const char *n, target_t targ, int opc) + : Instruction::Decode(n, targ, STATEFUL_ALU), name(n), opcode(opc), swap_args(0) {} + + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + int predication_encode = STATEFUL_PREDICATION_ENCODE_UNCOND; + enum dest_t { LO, HI }; + dest_t dest = LO; + operand srca, srcb; + AluOP(const Decode *op, int l) : SaluInstruction(l), opc(op) {} + std::string name() override { return opc->name; }; + Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *tbl, Table::Actions::Action *) override {} + bool salu_alu() const override { return true; } + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { + return srca.phvRead(fn) | 
srcb.phvRead(fn); + } + void dbprint(std::ostream &out) const override { + out << "INSTR: " << opc->name << " pred=0x" << hex(predication_encode) << " " + << (dest ? "hi" : "lo") << ", " << srca << ", " << srcb; + } + template + void write_regs(REGS ®s, Table *tbl, Table::Actions::Action *act); + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +static AluOP::Decode opADD("add", 0x1c, true), opSUB("sub", 0x1e), opSADDU("saddu", 0x10, true), + opSADDS("sadds", 0x11, true), opSSUBU("ssubu", 0x12), opSSUBS("ssubs", 0x13), + opMINU("minu", 0x14, true), opMINS("mins", 0x15, true), opMAXU("maxu", 0x16, true), + opMAXS("maxs", 0x17, true), opNOP("nop", 0x18, true, AluOP::Decode::NONE), + opSUBR("subr", 0x1f, &opSUB), opSSUBRU("ssubru", 0x1a, &opSSUBU), + opSSUBRS("ssubrs", 0x1b, &opSSUBS), + + opSETZ("setz", 0x00, true, AluOP::Decode::NONE), opNOR("nor", 0x01, true), + opANDCA("andca", 0x02), opNOTA("nota", 0x03, "not", AluOP::Decode::A), + opANDCB("andcb", 0x04, &opANDCA), opNOTB("notb", 0x05, &opNOTA, AluOP::Decode::B), + opXOR("xor", 0x06, true), opNAND("nand", 0x07, true), opAND("and", 0x08, true), + opXNOR("xnor", 0x09, true), opB("alu_b", 0x0a, "b", AluOP::Decode::B), opORCA("orca", 0x0b), + opA("alu_a", 0x0c, &opB, "a", AluOP::Decode::A), opORCB("orcb", 0x0d, &opORCA), + opOR("or", 0x0e, true), opSETHI("sethi", 0x0f, true, AluOP::Decode::NONE); + +Instruction *AluOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + AluOP *rv = new AluOP(this, op[0].lineno); + auto operands = this->operands; + int idx = 1; + // Check optional predicate operand + if (idx < op.size) { + if (op[idx].type == tINT) { + // Predicate is an integer. no warning for odd values + rv->predication_encode = op[idx++].i; + } else if (op[idx].startsWith("cmp") || op[idx] == "!" 
|| op[idx] == "&" || + op[idx] == "|" || op[idx] == "^") { + // Predicate is an expression + rv->predication_encode = decode_predicate(op[idx++]); + if (rv->predication_encode == STATEFUL_PREDICATION_ENCODE_NOOP) + warning(op[idx - 1].lineno, "Instruction predicate is always false"); + else if (rv->predication_encode == STATEFUL_PREDICATION_ENCODE_UNCOND) + warning(op[idx - 1].lineno, "Instruction predicate is always true"); + } + } + if (idx < op.size && op[idx] == "lo") { + rv->dest = LO; + idx++; + } else if (idx < op.size && op[idx] == "hi") { + rv->dest = HI; + idx++; + } else if (idx == op.size && name == "nop") { + // allow nop without even a destination -- assume lo + rv->dest = LO; + } else { + error(rv->lineno, "invalid destination for %s instruction", op[0].s); + } + if (operands == NONE) { + if (idx < op.size) error(rv->lineno, "too many operands for %s instruction", op[0].s); + return rv; + } + if (idx < op.size && operands != B) rv->srca = operand(tbl, act, op[idx++]); + if (idx < op.size && operands != A) rv->srcb = operand(tbl, act, op[idx++]); + if (swap_args && (rv->srca.to() || rv->srca.to() || + (rv->srcb.to() && + (rv->srca.to() || rv->srca.to())))) { + operands = (rv->opc = swap_args)->operands; + std::swap(rv->srca, rv->srcb); + } + if (idx < op.size) + error(rv->lineno, "too many operands for %s instruction", op[0].s); + else if ((!rv->srca && operands != B) || (!rv->srcb && operands != A)) + error(rv->lineno, "not enough operands for %s instruction", op[0].s); + if (auto mf = rv->srca.to()) { + error(rv->lineno, "Can't reference math table in %soperand of %s instruction", + operands != A ? "first " : "", op[0].s); + if (!mf->of.to() && !mf->of.to()) + error(rv->lineno, "Math table input must come from Phv or memory"); + } + if (rv->srca.to()) + error(rv->lineno, "Can't reference phv in %soperand of %s instruction", + operands != A ? 
"first " : "", op[0].s); + if (rv->srcb.to()) + error(rv->lineno, "Can't reference memory in %soperand of %s instruction", + operands != A ? "first " : "", op[0].s); + if (auto mf = rv->srcb.to()) { + rv->slot = ALU2LO; + if (rv->dest != LO) error(rv->lineno, "Can't reference math table in alu-hi"); + if (!mf->of.to() && !mf->of.to()) + error(rv->lineno, "Math table input must come from Phv or memory"); + } + if (rv->srca.neg) { + if (auto k = rv->srca.to()) + k->value = -k->value; + else + error(rv->lineno, "Can't negate operand of %s instruction", op[0].s); + } + if (rv->srcb.neg) { + if (auto k = rv->srcb.to()) + k->value = -k->value; + else + error(rv->lineno, "Can't negate operand of %s instruction", op[0].s); + } + return rv; +} + +bool AluOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) + return opc == a->opc && predication_encode == a->predication_encode && dest == a->dest && + srca == a->srca && srcb == a->srcb; + return false; +} + +Instruction *AluOP::pass1(Table *tbl_, Table::Actions::Action *act) { + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + if (slot < 0 && act->slot_use[slot = (dest ? ALU1HI : ALU1LO)]) slot = dest ? 
ALU2HI : ALU2LO; + auto k1 = srca.to(); + auto k2 = srcb.to(); + // Check cases when both constants would be stored in the register file on different rows + // Two constants that do not fit as immediate constants + if (k1 && k2 && !k1->equiv(k2)) + error(lineno, "can only have one constant in an SALU instruction"); + if (!k1) k1 = k2; + if (k1 && (k1->value < Target::STATEFUL_ALU_CONST_MIN() || + k1->value > Target::STATEFUL_ALU_CONST_MAX())) { + if (k1->value >= (INT64_C(1) << tbl->alu_size()) || + k1->value < (INT64_C(~0u) << (tbl->alu_size() - 1))) { + error(lineno, + "value %" PRIi64 + " of the constant operand" + " out of range for %d bit stateful ALU", + k1->value, tbl->alu_size()); + } else if (k1->value >= (INT64_C(1) << (Target::STATEFUL_REGFILE_CONST_WIDTH() - 1))) { + // constants have a limited width, and are always signed, so need to make + // sure they wrap properly + k1->value -= INT64_C(1) << Target::STATEFUL_REGFILE_CONST_WIDTH(); + if (k2 && k2 != k1) k2->value = k1->value; + } + } + auto r1 = srca.to(); + auto r2 = srcb.to(); + if (r1 && r2 && !r1->equiv(r2)) + error(lineno, "can only have one register file reference in an SALU instruction"); + if (!r1) r1 = r2; + if (r1) { + int64_t v1 = tbl->get_const_val(r1->index); + if (v1 >= (INT64_C(1) << tbl->alu_size()) || v1 < (INT64_C(~0u) << (tbl->alu_size() - 1))) { + error(lineno, + "initial value %" PRIi64 + " of the register file operand" + " out of range for %d bit stateful ALU", + v1, tbl->alu_size()); + } + } + if (k1 && r1) + error(lineno, + "can have either a constant or a register file reference" + " in an SALU instruction"); + if (srca) srca->pass1(tbl); + if (srcb) srcb->pass1(tbl); + return this; +} + +Instruction *genNoop(StatefulTable *tbl, Table::Actions::Action *act) { + VECTOR(value_t) args = EMPTY_VECTOR_INIT; + BUG_CHECK(tbl->format->begin() != tbl->format->end(), "No tbl->format!"); + args.add("or").add("lo").add(0).add(tbl->format->begin()->first.c_str()); + auto *rv = 
Instruction::decode(tbl, act, args); + VECTOR_fini(args); + return rv; +} + +struct BitOP : public SaluInstruction { + const struct Decode : public Instruction::Decode { + std::string name; + unsigned opcode; + Decode(const char *n, unsigned opc) + : Instruction::Decode(n, STATEFUL_ALU), name(n), opcode(opc) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + int predication_encode = STATEFUL_PREDICATION_ENCODE_UNCOND; + BitOP(const Decode *op, int lineno) : SaluInstruction(lineno), opc(op) {} + std::string name() override { return opc->name; }; + Instruction *pass1(Table *, Table::Actions::Action *) override { + slot = ALU1LO; + return this; + } + void pass2(Table *, Table::Actions::Action *) override {} + bool salu_alu() const override { return true; } + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { return false; } + void dbprint(std::ostream &out) const override { out << "INSTR: " << opc->name; } + template + void write_regs(REGS ®s, Table *tbl, Table::Actions::Action *act); + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +static BitOP::Decode opSET_BIT("set_bit", 0x0), opSET_BITC("set_bitc", 0x1), + opCLR_BIT("clr_bit", 0x2), opCLR_BITC("clr_bitc", 0x3), opREAD_BIT("read_bit", 0x4), + opREAD_BITC("read_bitc", 0x5), opSET_BIT_AT("set_bit_at", 0x6), + opSET_BITC_AT("set_bitc_at", 0x7), opCLR_BIT_AT("clr_bit_at", 0x8), + opCLR_BITC_AT("clr_bitc_at", 0x9); + +Instruction *BitOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + BitOP *rv = new BitOP(this, op[0].lineno); + if (op.size > 1) error(rv->lineno, "too many operands for %s instruction", op[0].s); + return rv; +} + +bool BitOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) return opc == a->opc; + return false; +} + +struct CmpOP : public SaluInstruction { + const struct Decode : public 
Instruction::Decode { + std::string name; + unsigned opcode; + Decode(const char *n, unsigned opc, bool type) + : Instruction::Decode(n, STATEFUL_ALU, type), name(n), opcode(opc) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + int type = 0; + operand::Memory *srca = 0; + uint32_t maska = 0xffffffffU; + operand::Phv *srcb = 0; + uint32_t maskb = 0xffffffffU; + operand::Base *srcc = 0; // operand::Const or operand::Regfile + bool srca_neg = false, srcb_neg = false; + bool learn = false, learn_not = false; + CmpOP(const Decode *op, int lineno) : SaluInstruction(lineno), opc(op) {} + std::string name() override { return opc->name; }; + Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *tbl, Table::Actions::Action *) override {} + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { + bool rv = false; + if (srca) rv |= srca->phvRead(fn); + if (srcb) rv |= srcb->phvRead(fn); + if (srcc) rv |= srcc->phvRead(fn); + return rv; + } + void dbprint(std::ostream &out) const override { + out << "INSTR: " << opc->name << " cmp" << slot; + if (srca) { + out << ", " << (srca_neg ? "-" : "") << *srca; + if (maska != 0xffffffffU) out << " & 0x" << hex(maska); + } + if (srcb) { + out << ", " << (srcb_neg ? 
"-" : "") << *srcb; + if (maskb != 0xffffffffU) out << " & 0x" << hex(maskb); + } + if (srcc) out << ", " << *srcc; + if (learn) out << ", learn"; + if (learn_not) out << ", learn_not"; + } + template + void write_regs(REGS ®s, Table *tbl, Table::Actions::Action *act); + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +static CmpOP::Decode opEQU("equ", 0, false), opNEQ("neq", 1, false), opGRT("grt", 0, true), + opLEQ("leq", 1, true), opGEQ("geq", 2, true), opLSS("lss", 3, true); + +Instruction *CmpOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + auto rv = new CmpOP(this, op[0].lineno); + if (auto *p = strchr(op[0].s, '.')) { + if (type_suffix && !strcmp(p, ".s")) + rv->type = 1; + else if (type_suffix && !strcmp(p, ".u")) + rv->type = 2; + else if (type_suffix && !strcmp(p, ".uus")) + rv->type = 3; + else + error(rv->lineno, "Invalid type %s for %s instruction", p + 1, name.c_str()); + } else if (type_suffix) { + error(rv->lineno, "Missing type for %s instruction", name.c_str()); + } + if (op.size < 1 || op[1].type != tSTR) { + error(rv->lineno, "invalid destination for %s instruction", op[0].s); + return rv; + } + unsigned unit; + int len; + if (op[1] == "lo") { + rv->slot = CMPLO; + } else if (op[1] == "hi") { + rv->slot = CMPHI; + } else if ((sscanf(op[1].s, "p%u%n", &unit, &len) >= 1 || + sscanf(op[1].s, "cmp%u%n", &unit, &len) >= 1) && + unit < Target::STATEFUL_CMP_UNITS() && op[1].s[len] == 0) { + rv->slot = CMP0 + unit; + } else { + error(rv->lineno, "invalid destination for %s instruction", op[0].s); + } + for (int idx = 2; idx < op.size; ++idx) { + if (!rv->learn) { + if (op[idx] == "learn") { + rv->learn = true; + continue; + } + if (op[idx] == "!" 
&& op[idx].type == tCMD && op[idx].vec.size == 2 && + op[idx][1] == "learn") { + rv->learn = rv->learn_not = true; + continue; + } + } + operand src(tbl, act, op[idx], true); + if (!rv->srca && (rv->srca = src.to())) { + rv->srca_neg = src.neg; + rv->maska = src.mask; + src.op = nullptr; + } else if (!rv->srcb && (rv->srcb = src.to())) { + rv->srcb_neg = src.neg; + rv->maskb = src.mask; + src.op = nullptr; + } else if (!rv->srcc && (rv->srcc = src.to())) { + auto *srcc = src.to(); + if (src.neg) srcc->value = -srcc->value; + if (src.mask != ~0U) srcc->value &= src.mask; + src.op = nullptr; + } else if (!rv->srcc && (rv->srcc = src.to())) { + if (src.neg || src.mask != ~0U) + error(src->lineno, "Register file operand cannot be negated or masked"); + src.op = nullptr; + } else if (src) { + error(src->lineno, "Can't have more than one %s operand to an SALU compare", + src->kind()); + } + } + return rv; +} + +bool CmpOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) + return opc == a->opc && slot == a->slot && srca == a->srca && maska == a->maska && + srcb == a->srcb && maskb == a->maskb && srcc == a->srcc && learn == a->learn && + learn_not == a->learn_not; + return false; +} + +Instruction *CmpOP::pass1(Table *tbl_, Table::Actions::Action *act) { + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + if (srca) srca->pass1(tbl); + if (srcb) srcb->pass1(tbl); + if (srcc) srcc->pass1(tbl); + return this; +} + +struct TMatchOP : public SaluInstruction { + const struct Decode : public Instruction::Decode { + std::string name; + Decode(const char *n, target_t target) + : Instruction::Decode(n, target, STATEFUL_ALU), name(n) {} + Decode(const char *n, std::set target) + : Instruction::Decode(n, target, STATEFUL_ALU), name(n) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + } *opc; + operand::Memory *srca = 0; + uint64_t mask = 0; + operand::Phv *srcb = 0; + bool learn = false, learn_not = 
false; + TMatchOP(const Decode *op, int lineno) : SaluInstruction(lineno), opc(op) {} + std::string name() override { return opc->name; }; + Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *tbl, Table::Actions::Action *) override {} + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { + return srcb ? srcb->phvRead(fn) : false; + } + void dbprint(std::ostream &out) const override { + out << "INSTR: " << opc->name << " cmp" << slot; + if (srca) out << ", " << *srca; + if (mask) out << ", 0x" << hex(mask); + if (srcb) out << ", " << *srcb; + if (learn) out << ", learn"; + if (learn_not) out << ", learn_not"; + } + template + void write_regs(REGS ®s, Table *tbl, Table::Actions::Action *act); + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +static TMatchOP::Decode opTMatch("tmatch", { + JBAY, + }); + +Instruction *TMatchOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + auto rv = new TMatchOP(this, op[0].lineno); + if (op.size < 1 || op[1].type != tSTR) { + error(rv->lineno, "invalid destination for %s instruction", op[0].s); + return rv; + } + unsigned unit; + int len; + if ((sscanf(op[1].s, "p%u%n", &unit, &len) >= 1 || + sscanf(op[1].s, "cmp%u%n", &unit, &len) >= 1) && + unit < Target::STATEFUL_TMATCH_UNITS() && op[1].s[len] == 0) { + rv->slot = CMP0 + unit; + } else { + error(rv->lineno, "invalid destination for %s instruction", op[0].s); + } + for (int idx = 2; idx < op.size; ++idx) { + if (!rv->learn) { + if (op[idx] == "learn") { + rv->learn = true; + continue; + } + if (op[idx] == "!" 
&& op[idx].type == tCMD && op[idx].vec.size == 2 && + op[idx][1] == "learn") { + rv->learn = rv->learn_not = true; + continue; + } + } + if (op[idx].type == tINT || op[idx].type == tBIGINT) { + if (rv->mask) + error(op[idx].lineno, "Can't have more than one mask operand to an SALU tmatch"); + rv->mask = get_int64(op[idx], 64, "Integer too large"); + } else if (op[idx].type == tSTR) { + if (auto f = tbl->format->field(op[idx].s)) { + if (rv->srca) { + error(op[idx].lineno, + "Can't have more than one memory operand to an " + "SALU tmatch"); + delete rv->srca; + } + rv->srca = new operand::Memory(op[idx].lineno, tbl, f); + } else if (rv->srcb) { + error(op[idx].lineno, "Can't have more than one phv operand to an SALU tmatch"); + } else if (op[idx] == "phv_lo" || op[idx] == "phv_hi") { + rv->srcb = new operand::PhvRaw(tbl->gress, op[idx]); + } else { + rv->srcb = new operand::PhvReg(tbl->gress, tbl->stage->stageno, op[idx]); + } + } + } + if (!rv->srca || !rv->srcb || !rv->mask) + error(rv->lineno, "Not enough operands to SALU tmatch"); + return rv; +} + +bool TMatchOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) + return opc == a->opc && slot == a->slot && srca == a->srca && srcb == a->srcb && + mask == a->mask && learn == a->learn && learn_not == a->learn_not; + return false; +} + +Instruction *TMatchOP::pass1(Table *tbl_, Table::Actions::Action *act) { + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + if (srca) srca->pass1(tbl); + if (srcb) srcb->pass1(tbl); + if (tbl->tmatch_use[slot].op) { + if (mask != tbl->tmatch_use[slot].op->mask) { + error(lineno, "Incompatable tmatch masks in stateful actions %s and %s", + tbl->tmatch_use[slot].act->name.c_str(), act->name.c_str()); + error(tbl->tmatch_use[slot].op->lineno, "previous use"); + } + } else { + tbl->tmatch_use[slot].act = act; + tbl->tmatch_use[slot].op = this; + } + return this; +} + +// Output ALU instruction +struct OutOP : public SaluInstruction { + struct Decode : public 
Instruction::Decode { + explicit Decode(const char *n) : Instruction::Decode(n, STATEFUL_ALU) {} + Instruction *decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const override; + }; + int predication_encode = STATEFUL_PREDICATION_ENCODE_UNCOND; + operand src; + int output_mux = -1; + bool lmatch = false; + int lmatch_pred = 0; + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void decode_output_mux, + (register_type, Table *tbl, value_t &op)) + void decode_output_mux(Table *tbl, value_t &op) { + SWITCH_FOREACH_TARGET(options.target, decode_output_mux(TARGET(), tbl, op);); + } + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, int decode_output_option, (register_type, value_t &op)) + int decode_output_option(value_t &op) { + SWITCH_FOREACH_TARGET(options.target, return decode_output_option(TARGET(), op);); + } + OutOP(const Decode *op, int lineno) : SaluInstruction(lineno) {} + std::string name() override { return "output"; }; + Instruction *pass1(Table *tbl, Table::Actions::Action *) override; + void pass2(Table *tbl, Table::Actions::Action *) override {} + bool salu_output() const override { return true; } + bool equiv(Instruction *a_) override; + bool phvRead(std::function fn) override { + return src ? 
src->phvRead(fn) : false; + } + void dbprint(std::ostream &out) const override { + out << "INSTR: output " << "pred=0x" << hex(predication_encode) << " word" + << (slot - ALUOUT0) << " mux=" << output_mux; + } + template + void write_regs(REGS ®s, Table *tbl, Table::Actions::Action *act); + FOR_ALL_REGISTER_SETS(DECLARE_FORWARD_VIRTUAL_INSTRUCTION_WRITE_REGS) +}; + +static OutOP::Decode opOUTPUT("output"); + +bool OutOP::equiv(Instruction *a_) { + if (auto *a = dynamic_cast(a_)) + return predication_encode == a->predication_encode && slot == a->slot && + output_mux == a->output_mux; + return false; +} + +Instruction *OutOP::Decode::decode(Table *tbl, const Table::Actions::Action *act, + const VECTOR(value_t) & op) const { + OutOP *rv = new OutOP(this, op[0].lineno); + int idx = 1; + // Check optional predicate operand + if (idx < op.size) { + // Predicate is an integer + if (op[idx].type == tINT) { + rv->predication_encode = op[idx++].i; + // Predicate is an expression + } else if (op[idx].startsWith("cmp") || op[idx] == "!" 
|| op[idx] == "&" || + op[idx] == "|" || op[idx] == "^") { + rv->predication_encode = decode_predicate(op[idx++]); + if (rv->predication_encode == STATEFUL_PREDICATION_ENCODE_NOOP) + warning(op[idx - 1].lineno, "Instruction predicate is always false"); + else if (rv->predication_encode == STATEFUL_PREDICATION_ENCODE_UNCOND) + warning(op[idx - 1].lineno, "Instruction predicate is always true"); + } + } + rv->slot = ALUOUT; + // Check for destination + if (idx < op.size && op[idx].startsWith("word")) { + int unit = -1; + char *end; + if (op[idx].type == tSTR) { + if (isdigit(op[idx].s[4])) { + unit = strtol(op[idx].s + 4, &end, 10); + if (*end) unit = -1; + } + } else if (op[idx].vec.size == 2 && op[idx][1].type == tINT) { + unit = op[idx][1].i; + } + if (unit >= Target::STATEFUL_OUTPUT_UNITS()) + error(op[idx].lineno, "Invalid output dest %s", value_desc(op[idx])); + else + rv->slot = unit + ALUOUT0; + idx++; + } + // Check mux operand + if (idx < op.size) { + rv->src = operand(tbl, act, op[idx], false); + // DANGER -- decoding the output mux here (as part of input parsing) requires that + // the phv section be before the section we're currently parsing in the .bfa file. + // That's always the case with compiler output, but do we want to require it for + // hand-written code? Could reorg stuff to do this in pass1 instead. 
+ rv->decode_output_mux(tbl, op[idx]); + if (rv->output_mux < 0) + error(op[idx].lineno, "invalid operand '%s' for '%s' instruction", value_desc(op[idx]), + op[0].s); + idx++; + } else { + error(rv->lineno, "too few operands for %s instruction", op[0].s); + } + while (idx < op.size) { + if (rv->decode_output_option(op[idx]) < 0) break; + ++idx; + } + if (idx < op.size) error(rv->lineno, "too many operands for %s instruction", op[0].s); + + return rv; +} + +Instruction *OutOP::pass1(Table *tbl_, Table::Actions::Action *act) { + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + if (src) src->pass1(tbl); + if (output_mux == STATEFUL_PREDICATION_OUTPUT) { + if (act->pred_comb_sel >= 0 && act->pred_comb_sel != predication_encode) + error(lineno, "Only one output of predication allowed"); + act->pred_comb_sel = predication_encode; + } + if (lmatch) { + if (tbl->output_lmatch) { + auto *other = dynamic_cast(tbl->output_lmatch); + BUG_CHECK(other); + if (lmatch_pred != other->lmatch_pred) { + error(lineno, "Conflict lmatch output use in stateful %s", tbl->name()); + error(other->lineno, "conflicting use here"); + } + } + tbl->output_lmatch = this; + } + return this; +} + +#include "jbay/salu_inst.cpp" // NOLINT(build/include) +#include "tofino/salu_inst.cpp" // NOLINT(build/include) + +} // end namespace StatefulAlu + +bool StatefulTable::p4c_5192_workaround(const Actions::Action *act) const { + // when trying to output bits 96..127 + // of either memory or phv input in an SALU in 128-bit mode, the model asserts + // Not clear if this is a hardware limitation or a model bug. 
+ // RMT_ASSERTS on lines 547 and 565 of model/src/shared/mau-stateful-alu.cpp + // Workaround is to use 64x2 mode instead which is otherwise equivalent, except + // for possible problems if minmax is used + using namespace StatefulAlu; + if (format->log2size != 7 || is_dual_mode()) return false; // only apply in 128-bit mode + for (auto &inst : act->instr) { + if (auto *out = dynamic_cast(inst.get())) { + if (out->slot > ALUOUT1 && (out->output_mux == 1 || out->output_mux == 3)) { + return true; + } + } + } + return false; +} diff --git a/backends/tofino/bf-asm/sections.h b/backends/tofino/bf-asm/sections.h new file mode 100644 index 00000000000..ea44ce80f07 --- /dev/null +++ b/backends/tofino/bf-asm/sections.h @@ -0,0 +1,104 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_SECTIONS_H_ +#define BACKENDS_TOFINO_BF_ASM_SECTIONS_H_ + +#include + +#include + +#include "asm-types.h" +#include "backends/tofino/bf-asm/json.h" +#include "bfas.h" +#include "map.h" + +/// A Section represents a top level section in assembly +/// Current sections include: +/// version, phv, parser, deparser, stage, dynhash, primitives +class Section : virtual public Parsable, virtual public Contextable { + static std::map *sections; + std::string name; + bool isInput = false; + static Section *get(const char *name) { return ::get(sections, name); } + + protected: + explicit Section(const char *name_) : name(name_) { + if (!sections) sections = new std::map(); + if (get(name_)) { + fprintf(stderr, "Duplicate section handler for %s\n", name_); + exit(1); + } + (*sections)[name] = this; + } + virtual ~Section() { + sections->erase(name); + if (sections->empty()) { + delete sections; + sections = 0; + } + } + /// process the arguments on the same line as the heading + virtual void start(int lineno, VECTOR(value_t) args) {} + /// optionally process the data if not done during parsing + virtual void process() {} + + public: + static int start_section(int lineno, char *name, VECTOR(value_t) args) { + if (Section *sec = get(name)) { + int prev_error_count = error_count; + sec->isInput = true; + sec->start(lineno, args); + return error_count > prev_error_count; + } else { + warning(lineno, "Unknown section %s, ignoring\n", name); + return 1; + } + } + static void asm_section(char *name, VECTOR(value_t) args, value_t data) { + if (Section *sec = get(name)) sec->input(args, data); + } + static void process_all() { + if (sections) + for (auto &it : *sections) it.second->process(); + } + static void output_all(json::map &ctxtJson) { + if (sections) { + for (auto &it : *sections) { + // Skip primitives to be called last + if (it.first == "primitives") continue; + 
it.second->output(ctxtJson); + } + auto &s = *sections; + if (s.count("primitives")) s["primitives"]->output(ctxtJson); + } + } + static bool no_sections_in_assembly() { + if (sections) { + for (auto &it : *sections) { + if (it.second->isInput) return false; + } + } + return true; + } + static bool section_in_assembly(const char *name) { return get(name)->isInput; } + + public: // for gtest + static Section *test_get(const char *name) { return get(name); } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_SECTIONS_H_ */ diff --git a/backends/tofino/bf-asm/selection.cpp b/backends/tofino/bf-asm/selection.cpp new file mode 100644 index 00000000000..e4fafd0441a --- /dev/null +++ b/backends/tofino/bf-asm/selection.cpp @@ -0,0 +1,449 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "data_switchbox.h" +#include "input_xbar.h" +#include "lib/algorithm.h" +#include "misc.h" + +void SelectionTable::setup(VECTOR(pair_t) & data) { + setup_layout(layout, data); + VECTOR(pair_t) p4_info = EMPTY_VECTOR_INIT; + for (auto &kv : MapIterChecked(data, true)) { + if (kv.key == "input_xbar") { + if (CHECKTYPE(kv.value, tMAP)) + input_xbar.emplace_back(InputXbar::create(this, false, kv.key, kv.value.map)); + } else if (kv.key == "mode") { + mode_lineno = kv.value.lineno; + if (CHECKTYPEPM(kv.value, tCMD, kv.value.vec.size == 2 && kv.value[1].type == tINT, + "hash mode and int param")) { + if (kv.value[0] == "resilient") + resilient_hash = true; + else if (kv.value[0] == "fair") + resilient_hash = false; + else + error(kv.value.lineno, "Unknown hash mode %s", kv.value[0].s); + param = kv.value[1].i; + } + } else if (kv.key == "non_linear") { + non_linear_hash = get_bool(kv.value); + } else if (kv.key == "per_flow_enable") { + if (CHECKTYPE(kv.value, tSTR)) { + per_flow_enable = true; + per_flow_enable_param = kv.value.s; + } + } else if (kv.key == "pool_sizes") { + if (CHECKTYPE(kv.value, tVEC)) + for (value_t &v : kv.value.vec) + if (CHECKTYPE(v, tINT)) pool_sizes.push_back(v.i); + } else if (kv.key == "selection_hash") { + if (CHECKTYPE(kv.value, tINT)) selection_hash = kv.value.i; + } else if (kv.key == "hash_dist") { + HashDistribution::parse(hash_dist, kv.value); + if (hash_dist.size() > 1) + error(kv.key.lineno, "More than one hast_dist in a selection table not supported"); + } else if (kv.key == "maprams") { + setup_maprams(kv.value); + } else if (kv.key == "p4") { + if (CHECKTYPE(kv.value, tMAP)) + p4_table = P4Table::get(P4Table::Selection, kv.value.map); + } else if (kv.key == "p4_table") { + push_back(p4_info, "name", std::move(kv.value)); + } else if (kv.key == "p4_table_size") { + 
push_back(p4_info, "size", std::move(kv.value)); + } else if (kv.key == "handle") { + push_back(p4_info, "handle", std::move(kv.value)); + } else if (kv.key == "context_json") { + setup_context_json(kv.value); + } else if (kv.key == "row" || kv.key == "logical_row" || kv.key == "column" || + kv.key == "bus") { + /* already done in setup_layout */ + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (p4_info.size) { + if (p4_table) + error(p4_info[0].key.lineno, "old and new p4 table info in %s", name()); + else + p4_table = P4Table::get(P4Table::Selection, p4_info); + } + fini(p4_info); + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_srams(); + else + alloc_rams(true, stage->sram_use); +} + +void SelectionTable::pass1() { + LOG1("### Selection table " << name() << " pass1 " << loc()); + if (!p4_table) + p4_table = P4Table::alloc(P4Table::Selection, this); + else + p4_table->check(this); + alloc_vpns(); + alloc_maprams(); + std::sort(layout.begin(), layout.end(), + [](const Layout &a, const Layout &b) -> bool { return a.row > b.row; }); + for (auto &ixb : input_xbar) ixb->pass1(); + if (param < 0 || param > (resilient_hash ? 7 : 2)) + error(mode_lineno, "Invalid %s hash param %d", resilient_hash ? 
"resilient" : "fair", + param); + min_words = INT_MAX; + max_words = 0; + if (pool_sizes.empty()) { + min_words = max_words = 1; + } else { + for (int size : pool_sizes) { + int words = (size + SELECTOR_PORTS_PER_WORD - 1) / SELECTOR_PORTS_PER_WORD; + if (words < min_words) min_words = words; + if (words > max_words) max_words = words; + } + } + stage->table_use[timing_thread(gress)] |= Stage::USE_SELECTOR; + if (max_words > 1) { + stage->table_use[timing_thread(gress)] |= Stage::USE_WIDE_SELECTOR; + for (auto &hd : hash_dist) hd.xbar_use |= HashDistribution::HASHMOD_DIVIDEND; + } + for (auto &hd : hash_dist) hd.pass1(this, HashDistribution::SELECTOR, non_linear_hash); + bool home = true; // first layout row is home row + for (Layout &row : layout) { + if (home) + need_bus(row.lineno, stage->selector_adr_bus_use, row.row | 3, "Selector Address"); + need_bus(row.lineno, stage->selector_adr_bus_use, row.row, "Selector Address"); + if ((row.row & 2) == 0) // even phy rows wired together + need_bus(row.lineno, stage->selector_adr_bus_use, row.row ^ 1, "Selector Address"); + home = false; + } + AttachedTable::pass1(); +} + +void SelectionTable::pass2() { + LOG1("### Selection table " << name() << " pass2 " << loc()); + for (auto &ixb : input_xbar) { + ixb->pass2(); + if (selection_hash < 0 && (selection_hash = ixb->hash_group()) < 0) + error(lineno, "No selection_hash in selector table %s", name()); + } + if (input_xbar.empty()) { + error(lineno, "No input xbar in selector table %s", name()); + } + for (auto &hd : hash_dist) hd.pass2(this); +} + +void SelectionTable::pass3() { LOG1("### Selection table " << name() << " pass3 " << loc()); } + +int SelectionTable::indirect_shiftcount() const { + return METER_ADDRESS_ZERO_PAD - 7; // selectors always start at bit 7 address +} + +unsigned SelectionTable::per_flow_enable_bit(MatchTable *match) const { + if (!per_flow_enable) + return SELECTOR_PER_FLOW_ENABLE_START_BIT; + else + return 
AttachedTable::per_flow_enable_bit(match); +} + +unsigned SelectionTable::determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) const { + return determine_meter_shiftcount(call, group, word, tcam_shift); +} + +template +void SelectionTable::write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args) { + auto &merge = regs.rams.match.merge; + setup_physical_alu_map(regs, type, bus, meter_group()); + merge.mau_payload_shifter_enable[type][bus].meter_adr_payload_shifter_en = 1; + + unsigned adr_mask = 0U; + unsigned per_entry_en_mux_ctl = 0U; + unsigned adr_default = 0U; + unsigned meter_type_position = 0U; + AttachedTable::determine_meter_merge_regs(match, type, bus, args, METER_SELECTOR, adr_mask, + per_entry_en_mux_ctl, adr_default, + meter_type_position); + merge.mau_meter_adr_default[type][bus] = adr_default; + merge.mau_meter_adr_mask[type][bus] = adr_mask; + merge.mau_meter_adr_per_entry_en_mux_ctl[type][bus] = per_entry_en_mux_ctl; + merge.mau_meter_adr_type_position[type][bus] = meter_type_position; +} + +/** + * This validates the call as the value to the selection_length key. The call requires + * two arguments: + * + * 1. A selector length mod argument + * 2. A selector length shift argument + * + * This is formatted in the following way: + * (msb_side) {shift, mod} (lsb_side) + * + * These can come from match overhead, or can come from $DEFAULT + * + * In actuality, both of these arguments are extracted by the same extractor, and they must + * be contiguous to each other. The reason for the separation is that the driver requires + * them to be separated in the pack format. 
+ */ +bool SelectionTable::validate_length_call(const Table::Call &call) { + if (call.args.size() != 2) { + error(call.lineno, "The selector length call for %s requires two arguments", name()); + return false; + } + + if (call.args[0].name()) { + if (call.args[0] != "$DEFAULT") { + error(call.lineno, "Index %s for %s length cannot be found", call.args[0].name(), + name()); + return false; + } + } else if (!call.args[0].field()) { + error(call.lineno, "Index for %s length cannot be understood", name()); + return false; + } + + if (call.args[1].name()) { + if (call.args[1] != "$DEFAULT") { + error(call.lineno, "Index %s for %s length cannot be found", call.args[0].name(), + name()); + return false; + } + } else if (!call.args[1].field()) { + error(call.lineno, "Index for %s length cannot be understood", name()); + return false; + } + + if (call.args[0].field() && call.args[1].field()) { + auto mod = call.args[0].field(); + auto shift = call.args[1].field(); + + if (mod->bit(0) + mod->size != shift->bit(0)) { + error(call.lineno, "Indexes for %s must be contiguous on the format", name()); + return false; + } + } + return true; +} + +unsigned SelectionTable::determine_length_shiftcount(const Table::Call &call, int group, + int word) const { + if (auto f = call.args[0].field()) { + BUG_CHECK(f->by_group[group]->bit(0) / 128 == word && group == 0); + BUG_CHECK(f->by_group[group]->bit(0) % 128 <= 8); + return f->by_group[group]->bit(0) % 128U; + } + return 0; +} + +unsigned SelectionTable::determine_length_mask(const Table::Call &call) const { + unsigned rv = 0; + if (auto f = call.args[0].field()) rv |= ((1U << f->size) - 1); + if (auto f = call.args[1].field()) rv |= ((1U << f->size) - 1) << SELECTOR_LENGTH_MOD_BITS; + return rv; +} + +unsigned SelectionTable::determine_length_default(const Table::Call &call) const { + unsigned rv = 0; + if (call.args[0].name() && strcmp(call.args[0].name(), "$DIRECT") == 0) rv = 1; + return rv; +} + +template <> +void 
SelectionTable::setup_physical_alu_map(Target::Tofino::mau_regs ®s, int type, int bus, + int alu) { + auto &merge = regs.rams.match.merge; + merge.mau_physical_to_meter_alu_ixbar_map[type][bus / 8U].set_subfield(4 | alu, 3 * (bus % 8U), + 3); +} +template <> +void SelectionTable::setup_physical_alu_map(Target::JBay::mau_regs ®s, int type, int bus, + int alu) { + auto &merge = regs.rams.match.merge; + merge.mau_physical_to_meter_alu_icxbar_map[type][bus / 8U] |= (1U << alu) << (4 * (bus % 8U)); +} + +template +void SelectionTable::write_regs_vt(REGS ®s) { + LOG1("### Selection table " << name() << " write_regs " << loc()); + for (auto &ixb : input_xbar) ixb->write_regs(regs); + Layout *home = &layout[0]; + bool push_on_overflow = false; + auto &map_alu = regs.rams.map_alu; + DataSwitchboxSetup swbox(regs, this); + int minvpn, maxvpn; + layout_vpn_bounds(minvpn, maxvpn, true); + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + for (Layout &logical_row : layout) { + unsigned row = logical_row.row / 2U; + unsigned side = logical_row.row & 1; /* 0 == left 1 == right */ + /* FIXME factor vpn/mapram stuff with counter.cpp */ + auto vpn = logical_row.vpns.begin(); + auto mapram = logical_row.maprams.begin(); + auto &map_alu_row = map_alu.row[row]; + LOG2("# DataSwitchbox.setup(" << row << ") home=" << home->row / 2U); + swbox.setup_row(row); + for (auto &memunit : logical_row.memunits) { + BUG_CHECK(memunit.stage == INT_MIN && memunit.row == logical_row.row, + "bogus %s in logical row %d", memunit.desc(), logical_row.row); + unsigned col = memunit.col + 6 * side; + swbox.setup_row_col(row, col, *vpn); + write_mapram_regs(regs, row, *mapram, *vpn, MapRam::SELECTOR_SIZE); + if (gress) regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row); + ++mapram, ++vpn; + } + if (&logical_row == home) { + auto &vh_adr_xbar = regs.rams.array.row[row].vh_adr_xbar; + setup_muxctl( + 
vh_adr_xbar.exactmatch_row_hashadr_xbar_ctl[SELECTOR_VHXBAR_HASH_BUS_INDEX], + selection_hash); + vh_adr_xbar.alu_hashdata_bytemask.alu_hashdata_bytemask_right = + bitmask2bytemask(input_xbar[0]->hash_group_bituse()); + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_base = minvpn; + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_limit = maxvpn; + } else { + auto &adr_ctl = map_alu_row.vh_xbars.adr_dist_oflo_adr_xbar_ctl[side]; + if (home->row >= 8 && logical_row.row < 8) { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = 0; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::OVERFLOW; + push_on_overflow = true; + BUG_CHECK(options.target == TOFINO); + } else { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = home->row % 8; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::METER; + } + adr_ctl.adr_dist_oflo_adr_xbar_enable = 1; + } + } + + unsigned meter_group = home->row / 4U; + auto &selector_ctl = map_alu.meter_group[meter_group].selector.selector_alu_ctl; + selector_ctl.sps_nonlinear_hash_enable = non_linear_hash ? 1 : 0; + if (resilient_hash) + selector_ctl.resilient_hash_enable = param; + else + selector_ctl.selector_fair_hash_select = param; + selector_ctl.resilient_hash_mode = resilient_hash ? 1 : 0; + selector_ctl.selector_enable = 1; + auto &delay_ctl = map_alu.meter_alu_group_data_delay_ctl[meter_group]; + delay_ctl.meter_alu_right_group_delay = + Target::METER_ALU_GROUP_DATA_DELAY() + meter_group / 2 + stage->tcam_delay(gress); + delay_ctl.meter_alu_right_group_enable = + meter_alu_fifo_enable_from_mask(regs, resilient_hash ? 
0x7f : 0x3);
+    /* FIXME -- error_ctl should be configurable */
+    auto &error_ctl = map_alu.meter_alu_group_error_ctl[meter_group];
+    error_ctl.meter_alu_group_ecc_error_enable = 1;
+    error_ctl.meter_alu_group_sel_error_enable = 1;
+    error_ctl.meter_alu_group_thread = gress;
+
+    auto &merge = regs.rams.match.merge;
+    auto &adrdist = regs.rams.match.adrdist;
+    for (MatchTable *m : match_tables) {
+        adrdist.adr_dist_meter_adr_icxbar_ctl[m->logical_id] |= 1 << meter_group;
+        // auto &icxbar = adrdist.adr_dist_meter_adr_icxbar_ctl[m->logical_id];
+        // icxbar.address_distr_to_logical_rows = 1 << home->row;
+        // icxbar.address_distr_to_overflow = push_on_overflow;
+        if (auto &act = m->get_action()) {
+            /* FIXME -- can't be attached to multiple tables ? */
+            unsigned fmt = 3;
+            fmt = std::max(fmt, act->format->log2size);
+            if (auto at = dynamic_cast(&(*act)))
+                for (auto &f : at->get_action_formats()) fmt = std::max(fmt, f.second->log2size);
+            merge.mau_selector_action_entry_size[meter_group] = fmt - 3;
+        }  // val in bytes
+        adrdist.mau_ad_meter_virt_lt[meter_group] |= 1U << m->logical_id;
+        adrdist.movereg_ad_meter_alu_to_logical_xbar_ctl[m->logical_id / 8U].set_subfield(
+            4 | meter_group, 3 * (m->logical_id % 8U), 3);
+        setup_logical_alu_map(regs, m->logical_id, meter_group);
+    }
+    if (max_words == 1) adrdist.movereg_meter_ctl[meter_group].movereg_ad_meter_shift = 7;
+    if (push_on_overflow) {
+        adrdist.oflo_adr_user[0] = adrdist.oflo_adr_user[1] = AdrDist::METER;
+        adrdist.deferred_oflo_ctl = 1 << ((home->row - 8) / 2U);
+    }
+    adrdist.packet_action_at_headertime[1][meter_group] = 1;
+    for (auto &hd : hash_dist) hd.write_regs(regs, this);
+    if (gress == INGRESS || gress == GHOST) {
+        merge.meter_alu_thread[0].meter_alu_thread_ingress |= 1U << meter_group;
+        merge.meter_alu_thread[1].meter_alu_thread_ingress |= 1U << meter_group;
+    } else if (gress == EGRESS) {
+        merge.meter_alu_thread[0].meter_alu_thread_egress |= 1U << meter_group;
+
merge.meter_alu_thread[1].meter_alu_thread_egress |= 1U << meter_group; + } + if (gress == EGRESS) { + regs.rams.map_alu.meter_group[meter_group].meter.meter_ctl.meter_alu_egress = 1; + } +} + +template <> +void SelectionTable::setup_logical_alu_map(Target::Tofino::mau_regs ®s, int logical_id, + int alu) { + auto &merge = regs.rams.match.merge; + if (max_words > 1) merge.mau_logical_to_meter_alu_map.set_subfield(16 | logical_id, 5 * alu, 5); + merge.mau_meter_alu_to_logical_map[logical_id / 8U].set_subfield(4 | alu, 3 * (logical_id % 8U), + 3); +} +template <> +void SelectionTable::setup_logical_alu_map(Target::JBay::mau_regs ®s, int logical_id, int alu) { + auto &merge = regs.rams.match.merge; + merge.mau_logical_to_meter_alu_map[logical_id / 8U] |= (1U << alu) << ((logical_id % 8U) * 4); + merge.mau_meter_alu_to_logical_map[logical_id / 8U].set_subfield(4 | alu, 3 * (logical_id % 8U), + 3); +} + +std::vector SelectionTable::determine_spare_bank_memory_units() const { + if (bound_stateful) return bound_stateful->determine_spare_bank_memory_units(); + return {}; +} + +void SelectionTable::gen_tbl_cfg(json::vector &out) const { + // Stage table size reflects how many RAM lines are available for the selector, according + // to henry wang. + int size = (layout_size() - 1) * 1024; + json::map &tbl = *base_tbl_cfg(out, "selection", size); + tbl["selection_type"] = resilient_hash ? "resilient" : "fair"; + tbl["selector_name"] = p4_table ? p4_table->p4_name() : "undefined"; + tbl["selection_key_name"] = "undefined"; // FIXME! + std::string hr = how_referenced(); + if (hr.empty()) hr = indirect ? 
"indirect" : "direct"; + tbl["how_referenced"] = hr; + if (pool_sizes.size() > 0) + tbl["max_port_pool_size"] = *std::max_element(std::begin(pool_sizes), std::end(pool_sizes)); + for (MatchTable *m : match_tables) { + if (auto &act = m->get_action()) { + if (auto at = dynamic_cast(&(*act))) { + tbl["bound_to_action_data_table_handle"] = act->handle(); + break; + } + } + } + json::map &stage_tbl = *add_stage_tbl_cfg(tbl, "selection", size); + add_pack_format(stage_tbl, 128, 1, 1); + stage_tbl["memory_resource_allocation"] = + gen_memory_resource_allocation_tbl_cfg("sram", layout, bound_stateful != nullptr); + add_alu_index(stage_tbl, "meter_alu_index"); + stage_tbl["sps_scramble_enable"] = non_linear_hash; + if (context_json) stage_tbl.merge(*context_json); +} + +DEFINE_TABLE_TYPE(SelectionTable) +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void SelectionTable::write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + { write_merge_regs_vt(regs, match, type, bus, args); }) diff --git a/backends/tofino/bf-asm/slist.h b/backends/tofino/bf-asm/slist.h new file mode 100644 index 00000000000..d68a7dd6bad --- /dev/null +++ b/backends/tofino/bf-asm/slist.h @@ -0,0 +1,52 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_SLIST_H_
+#define BACKENDS_TOFINO_BF_ASM_SLIST_H_
+
+template
+class slist {  // minimal singly-linked "cons" list: a node prepends one value to a shared tail
+    const slist *next;
+    T value;
+
+ public:
+    explicit slist(T v) : next(nullptr), value(v) {}  // single-element list
+    slist(T v, const slist *n) : next(n), value(v) {}  // prepend v to list n (n may be shared)
+    typedef T value_type;
+    class iterator : public std::iterator {  // NOTE(review): std::iterator is deprecated since C++17; prefer explicit member typedefs for the iterator traits
+        friend class slist;
+        const slist *ptr;
+        iterator() : ptr(nullptr) {}  // private: past-the-end sentinel, only slist::end() makes it
+        explicit iterator(const slist *p) : ptr(p) {}
+
+     public:
+        iterator &operator++() {
+            ptr = ptr->next;
+            return *this;
+        }
+        bool operator==(const iterator &a) const { return ptr == a.ptr; }
+        bool operator!=(const iterator &a) const { return ptr != a.ptr; }
+        const T &operator*() const { return ptr->value; }
+        const T *operator->() const { return &ptr->value; }
+    };
+    typedef iterator const_iterator;  // traversal is read-only, so iterator == const_iterator
+
+    iterator begin() const { return iterator(this); }  // the list object itself is the head node
+    iterator end() const { return iterator(); }  // null node pointer marks past-the-end
+};
+
+#endif /* BACKENDS_TOFINO_BF_ASM_SLIST_H_ */
diff --git a/backends/tofino/bf-asm/sram_match.cpp b/backends/tofino/bf-asm/sram_match.cpp
new file mode 100644
index 00000000000..697a796d028
--- /dev/null
+++ b/backends/tofino/bf-asm/sram_match.cpp
@@ -0,0 +1,1505 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/action_bus.h" +#include "backends/tofino/bf-asm/input_xbar.h" +#include "backends/tofino/bf-asm/instruction.h" +#include "backends/tofino/bf-asm/mask_counter.h" +#include "backends/tofino/bf-asm/misc.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "lib/algorithm.h" +#include "lib/hex.h" + +Table::Format::Field *SRamMatchTable::lookup_field(const std::string &n, + const std::string &act) const { + auto *rv = format ? format->field(n) : nullptr; + if (!rv && gateway) rv = gateway->lookup_field(n, act); + if (!rv && !act.empty()) { + if (auto call = get_action()) rv = call->lookup_field(n, act); + } + if (!rv && n == "immediate" && !::Phv::get(gress, stage->stageno, n)) { + static Format::Field default_immediate(nullptr, 32, Format::Field::USED_IMMED); + rv = &default_immediate; + } + return rv; +} + +const char *SRamMatchTable::Ram::desc() const { + static char buffer[256], *p = buffer; + char *end = buffer + sizeof(buffer), *rv; + do { + if (end - p < 7) p = buffer; + rv = p; + if (stage >= 0) + p += snprintf(p, end - p, "Ram %d,%d,%d", stage, row, col); + else if (row >= 0) + p += snprintf(p, end - p, "Ram %d,%d", row, col); + else + p += snprintf(p, end - p, "Lamb %d", col); + } while (p++ >= end); + return rv; +} + +/* calculate the 18-bit byte/nybble mask tofino uses for matching in a 128-bit word */ +static unsigned tofino_bytemask(int lo, int hi) { + unsigned rv = 0; + for (unsigned i = lo / 4U; i <= hi / 4U; i++) rv |= 1U << (i < 28 ? i / 2 : i - 14); + return rv; +} + +/** + * Determining the result bus for an entry, if that entry has no overhead. The result bus + * is still needed to get the direct address location to find action data / run an + * instruction, etc. 
+ * + * This section maps the allocation scheme used in the TableFormat::Use in p4c, found + * in the function result_bus_words + */ +void SRamMatchTable::no_overhead_determine_result_bus_usage() { + bitvec result_bus_words; + + for (int i = 0; i < static_cast(group_info.size()); i++) { + BUG_CHECK(group_info[i].overhead_word < 0); + if (group_info[i].match_group.size() == 1) { + group_info[i].result_bus_word = group_info[i].match_group.begin()->first; + result_bus_words.setbit(group_info[i].result_bus_word); + } + } + + for (int i = 0; i < static_cast(group_info.size()); i++) { + if (group_info[i].overhead_word < 0 && group_info[i].match_group.size() > 1) { + bool result_bus_set = false; + for (auto match_group : group_info[i].match_group) { + if (result_bus_words.getbit(match_group.first)) { + group_info[i].result_bus_word = match_group.first; + result_bus_set = true; + } + } + if (!result_bus_set) + group_info[i].result_bus_word = group_info[i].match_group.begin()->first; + LOG1(" format group " << i << " no overhead multiple match groups"); + } + } +} + +void SRamMatchTable::verify_format(Target::Tofino) { + if (format->log2size < 7) format->log2size = 7; + format->pass1(this); + group_info.resize(format->groups()); + unsigned fmt_width = (format->size + 127) / 128; + if (word_info.size() > fmt_width) { + warning(format->lineno, "Match group map wider than format, padding out format"); + format->size = word_info.size() * 128; + fmt_width = word_info.size(); + while ((1U << format->log2size) < format->size) ++format->log2size; + } + for (unsigned i = 0; i < format->groups(); i++) { + auto &info = group_info[i]; + info.tofino_mask.resize(fmt_width); + if (Format::Field *match = format->field("match", i)) { + for (auto &piece : match->bits) { + unsigned word = piece.lo / 128; + if (word != piece.hi / 128) + error(format->lineno, + "'match' field must be explictly split across " + "128-bit boundary in table %s", + name()); + info.tofino_mask[word] |= 
tofino_bytemask(piece.lo % 128, piece.hi % 128); + info.match_group[word] = -1; + } + } + if (auto *version = format->field("version", i)) { + if (version->bits.size() != 1) error(format->lineno, "'version' field cannot be split"); + auto &piece = version->bits[0]; + unsigned word = piece.lo / 128; + if (version->size != 4 || (piece.lo % 4) != 0) + error(format->lineno, + "'version' field not 4 bits and nibble aligned " + "in table %s", + name()); + info.tofino_mask[word] |= tofino_bytemask(piece.lo % 128, piece.hi % 128); + info.match_group[word] = -1; + } + for (unsigned j = 0; j < i; j++) + for (unsigned word = 0; word < fmt_width; word++) + if (group_info[j].tofino_mask[word] & info.tofino_mask[word]) { + int bit = ffs(group_info[j].tofino_mask[word] & info.tofino_mask[word]) - 1; + if (bit >= 14) bit += 14; + error(format->lineno, "Match groups %d and %d both use %s %d in word %d", i, j, + bit > 20 ? "nibble" : "byte", bit, word); + break; + } + for (auto it = format->begin(i); it != format->end(i); it++) { + Format::Field &f = it->second; + if (it->first == "match" || it->first == "version" || it->first == "proxy_hash") + continue; + if (f.bits.size() != 1) { + error(format->lineno, "Can't deal with split field %s", it->first.c_str()); + continue; + } + unsigned limit = Target::MAX_OVERHEAD_OFFSET(); + if (it->first == "next") limit = Target::MAX_OVERHEAD_OFFSET_NEXT(); + unsigned word = f.bit(0) / 128; + if (info.overhead_word < 0) { + info.overhead_word = word; + format->overhead_word = word; + LOG5("Setting overhead word for format : " << word); + info.overhead_bit = f.bit(0) % 128; + info.match_group[word] = -1; + } else if (info.overhead_word != static_cast(word)) { + error(format->lineno, "Match overhead group %d split across words", i); + } else if (word != f.bit(f.size - 1) / 128 || f.bit(f.size - 1) % 128 >= limit) { + error(format->lineno, "Match overhead field %s(%d) not in bottom %d bits", + it->first.c_str(), i, limit); + } + if 
(!info.match_group.count(word)) + error(format->lineno, "Match overhead in group %d in word with no match?", i); + if ((unsigned)info.overhead_bit > f.bit(0) % 128) info.overhead_bit = f.bit(0) % 128; + } + info.vpn_offset = i; + } + if (word_info.empty()) { + word_info.resize(fmt_width); + if (format->field("next")) { + /* 'next' for match group 0 must be in bit 0, so make the format group with + * overhead in bit 0 match group 0 in its overhead word */ + for (unsigned i = 0; i < group_info.size(); i++) { + if (group_info[i].overhead_bit == 0) { + BUG_CHECK(error_count > 0 || word_info[group_info[i].overhead_word].empty()); + group_info[i].match_group[group_info[i].overhead_word] = 0; + word_info[group_info[i].overhead_word].push_back(i); + } + } + } + for (unsigned i = 0; i < group_info.size(); i++) { + if (group_info[i].match_group.size() > 1) { + for (auto &mgrp : group_info[i].match_group) { + if (mgrp.second >= 0) continue; + if ((mgrp.second = word_info[mgrp.first].size()) > 1) + error(format->lineno, "Too many multi-word groups using word %d", + mgrp.first); + word_info[mgrp.first].push_back(i); + } + } + } + } else { + if (word_info.size() != fmt_width) + error(mgm_lineno, "Match group map doesn't match format size"); + for (unsigned i = 0; i < word_info.size(); i++) { + for (unsigned j = 0; j < word_info[i].size(); j++) { + int grp = word_info[i][j]; + if (grp < 0 || (unsigned)grp >= format->groups()) { + error(mgm_lineno, "Invalid group number %d", grp); + } else if (!group_info[grp].match_group.count(i)) { + error(mgm_lineno, "Format group %d doesn't match in word %d", grp, i); + } else { + group_info[grp].match_group[i] = j; + auto *next = format->field("next", grp); + if (!next && hit_next.size() > 1) next = format->field("action", grp); + if (next) { + if (next->bit(0) / 128 != i) continue; + static unsigned limit[5] = {0, 8, 32, 32, 32}; + unsigned bit = next->bit(0) % 128U; + if (!j && bit) + error(mgm_lineno, + "Next(%d) field must start at bit %d 
to be in " + "match group 0", + grp, i * 128); + else if (j && (!bit || bit > limit[j])) + warning(mgm_lineno, + "Next(%d) field must start in range %d..%d " + "to be in match group %d", + grp, i * 128 + 1, i * 128 + limit[j], j); + } + } + } + } + } + if (hit_next.size() > 1 && !format->field("next") && !format->field("action")) + error(format->lineno, "No 'next' field in format"); + if (error_count > 0) return; + + for (int i = 0; i < static_cast(group_info.size()); i++) { + if (group_info[i].match_group.size() == 1) { + for (auto &mgrp : group_info[i].match_group) { + if (mgrp.second >= 0) continue; + if ((mgrp.second = word_info[mgrp.first].size()) > 4) + error(format->lineno, "Too many match groups using word %d", mgrp.first); + word_info[mgrp.first].push_back(i); + } + } + // Determining the result bus word, where the overhead is supposed to be + } + + bool has_overhead_word = false; + bool overhead_word_set = false; + for (int i = 0; i < static_cast(group_info.size()); i++) { + if (overhead_word_set) BUG_CHECK((group_info[i].overhead_word >= 0) == has_overhead_word); + if (group_info[i].overhead_word >= 0) { + has_overhead_word = true; + group_info[i].result_bus_word = group_info[i].overhead_word; + } + overhead_word_set = true; + } + + if (!has_overhead_word) no_overhead_determine_result_bus_usage(); + + /** + * Determining the result bus for an entry, if that entry has no overhead. The result bus + * is still needed to get the direct address location to find action data / run an + * instruction, etc. 
+ * + * This section maps the allocation scheme used in the TableFormat::Use in p4c, found + * in the function result_bus_words + */ + + for (int i = 0; i < static_cast(group_info.size()); i++) { + LOG1(" masks: " << hexvec(group_info[i].tofino_mask)); + for (auto &mgrp : group_info[i].match_group) + LOG1(" match group " << mgrp.second << " in word " << mgrp.first); + } + + for (unsigned i = 0; i < word_info.size(); i++) + LOG1(" word " << i << " groups: " << word_info[i]); + if (options.match_compiler && 0) { + /* hack to match the compiler's nibble usage -- if any of the top 4 nibbles is + * unused in a word, mark it as used by any group that uses the other nibble of the + * byte, UNLESS it is used for the version. This is ok, as the unused nibble will + * end up being masked off by the match_mask anyways */ + for (unsigned word = 0; word < word_info.size(); word++) { + unsigned used_nibbles = 0; + for (auto group : word_info[word]) + used_nibbles |= group_info[group].tofino_mask[word] >> 14; + for (unsigned nibble = 0; nibble < 4; nibble++) { + if (!((used_nibbles >> nibble) & 1) && ((used_nibbles >> (nibble ^ 1)) & 1)) { + LOG1(" ** fixup nibble " << nibble << " in word " << word); + for (auto group : word_info[word]) + if ((group_info[group].tofino_mask[word] >> (14 + (nibble ^ 1))) & 1) { + if (auto *version = format->field("version", group)) { + if (version->bit(0) == word * 128 + (nibble ^ 1) * 4 + 112) { + LOG1(" skip group " << group << " (version)"); + continue; + } + } + group_info[group].tofino_mask[word] |= 1 << (14 + nibble); + LOG1(" adding to group " << group); + } + } + } + } + } + + verify_match(fmt_width); +} + +void SRamMatchTable::verify_format_pass2(Target::Tofino) {} + +/** + * Guarantees that each match field is a PHV field, which is the standard unless the table is + * a proxy hash table. 
+ */ +bool SRamMatchTable::verify_match_key() { + for (auto *match_key : match) { + auto phv_p = dynamic_cast(match_key); + if (phv_p == nullptr) { + error(match_key->get_lineno(), "A non PHV match key in table %s", name()); + continue; + } + auto phv_ref = *phv_p; + if (phv_ref.check() && phv_ref->reg.mau_id() < 0) + error(phv_ref.lineno, "%s not accessable in mau", phv_ref->reg.name); + } + auto match_format = format->field("match"); + if (match_format && match.empty()) { + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + for (auto ixbar_element : *input_xbar[0]) { + match.emplace_back(new Phv::Ref(ixbar_element.second.what)); + } + } + return error_count == 0; +} + +std::unique_ptr SRamMatchTable::gen_memory_resource_allocation_tbl_cfg( + const Way &way) const { + json::map mra; + unsigned vpn_ctr = 0; + unsigned fmt_width = format ? (format->size + 127) / 128 : 0; + unsigned ramdepth = way.isLamb() ? LAMB_DEPTH_BITS : SRAM_DEPTH_BITS; + if (hash_fn_ids.count(way.group_xme) > 0) + mra["hash_function_id"] = hash_fn_ids.at(way.group_xme); + mra["hash_entry_bit_lo"] = way.index; + mra["hash_entry_bit_hi"] = way.index + ramdepth + way.subword_bits - 1; + mra["number_entry_bits"] = ramdepth; + mra["number_subword_bits"] = way.subword_bits; + if (way.select) { + int lo = way.select.min().index(), hi = way.select.max().index(); + mra["hash_select_bit_lo"] = lo; + mra["hash_select_bit_hi"] = hi; + if (way.select.popcount() != hi - lo + 1) { + warning(way.lineno, "driver does not support discontinuous bits in a way mask"); + mra["hash_select_bit_mask"] = way.select.getrange(lo, 32); + } + } else { + mra["hash_select_bit_lo"] = mra["hash_select_bit_hi"] = 40; + } + mra["number_select_bits"] = way.select.popcount(); + mra["memory_type"] = way.isLamb() ? 
"lamb" : "sram"; + json::vector mem_units; + json::vector &mem_units_and_vpns = mra["memory_units_and_vpns"] = json::vector(); + int way_uses_lambs = -1; // don't know yet + for (auto &ram : way.rams) { + if (ram.isLamb()) { + BUG_CHECK(way_uses_lambs != 0, "mixed lambs and memories in a way"); + way_uses_lambs = 1; + } else { + BUG_CHECK(way_uses_lambs != 1, "mixed lambs and memories in a way"); + way_uses_lambs = 0; + if (mem_units.empty()) + vpn_ctr = layout_get_vpn(ram); + else + BUG_CHECK(vpn_ctr == layout_get_vpn(ram)); + } + mem_units.push_back(json_memunit(ram)); + if (mem_units.size() == fmt_width) { + json::map tmp; + tmp["memory_units"] = std::move(mem_units); + mem_units = json::vector(); + json::vector vpns; + for (auto &grp : group_info) vpns.push_back(vpn_ctr + grp.vpn_offset); + vpn_ctr += group_info.size(); + if (group_info.empty()) // FIXME -- can this happen? + vpns.push_back(vpn_ctr++); + tmp["vpns"] = std::move(vpns); + mem_units_and_vpns.push_back(std::move(tmp)); + } + } + BUG_CHECK(mem_units.empty()); + return json::mkuniq(std::move(mra)); +} + +/** + * The purpose of this function is to generate the hash_functions JSON node. The hash functions + * are for the driver to determine what RAM/RAM line to write the match data into during entry + * adds. + * + * The JSON nodes for the hash functions are the following: + * - hash_bits - A vector determining what each bit is calculated from. Look at the comments + * over the function gen_hash_bits + * The following two fields are required for High Availability mode and Entry Reads from HW + * - ghost_bit_to_hash_bit - A vector describing where the ghost bits are in the hash matrix + * - ghost_bit_info - A vector indicating which p4 fields are used as the ghost bits + * The following field is only necessary for dynamic_key_masks + * - hash_function_number - which of the 8 hash functions this table is using. 
+ *
+ * The order of the hash functions must coordinate to the order of the hash_function_ids used
+ * in the Way JSON, as this is how a single way knows which hash function to use for its lookup
+ */
+void SRamMatchTable::add_hash_functions(json::map &stage_tbl) const {
+    BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name());
+    auto &ht = input_xbar[0]->get_hash_tables();
+    if (ht.size() == 0) return;
+    // Output cjson node only if hash tables present
+    std::map<int, bitvec> hash_bits_per_group;
+    for (auto &way : ways) {
+        int depth = way.isLamb() ? LAMB_DEPTH_BITS : SRAM_DEPTH_BITS;
+        if (format->field("match")) {
+            // cuckoo or BPH
+        } else {
+            depth += ceil_log2(format->groups());
+            if (format->size < 128) depth += 7 - ceil_log2(format->size);
+        }
+        bitvec way_impact;
+        way_impact.setrange(way.index, depth);
+        way_impact |= way.select;
+        hash_bits_per_group[way.group_xme] |= way_impact;
+    }
+
+    // Order so that the order is the same of the hash_function_ids in the ways
+    // FIXME -- this seems pointless, as iterating over a std::map will always be
+    // in order.
So this loop could go away and the later loop be over hash_bits_per_group
+    std::vector<std::pair<int, bitvec>> hash_function_to_hash_bits(hash_fn_ids.size());
+    for (auto entry : hash_bits_per_group) {
+        int hash_fn_id = hash_fn_ids.at(entry.first);
+        if (hash_fn_id >= hash_fn_ids.size()) BUG();
+        hash_function_to_hash_bits[hash_fn_id] = entry;
+    }
+
+    json::vector &hash_functions = stage_tbl["hash_functions"] = json::vector();
+    for (auto entry : hash_function_to_hash_bits) {
+        int hash_group_no = entry.first;
+
+        json::map hash_function;
+        json::vector &hash_bits = hash_function["hash_bits"] = json::vector();
+        hash_function["hash_function_number"] = hash_group_no;
+        json::vector &ghost_bits_to_hash_bits = hash_function["ghost_bit_to_hash_bit"] =
+            json::vector();
+        json::vector &ghost_bits_info = hash_function["ghost_bit_info"] = json::vector();
+        // Get the hash group data
+        if (auto *hash_group = input_xbar[0]->get_hash_group(hash_group_no)) {
+            // Process only hash tables used per hash group
+            for (unsigned id : bitvec(hash_group->tables)) {
+                auto hash_table = input_xbar[0]->get_hash_table(id);
+                gen_hash_bits(hash_table, InputXbar::HashTable(InputXbar::HashTable::EXACT, id),
+                              hash_bits, hash_group_no, entry.second);
+            }
+        } else {
+            for (auto &ht : input_xbar[0]->get_hash_tables())
+                gen_hash_bits(ht.second, ht.first, hash_bits, hash_group_no, entry.second);
+        }
+        gen_ghost_bits(hash_group_no, ghost_bits_to_hash_bits, ghost_bits_info);
+        hash_functions.push_back(std::move(hash_function));
+    }
+}
+
+void SRamMatchTable::verify_match(unsigned fmt_width) {
+    if (!verify_match_key()) return;
+    // Build the match_by_bit
+    unsigned bit = 0;
+    for (auto &r : match) {
+        match_by_bit.emplace(bit, r);
+        bit += r->size();
+    }
+    auto match_format = format->field("match");
+    if ((unsigned)bit != (match_format ? match_format->size : 0))
+        warning(match[0]->get_lineno(),
+                "Match width %d for table %s doesn't match format match "
+                "width %d",
+                bit, name(), match_format ?
match_format->size : 0);
+    match_in_word.resize(fmt_width);
+    for (unsigned i = 0; i < format->groups(); i++) {
+        Format::Field *match = format->field("match", i);
+        if (!match) continue;
+        unsigned bit = 0;
+        for (auto &piece : match->bits) {
+            auto mw = --match_by_bit.upper_bound(bit);
+            int lo = bit - mw->first;
+            while (mw != match_by_bit.end() && mw->first < bit + piece.size()) {
+                if ((piece.lo + mw->first - bit) % 8U != (mw->second->slicelobit() % 8U))
+                    error(mw->second->get_lineno(),
+                          "bit within byte misalignment matching %s in "
+                          "match group %d of table %s",
+                          mw->second->name(), i, name());
+                int hi =
+                    std::min((unsigned)mw->second->size() - 1, bit + piece.size() - mw->first - 1);
+                BUG_CHECK((unsigned)piece.lo / 128 < fmt_width);
+                // merge_phv_vec(match_in_word[piece.lo/128], Phv::Ref(mw->second, lo, hi));
+
+                if (auto phv_p = dynamic_cast<Phv::Ref *>(mw->second)) {
+                    auto phv_ref = *phv_p;
+                    auto vec = split_phv_bytes(Phv::Ref(phv_ref, lo, hi));
+                    for (auto ref : vec) {
+                        match_in_word[piece.lo / 128].emplace_back(new Phv::Ref(ref));
+                    }
+
+                } else if (auto hash_p = dynamic_cast<HashMatchSource *>(mw->second)) {
+                    match_in_word[piece.lo / 128].push_back(new HashMatchSource(*hash_p));
+                } else {
+                    BUG();
+                }
+                lo = 0;
+                ++mw;
+            }
+            bit += piece.size();
+        }
+    }
+    for (unsigned i = 0; i < fmt_width; i++) {
+        std::string match_word_info = "[ ";
+        std::string sep = "";
+        for (auto entry : match_in_word[i]) {
+            match_word_info += sep + entry->toString();
+            sep = ", ";
+        }
+        LOG1(" match in word " << i << ": " << match_word_info);
+    }
+}
+
+bool SRamMatchTable::parse_ram(const value_t &v, std::vector<MemUnit> &res) {
+    if (!CHECKTYPE(v, tVEC)) return true;  // supress added message
+    for (auto &el : v.vec)  // all elements must be positive integers
+        if (el.type != tINT || el.i < 0) return false;
+    switch (v.vec.size) {
+        case 1:  // lamb unit
+            if (v[0].i < Target::SRAM_LAMBS_PER_STAGE()) {
+                res.emplace_back(v[0].i);
+                return true;
+            }
+            break;
+        case 2:  // row, col
+            if
(Target::SRAM_GLOBAL_ACCESS()) break; // stage required + if (v[0].i < Target::SRAM_ROWS(gress) && v[1].i < Target::SRAM_UNITS_PER_ROW()) { + res.emplace_back(v[0].i, v[1].i); + return true; + } + break; + case 3: // stage, row, col + if (Target::SRAM_GLOBAL_ACCESS() && v[0].i < Target::NUM_STAGES(gress) && + v[1].i < Target::SRAM_ROWS(gress) && v[2].i < Target::SRAM_UNITS_PER_ROW()) { + res.emplace_back(v[0].i, v[1].i, v[2].i); + return true; + } + break; + default: + break; + } + return false; +} + +bool SRamMatchTable::parse_way(const value_t &v) { + Way way = {}; + way.lineno = v.lineno; + if (!CHECKTYPE2(v, tVEC, tMAP)) return true; // supress added message + if (v.type == tVEC) { + // DEPRECATED -- old style "raw" way for tofino1/2 + if (v.vec.size < 3 || v[0].type != tINT || v[1].type != tINT || v[2].type != tINT || + v[0].i < 0 || v[1].i < 0 || v[2].i < 0 || v[0].i >= Target::EXACT_HASH_GROUPS() || + v[1].i >= EXACT_HASH_ADR_GROUPS || v[2].i >= (1 << EXACT_HASH_SELECT_BITS)) { + return false; + } + way.group_xme = v[0].i; + way.index = v[1].i * EXACT_HASH_ADR_BITS; + way.select = bitvec(v[2].i) << EXACT_HASH_FIRST_SELECT_BIT; + for (int i = 3; i < v.vec.size; i++) { + if (!CHECKTYPE(v[i], tVEC)) return true; // supress added message + if (!parse_ram(v[i], way.rams)) error(v[i].lineno, "invalid ram in way"); + } + } else { + int index_size = 0; + for (auto &kv : MapIterChecked(v.map)) { + if ((kv.key == "group" || kv.key == "xme") && CHECKTYPE(kv.value, tINT)) { + if ((way.group_xme = kv.value.i) >= Target::IXBAR_HASH_GROUPS()) + error(kv.value.lineno, "%s %ld out of range", kv.key.s, kv.value.i); + } else if (kv.key == "index") { + if (!CHECKTYPE2(kv.value, tINT, tRANGE)) continue; + if (kv.value.type == tINT) { + way.index = kv.value.i; + } else { + way.index = kv.value.range.lo; + way.index_hi = kv.value.range.hi; + index_size = kv.value.range.hi - kv.value.range.lo + 1; + } + if (way.index > Target::IXBAR_HASH_INDEX_MAX() || + way.index % 
Target::IXBAR_HASH_INDEX_STRIDE() != 0) + error(kv.value.lineno, "invalid way index %d", way.index); + } else if (kv.key == "select") { + if (kv.value.type == tCMD && kv.value == "&") { + if (CHECKTYPE2(kv.value[1], tINT, tRANGE) && CHECKTYPE(kv.value[2], tINT)) { + way.select = bitvec(kv.value[2].i); + if (kv.value[1].type == tINT) { + way.select <<= kv.value[1].i; + } else { + way.select <<= kv.value[1].range.lo; + if (kv.value[1].range.hi < way.select.max().index()) + error(kv.value.lineno, "invalid select mask for range"); + } + } + } else if (kv.value.type == tRANGE) { + way.select.setrange(kv.value.range.lo, + kv.value.range.hi - kv.value.range.lo + 1); + } else { + error(kv.value.lineno, "invalid select %s", value_desc(&kv.value)); + } + } else if (kv.key == "rams" && CHECKTYPE(kv.value, tVEC)) { + for (auto &ram : kv.value.vec) { + if (!CHECKTYPE(ram, tVEC)) break; + if (!parse_ram(ram, way.rams)) error(ram.lineno, "invalid ram in way"); + } + } + } + if (index_size) { + // FIXME -- currently this code is assuming the index bits cover just the ram index + // bits and the subword bits, and not any select bits. Perhaps that is wrong an it + // should include the select bits. + if (way.rams.empty()) { + error(v.lineno, "no rams in way"); + } else { + way.subword_bits = index_size - (way.isLamb() ? 
LAMB_DEPTH_BITS : SRAM_DEPTH_BITS); + if (way.subword_bits < 0) error(v.lineno, "index range too small for way rams"); + } + } + } + ways.push_back(way); + return true; +} + +void SRamMatchTable::common_sram_setup(pair_t &kv, const VECTOR(pair_t) & data) { + if (kv.key == "ways") { + if (!CHECKTYPE(kv.value, tVEC)) return; + for (auto &w : kv.value.vec) + if (!parse_way(w)) error(w.lineno, "invalid way descriptor"); + } else if (kv.key == "match") { + if (kv.value.type == tVEC) { + for (auto &v : kv.value.vec) { + if (v == "hash_group") + match.emplace_back(new HashMatchSource(v)); + else + match.emplace_back(new Phv::Ref(gress, stage->stageno, v)); + } + } else { + if (kv.value == "hash_group") + match.emplace_back(new HashMatchSource(kv.value)); + else + match.emplace_back(new Phv::Ref(gress, stage->stageno, kv.value)); + } + } else if (kv.key == "match_group_map") { + mgm_lineno = kv.value.lineno; + if (CHECKTYPE(kv.value, tVEC)) { + word_info.resize(kv.value.vec.size); + for (int i = 0; i < kv.value.vec.size; i++) + if (CHECKTYPE(kv.value[i], tVEC)) { + if (kv.value[i].vec.size > 5) + error(kv.value[i].lineno, "Too many groups for word %d", i); + for (auto &v : kv.value[i].vec) + if (CHECKTYPE(v, tINT)) word_info[i].push_back(v.i); + } + } + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), name()); + } +} + +void SRamMatchTable::common_sram_checks() { + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_srams(); + else + alloc_rams(false, stage->sram_use, &stage->sram_search_bus_use); + if (layout_size() > 0 && !format) error(lineno, "No format specified in table %s", name()); + if (!action.set() && !actions) + error(lineno, "Table %s has neither action table nor immediate actions", name()); + if (actions && !action_bus) action_bus = ActionBus::create(); + if (input_xbar.empty()) input_xbar.emplace_back(InputXbar::create(this)); +} + +void SRamMatchTable::alloc_global_busses() { BUG(); } + +void SRamMatchTable::pass1() 
{ + LOG1("### SRam match table " << name() << " pass1 " << loc()); + if (format) { + verify_format(); + setup_ways(); + determine_word_and_result_bus(); + } + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_busses(); + else + alloc_busses(stage->sram_search_bus_use, Layout::SEARCH_BUS); + MatchTable::pass1(); + if (action_enable >= 0) + if (action.args.size() < 1 || action.args[0].size() <= (unsigned)action_enable) + error(lineno, "Action enable bit %d out of range for action selector", action_enable); + if (gateway) { + if (!gateway->layout.empty()) { + for (auto &row : layout) { + if (row.row == gateway->layout[0].row && row.bus == gateway->layout[0].bus && + !row.memunits.empty()) { + unsigned gw_use = gateway->input_use() & 0xff; + auto &way = way_map.at(row.memunits[0]); + for (auto &grp : group_info) { + if (gw_use & grp.tofino_mask[way.word]) { + error(gateway->lineno, + "match bus conflict between match and gateway" + " on table %s", + name()); + break; + } + } + break; + } + } + } + } +} + +void SRamMatchTable::setup_hash_function_ids() { + unsigned hash_fn_id = 0; + for (auto &w : ways) { + if (hash_fn_ids.count(w.group_xme) == 0) hash_fn_ids[w.group_xme] = hash_fn_id++; + } +} + +void SRamMatchTable::setup_ways() { + unsigned fmt_width = (format->size + 127) / 128; + if (ways.empty()) { + error(lineno, "No ways defined in table %s", name()); + } else if (ways[0].rams.empty()) { + for (auto &w : ways) + if (!w.rams.empty()) { + error(w.lineno, "Must specify rams for all ways in tabls %s, or none", name()); + return; + } + if (layout.size() % fmt_width != 0) { + error(lineno, "Rows is not a multiple of width in table %s", name()); + return; + } + for (unsigned i = 0; i < layout.size(); ++i) { + unsigned first = (i / fmt_width) * fmt_width; + if (layout[i].memunits.size() != layout[first].memunits.size()) + error(layout[i].lineno, "Row size mismatch within wide table %s", name()); + } + if (error_count > 0) return; + unsigned ridx = 0, cidx = 0; + for 
(auto &way : ways) { + if (ridx >= layout.size()) { + error(way.lineno, "Not enough rams for ways in table %s", name()); + break; + } + unsigned size = 1U << way.select.popcount(); + for (unsigned i = 0; i < size; i++) { + for (unsigned word = 0; word < fmt_width; ++word) { + BUG_CHECK(ridx + word < layout.size()); + auto &row = layout[ridx + word]; + BUG_CHECK(cidx < row.memunits.size()); + way.rams.push_back(row.memunits[cidx]); + } + if (++cidx == layout[ridx].memunits.size()) { + ridx += fmt_width; + cidx = 0; + } + } + } + if (ridx < layout.size()) + error(ways[0].lineno, "Too many rams for ways in table %s", name()); + } else { + std::set rams; + for (auto &row : layout) { + for (auto &unit : row.memunits) { + BUG_CHECK(!rams.count(unit), "%s duplicate in table", unit.desc()); + rams.insert(unit); + } + } + int way = -1; + for (auto &w : ways) { + ++way; + int index = -1; + if (table_type() != ATCAM) { + if ((w.rams.size() != (1U << w.select.popcount()) * fmt_width)) + error(w.lineno, "Depth of way doesn't match number of rams in table %s", + name()); + } else { + // Allowed to not fully match, as the partition index can be set from the + // control plane + if (!((w.rams.size() <= (1U << w.select.popcount()) * fmt_width) && + (w.rams.size() % fmt_width) == 0)) + error(w.lineno, "RAMs in ATCAM is not a legal multiple of the format width %s", + name()); + } + for (auto &ram : w.rams) { + ++index; + if (way_map.count(ram)) { + if (way == way_map.at(ram).way) + error(w.lineno, "%s used twice in way %d of table %s", ram.desc(), way, + name()); + else + error(w.lineno, "%s used ways %d and %d of table %s", ram.desc(), way, + way_map.at(ram).way, name()); + continue; + } + way_map[ram].way = way; + if (!ram.isLamb() && !rams.count(ram)) + error(w.lineno, "%s in way %d not part of table %s", ram.desc(), way, name()); + rams.erase(ram); + } + } + for (const auto &unit : rams) { + error(lineno, "%s not in any way of table %s", unit.desc(), name()); + } + } + if 
(error_count > 0) return; + int way = 0; + for (auto &w : ways) { + MaskCounter bank(w.select.getrange(EXACT_HASH_FIRST_SELECT_BIT, 32)); + unsigned index = 0, word = 0; + int col = -1; + for (auto &ram : w.rams) { + auto &wm = way_map[ram]; + wm.way = way; + wm.index = index; + wm.word = fmt_width - word - 1; + wm.bank = bank; + if (word && col != ram.col) + error(w.lineno, "Wide exact match split across columns %d and %d", col, ram.col); + col = ram.col; + ++index; + if (++word == fmt_width) { + word = 0; + bank++; + } + } + ++way; + } + setup_hash_function_ids(); +} + +/** + * Either fills out the word/result bus information each row, if it is not provided directly by + * the compiler, or verifies that the word/result_bus information matches directly with + * what has been calculated through the way information provided. + */ +void SRamMatchTable::determine_word_and_result_bus() { + for (auto &row : layout) { + int word = -1; + bool word_set = false; + for (auto &ram : row.memunits) { + auto &way = way_map.at(ram); + if (word_set) { + BUG_CHECK(word == way.word); + } else { + word = way.word; + word_set = true; + } + } + if (row.word_initialized()) { + if (word != row.word) + error(lineno, "Word on row %d bus %d does not align with word in RAM", row.row, + row.bus.at(Layout::SEARCH_BUS)); + } else { + row.word = word; + } + } + + for (auto &row : layout) { + bool result_bus_needed = false; + if (row.word < 0) { + // row with no rams -- assume it needs a result bus for the payload + result_bus_needed = true; + } else { + for (auto group_in_word : word_info.at(row.word)) { + if (group_info[group_in_word].result_bus_word == row.word) result_bus_needed = true; + } + } + if (!row.bus.count(Layout::RESULT_BUS) && result_bus_needed) + row.bus[Layout::RESULT_BUS] = row.bus.at(Layout::SEARCH_BUS); + if (row.bus.count(Layout::RESULT_BUS)) { + auto *old = stage->match_result_bus_use[row.row][row.bus.at(Layout::RESULT_BUS)]; + if (old && old != this) + error(row.lineno, + 
"inconsistent use of match result bus %d on row %d between "
+                  "table %s and %s",
+                  row.row, row.bus.at(Layout::RESULT_BUS), name(), old->name());
+            stage->match_result_bus_use[row.row][row.bus.at(Layout::RESULT_BUS)] = this;
+        }
+    }
+}
+
+int SRamMatchTable::determine_pre_byteswizzle_loc(MatchSource *ms, int lo, int hi, int word) {
+    auto phv_p = dynamic_cast<Phv::Ref *>(ms);
+    BUG_CHECK(phv_p);
+    auto phv_ref = *phv_p;
+    Phv::Slice sl(*phv_ref, lo, hi);
+    BUG_CHECK(word_ixbar_group[word] >= 0);
+    return find_on_ixbar(sl, word_ixbar_group[word]);
+}
+
+template <class REGS>
+void SRamMatchTable::write_attached_merge_regs(REGS &regs, int bus, int word, int word_group) {
+    int group = word_info[word][word_group];
+    auto &merge = regs.rams.match.merge;
+    for (auto &st : attached.stats) {
+        if (group_info[group].result_bus_word == static_cast<int>(word)) {
+            merge.mau_stats_adr_exact_shiftcount[bus][word_group] =
+                st->to<CounterTable>()->determine_shiftcount(st, group, word, 0);
+        } else if (options.match_compiler) {
+            /* unused, so should not be set... */
+            merge.mau_stats_adr_exact_shiftcount[bus][word_group] = 7;
+        }
+        break; /* all must be the same, only config once */
+    }
+    for (auto &m : attached.meters) {
+        if (group_info[group].overhead_word == static_cast<int>(word) ||
+            group_info[group].overhead_word == -1) {
+            m->to<MeterTable>()->setup_exact_shift(regs, bus, group, word, word_group, m,
+                                                   attached.meter_color);
+        } else if (options.match_compiler) {
+            /* unused, so should not be set... */
+            merge.mau_meter_adr_exact_shiftcount[bus][word_group] = 16;
+        }
+        break; /* all must be the same, only config once */
+    }
+    for (auto &s : attached.statefuls) {
+        if (group_info[group].overhead_word == static_cast<int>(word) ||
+            group_info[group].overhead_word == -1) {
+            merge.mau_meter_adr_exact_shiftcount[bus][word_group] =
+                s->to<StatefulTable>()->determine_shiftcount(s, group, word, 0);
+        } else if (options.match_compiler) {
+            /* unused, so should not be set...
*/
+            merge.mau_meter_adr_exact_shiftcount[bus][word_group] = 16;
+        }
+        break; /* all must be the same, only config once */
+    }
+}
+
+template <class REGS>
+void SRamMatchTable::write_regs_vt(REGS &regs) {
+    LOG1("### SRam match table " << name() << " write_regs " << loc());
+    MatchTable::write_regs(regs, 0, this);
+    auto &merge = regs.rams.match.merge;
+    unsigned fmt_width = format ? (format->size + 127) / 128 : 0;
+    bitvec match_mask;
+    match_mask.setrange(0, 128 * fmt_width);
+    version_nibble_mask.setrange(0, 32 * fmt_width);
+    for (unsigned i = 0; format && i < format->groups(); i++) {
+        if (Format::Field *match = format->field("match", i)) {
+            for (auto &piece : match->bits) match_mask.clrrange(piece.lo, piece.hi + 1 - piece.lo);
+        }
+        if (Format::Field *version = format->field("version", i)) {
+            match_mask.clrrange(version->bit(0), version->size);
+            version_nibble_mask.clrrange(version->bit(0) / 4, 1);
+        }
+    }
+    Format::Field *next = format ? format->field("next") : nullptr;
+    if (format && !next && hit_next.size() > 1) next = format->field("action");
+
+    /* iterating through rows in the sram array; while in this loop, 'row' is the
+     * row we're on, 'word' is which word in a wide full-way the row is for, and 'way'
+     * is which full-way of the match table the row is for.
For compatibility with the + * compiler, we iterate over rows and ways in order, and words from msb to lsb (reversed) */ + int index = -1; + for (auto &row : layout) { + index++; /* index of the row in the layout */ + int search_bus = ::get(row.bus, Layout::SEARCH_BUS, -1); + /* setup match logic in rams */ + auto &rams_row = regs.rams.array.row[row.row]; + auto &vh_adr_xbar = rams_row.vh_adr_xbar; + bool first = true; + int hash_group = -1; + unsigned word = ~0; + auto vpn_iter = row.vpns.begin(); + for (auto &memunit : row.memunits) { + int col = memunit.col; + BUG_CHECK(memunit.stage == INT_MIN && memunit.row == row.row, "bogus %s in row %d", + memunit.desc(), row.row); + auto &way = way_map.at(memunit); + if (first) { + hash_group = ways[way.way].group_xme; + word = way.word; + setup_muxctl(vh_adr_xbar.exactmatch_row_hashadr_xbar_ctl[search_bus], hash_group); + first = false; + } else if (hash_group != ways[way.way].group_xme || int(word) != way.word) { + auto first_way = way_map.at(row.memunits[0]); + error(ways[way.way].lineno, + "table %s ways #%d and #%d use the same row bus " + "(%d.%d) but different %s", + name(), first_way.way, way.way, row.row, search_bus, + int(word) == way.word ? "hash groups" : "word order"); + hash_group = ways[way.way].group_xme; + word = way.word; + } + setup_muxctl(vh_adr_xbar.exactmatch_mem_hashadr_xbar_ctl[col], + ways[way.way].index / EXACT_HASH_ADR_BITS + search_bus * 5); + if (options.match_compiler || ways[way.way].select) { + // Glass always sets this. When mask == 0, bank will also be 0, and the + // comparison will always match, so the bus need not be read (inp_sel). + // CSR suggests it should NOT be set if not needed to save power. 
+ auto &bank_enable = vh_adr_xbar.exactmatch_bank_enable[col]; + bank_enable.exactmatch_bank_enable_bank_mask = + ways[way.way].select.getrange(EXACT_HASH_FIRST_SELECT_BIT, 32); + bank_enable.exactmatch_bank_enable_bank_id = way.bank; + bank_enable.exactmatch_bank_enable_inp_sel |= 1 << search_bus; + } + auto &ram = rams_row.ram[col]; + for (unsigned i = 0; i < 4; i++) + ram.match_mask[i] = match_mask.getrange(way.word * 128U + i * 32, 32); + + if (next) { + for (int group : word_info[way.word]) { + if (group_info[group].result_bus_word != way.word) continue; + int pos = (next->by_group[group]->bit(0) % 128) - 1; + auto &n = ram.match_next_table_bitpos; + switch (group_info[group].result_bus_word_group()) { + case 0: + break; + case 1: + n.match_next_table1_bitpos = pos; + break; + case 2: + n.match_next_table2_bitpos = pos; + break; + case 3: + n.match_next_table3_bitpos = pos; + break; + case 4: + n.match_next_table4_bitpos = pos; + break; + default: + BUG(); + } + } + } + + ram.unit_ram_ctl.match_ram_logical_table = logical_id; + ram.unit_ram_ctl.match_ram_write_data_mux_select = 7; /* unused */ + ram.unit_ram_ctl.match_ram_read_data_mux_select = 7; /* unused */ + ram.unit_ram_ctl.match_ram_matchdata_bus1_sel = search_bus; + if (row.bus.count(Layout::RESULT_BUS)) + ram.unit_ram_ctl.match_result_bus_select = 1 << row.bus.at(Layout::RESULT_BUS); + if (auto cnt = word_info[way.word].size()) + ram.unit_ram_ctl.match_entry_enable = ~(~0U << cnt); + auto &unitram_config = + regs.rams.map_alu.row[row.row].adrmux.unitram_config[col / 6][col % 6]; + unitram_config.unitram_type = 1; + unitram_config.unitram_logical_table = logical_id; + switch (gress) { + case INGRESS: + case GHOST: + unitram_config.unitram_ingress = 1; + break; + case EGRESS: + unitram_config.unitram_egress = 1; + break; + default: + BUG(); + } + unitram_config.unitram_enable = 1; + + int vpn = *vpn_iter++; + std::vector vpn01; + auto groups_in_word = word_info[way.word]; + // Action format is made up of 
multiple groups (groups_in_format) which can be spread + // across multiple words. The match_group_map specifies which groups are within each + // word. For an N pack across M words if N > M, we have one or more words with multiple + // groups. + // Below code assigns VPN for each group(groups_in_word) within a word which are indexed + // separately from groups_in_format. + // E.g. + // format: { + // action(0): 0..0, immediate(0): 2..9, version(0): 112..115, match(0): 18..71, + // action(1): 1..1, immediate(1): 10..17, version(1): 116..119, + // match(1): [ 194..199, 72..111, 120..127 ], + // action(2): 128..128, immediate(2): 129..136, version(2): 240..243, + // match(2): 138..191, + // action(3): 256..256, immediate(3): 257..264, version(3): 368..371, + // match(3): 266..319, + // action(4): 384..384, immediate(4): 385..392, version(4): 496..499, + // match(4): 394..447 } + // match_group_map: [ [ 1, 0 ], [ 1, 2], [ 3 ], [ 4 ] ] + // ^ ^ + // } + // In the above example the "format" specifies the 5 groups packed across 4 RAMs.These + // are the groups_in_format + // The "match_group_map" specifies the groups within each word. + // Group 1 is spread across word 0 & word 1. + // Within word 0 - group 1 is group_in_word 0 and group 0 is group_in_word 1 + // Within word 1 - group 1 is group_in_word 0 and group 2 is group_in_word 1 + // This distinction is used while specifying the config register in setting the subfield + // on match_ram_vpn_lsbs. + for (auto group_in_word = 0; group_in_word < groups_in_word.size(); group_in_word++) { + auto group_in_format = groups_in_word[group_in_word]; + int overhead_word = group_info[group_in_format].overhead_word; + int group_vpn = vpn + group_info[group_in_format].vpn_offset; + bool ok = false; + for (unsigned i = 0; i < vpn01.size(); ++i) { + if (vpn01[i] == group_vpn >> 2) { + ok = true; + group_vpn = (group_vpn & 3) + (i << 2); + break; + } + } + if (!ok) { + if (vpn01.size() >= 2) { + error(mgm_lineno > 0 ? 
mgm_lineno : lineno, + "Too many diverse vpns in table layout for %s", name()); + break; + } + vpn01.push_back(group_vpn >> 2); + group_vpn &= 3; + if (vpn01.size() == 1) { + ram.match_ram_vpn.match_ram_vpn0 = vpn01.back(); + } else { + ram.match_ram_vpn.match_ram_vpn1 = vpn01.back(); + group_vpn |= 4; + } + } + ram.match_ram_vpn.match_ram_vpn_lsbs.set_subfield(group_vpn, group_in_word * 3, 3); + } + + int word_group = 0; + for (int group : word_info[way.word]) { + unsigned mask = group_info[group].tofino_mask[way.word]; + ram.match_bytemask[word_group].mask_bytes_0_to_13 = ~mask & 0x3fff; + ram.match_bytemask[word_group].mask_nibbles_28_to_31 = ~(mask >> 14) & 0xf; + word_group++; + } + for (; word_group < 5; word_group++) { + ram.match_bytemask[word_group].mask_bytes_0_to_13 = 0x3fff; + ram.match_bytemask[word_group].mask_nibbles_28_to_31 = 0xf; + } + if (gress == EGRESS) + regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row.row); + rams_row.emm_ecc_error_uram_ctl[timing_thread(gress)] |= 1U << (col - 2); + } + /* setup input xbars to get data to the right places on the bus(es) */ + bool using_match = false; + // Loop for determining the config to indicate which bytes from the search bus + // are compared to the bytes on the RAM line + if (!row.memunits.empty()) { + auto &byteswizzle_ctl = rams_row.exactmatch_row_vh_xbar_byteswizzle_ctl[search_bus]; + for (unsigned i = 0; format && i < format->groups(); i++) { + if (Format::Field *match = format->field("match", i)) { + unsigned bit = 0; + for (auto &piece : match->bits) { + if (piece.lo / 128U != word) { + bit += piece.size(); + continue; + } + using_match = true; + for (unsigned fmt_bit = piece.lo; fmt_bit <= piece.hi;) { + unsigned byte = (fmt_bit % 128) / 8; + unsigned bits_in_byte = (byte + 1) * 8 - (fmt_bit % 128); + if (fmt_bit + bits_in_byte > piece.hi + 1) + bits_in_byte = piece.hi + 1 - fmt_bit; + auto it = --match_by_bit.upper_bound(bit); + int lo = bit - it->first; + int hi = lo + 
bits_in_byte - 1; + int bus_loc = determine_pre_byteswizzle_loc(it->second, lo, hi, word); + BUG_CHECK(bus_loc >= 0 && bus_loc < 16); + for (unsigned b = 0; b < bits_in_byte; b++, fmt_bit++) + byteswizzle_ctl[byte][fmt_bit % 8U] = 0x10 + bus_loc; + bit += bits_in_byte; + } + } + BUG_CHECK(bit == match->size); + } + if (Format::Field *version = format->field("version", i)) { + if (version->bit(0) / 128U != word) continue; + ///> if no match, but a version/valid is, the vh_xbar needs to be + ///> enabled. This was preventing anything from running + using_match = true; + for (unsigned j = 0; j < version->size; ++j) { + unsigned bit = version->bit(j); + unsigned byte = (bit % 128) / 8; + byteswizzle_ctl[byte][bit % 8U] = 8; + } + } + } + if (using_match) { + auto &vh_xbar_ctl = rams_row.vh_xbar[search_bus].exactmatch_row_vh_xbar_ctl; + if (word_ixbar_group[word] >= 0) { + setup_muxctl(vh_xbar_ctl, word_ixbar_group[word]); + } else { + // Need the bus for version/valid, but don't care what other data is on it. So + // just set the enable without actually selecting an input -- if another table + // is sharing the bus, it will set it, otherwise we'll get ixbar group 0 + vh_xbar_ctl.exactmatch_row_vh_xbar_enable = 1; + } + vh_xbar_ctl.exactmatch_row_vh_xbar_thread = timing_thread(gress); + } + } + /* setup match central config to extract results of the match */ + ssize_t r_bus = -1; + if (row.bus.count(Layout::RESULT_BUS)) r_bus = row.row * 2 + row.bus.at(Layout::RESULT_BUS); + // If the result bus is not to be used, then the registers are not necessary to set up + // for shift/mask/default etc. + /* FIXME -- factor this where possible with ternary match code */ + if (action) { + if (auto adt = action->to()) { + if (r_bus >= 0) { + /* FIXME -- support for multiple sizes of action data? 
*/ + merge.mau_actiondata_adr_mask[0][r_bus] = adt->determine_mask(action); + merge.mau_actiondata_adr_vpn_shiftcount[0][r_bus] = + adt->determine_vpn_shiftcount(action); + } + } + } + + if (format && word < word_info.size()) { + for (unsigned word_group = 0; word_group < word_info[word].size(); word_group++) { + int group = word_info[word][word_group]; + if (group_info[group].result_bus_word == static_cast(word)) { + BUG_CHECK(r_bus >= 0); + if (format->immed) { + BUG_CHECK(format->immed->by_group[group]->bit(0) / 128U == word); + merge.mau_immediate_data_exact_shiftcount[r_bus][word_group] = + format->immed->by_group[group]->bit(0) % 128; + } + if (instruction) { + int shiftcount = 0; + if (auto field = instruction.args[0].field()) { + assert(field->by_group[group]->bit(0) / 128U == word); + shiftcount = field->by_group[group]->bit(0) % 128U; + } else if (auto field = instruction.args[1].field()) { + assert(field->by_group[group]->bit(0) / 128U == word); + shiftcount = field->by_group[group]->bit(0) % 128U; + } + merge.mau_action_instruction_adr_exact_shiftcount[r_bus][word_group] = + shiftcount; + } + } + /* FIXME -- factor this where possible with ternary match code */ + if (action) { + if (group_info[group].result_bus_word == static_cast(word)) { + BUG_CHECK(r_bus >= 0); + merge.mau_actiondata_adr_exact_shiftcount[r_bus][word_group] = + action->determine_shiftcount(action, group, word, 0); + } + } + if (attached.selector) { + if (group_info[group].result_bus_word == static_cast(word)) { + BUG_CHECK(r_bus >= 0); + auto sel = get_selector(); + merge.mau_meter_adr_exact_shiftcount[r_bus][word_group] = + sel->determine_shiftcount(attached.selector, group, word, 0); + merge.mau_selectorlength_shiftcount[0][r_bus] = + sel->determine_length_shiftcount(attached.selector_length, group, word); + merge.mau_selectorlength_mask[0][r_bus] = + sel->determine_length_mask(attached.selector_length); + merge.mau_selectorlength_default[0][r_bus] = + 
sel->determine_length_default(attached.selector_length); + } + } + if (idletime) { + if (group_info[group].result_bus_word == static_cast(word)) { + BUG_CHECK(r_bus >= 0); + merge.mau_idletime_adr_exact_shiftcount[r_bus][word_group] = + idletime->direct_shiftcount(); + } + } + if (r_bus >= 0) write_attached_merge_regs(regs, r_bus, word, word_group); + } + } else if (format) { + // If we have a result bus without any attached memories, program + // the registers on this row because a subset of the registers have been + // programmed elsewhere and it can break things if we have a partial configuration. + // FIXME: avoid programming any registers if we don't actually use the result bus. + if (r_bus >= 0) write_attached_merge_regs(regs, r_bus, 0, 0); + } + for (auto &ram : row.memunits) { + int word_group = 0; + auto &merge_col = merge.col[ram.col]; + for (int group : word_info[word]) { + int result_bus_word = group_info[group].result_bus_word; + if (int(word) == result_bus_word) { + BUG_CHECK(r_bus >= 0); + merge_col.row_action_nxtable_bus_drive[row.row] |= 1 << (r_bus % 2); + } + if (word_group < 2) { + auto &way = way_map.at(ram); + int idx = way.index + word - result_bus_word; + int overhead_row = ways[way.way].rams[idx].row; + auto &hitmap_ixbar = merge_col.hitmap_output_map[2 * row.row + word_group]; + setup_muxctl(hitmap_ixbar, + overhead_row * 2 + group_info[group].result_bus_word_group()); + } + ++word_group; + } + // setup_muxctl(merge.col[ram.col].hitmap_output_map[bus], + // layout[index+word].row*2 + layout[index+word].bus); + } + // if (gress == EGRESS) + // merge.exact_match_delay_config.exact_match_bus_thread |= 1 << bus; + if (r_bus >= 0) { + merge.exact_match_phys_result_en[r_bus / 8U] |= 1U << (r_bus % 8U); + merge.exact_match_phys_result_thread[r_bus / 8U] |= timing_thread(gress) + << (r_bus % 8U); + if (stage->tcam_delay(gress)) + merge.exact_match_phys_result_delay[r_bus / 8U] |= 1U << (r_bus % 8U); + } + } + + merge.exact_match_logical_result_en 
|= 1 << logical_id; + if (stage->tcam_delay(gress) > 0) merge.exact_match_logical_result_delay |= 1 << logical_id; + if (actions) actions->write_regs(regs, this); + if (gateway) gateway->write_regs(regs); + if (idletime) idletime->write_regs(regs); + for (auto &hd : hash_dist) hd.write_regs(regs, this); +} +FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void SRamMatchTable::write_regs, (mau_regs & regs), + { write_regs_vt(regs); }) + +std::string SRamMatchTable::get_match_mode(const Phv::Ref &pref, int offset) const { + return "unused"; +} + +void SRamMatchTable::add_field_to_pack_format(json::vector &field_list, unsigned basebit, + std::string name, const Table::Format::Field &field, + const Table::Actions::Action *act) const { + if (name != "match") { + // FIXME -- tofino always pads out the wordsize so basebit is always 0. + basebit = 0; + Table::add_field_to_pack_format(field_list, basebit, name, field, act); + return; + } + LOG3("Adding fields for " << name << " - " << field << " to pack format for SRAM table " + << this->name() << " in action : " << act); + unsigned bit = 0; + for (auto &piece : field.bits) { + auto mw = --match_by_bit.upper_bound(bit); + int lo = bit - mw->first; + int lsb_mem_word_idx = piece.lo / MEM_WORD_WIDTH; + int msb_mem_word_idx = piece.hi / MEM_WORD_WIDTH; + int offset = piece.lo % MEM_WORD_WIDTH; + while (mw != match_by_bit.end() && mw->first < bit + piece.size()) { + std::string source = ""; + std::string immediate_name = ""; + std::string mw_name = mw->second->name(); + int start_bit = 0; + + get_cjson_source(mw_name, source, start_bit); + if (source == "") + error(lineno, "Cannot determine proper source for field %s", name.c_str()); + std::string field_name, global_name = ""; + std::string match_mode; + if (auto phv_p = dynamic_cast(mw->second)) { + field_name = mw->second->name(); + // If the name has a slice in it, remove it and add the lo bit of + // the slice to field_bit. 
This takes the place of + // canon_field_list(), rather than extracting the slice component + // of the field name, if present, and appending it to the key name. + int slice_offset = remove_name_tail_range(field_name); + start_bit = lo + slice_offset + mw->second->fieldlobit(); + global_name = field_name; + auto p = find_p4_param(field_name, "", start_bit); + if (!p && !p4_params_list.empty()) { + warning(lineno, + "Cannot find field name %s in p4_param_order " + "for table %s", + field_name.c_str(), this->name()); + } else if (p && !p->key_name.empty()) { + field_name = p->key_name; + } + match_mode = get_match_mode(*phv_p, mw->first); + } else if (dynamic_cast(mw->second)) { + field_name = "--proxy_hash--"; + match_mode = "unused"; + start_bit = mw->second->fieldlobit(); + } else { + BUG(); + } + + field_list.push_back(json::map{{"field_name", json::string(field_name)}, + {"global_name", json::string(global_name)}, + {"source", json::string(source)}, + {"lsb_mem_word_offset", json::number(offset)}, + {"start_bit", json::number(start_bit)}, + {"immediate_name", json::string(immediate_name)}, + {"lsb_mem_word_idx", json::number(lsb_mem_word_idx)}, + {"msb_mem_word_idx", json::number(msb_mem_word_idx)}, + // FIXME-JSON + {"match_mode", json::string(match_mode)}, + {"enable_pfe", json::False()}, // FIXME-JSON + {"field_width", json::number(mw->second->size())}}); + LOG5("Adding json field " << field_list.back()); + offset += mw->second->size(); + lo = 0; + ++mw; + } + bit += piece.size(); + } +} + +void SRamMatchTable::add_action_cfgs(json::map &tbl, json::map &stage_tbl) const { + if (actions) { + actions->gen_tbl_cfg(tbl["actions"]); + actions->add_action_format(this, stage_tbl); + } else if (action && action->actions) { + action->actions->gen_tbl_cfg(tbl["actions"]); + action->actions->add_action_format(this, stage_tbl); + } +} + +unsigned SRamMatchTable::get_format_width() const { + return format ? 
(format->size + 127) / 128 : 0; +} + +unsigned SRamMatchTable::get_number_entries() const { + unsigned fmt_width = get_format_width(); + unsigned number_entries = 0; + if (format) number_entries = layout_size() / fmt_width * format->groups() * entry_ram_depth(); + return number_entries; +} + +json::map *SRamMatchTable::add_common_sram_tbl_cfgs(json::map &tbl, std::string match_type, + std::string stage_table_type) const { + common_tbl_cfg(tbl); + json::map &match_attributes = tbl["match_attributes"]; + json::vector &stage_tables = match_attributes["stage_tables"]; + json::map *stage_tbl_ptr = + add_stage_tbl_cfg(match_attributes, stage_table_type.c_str(), get_number_entries()); + json::map &stage_tbl = *stage_tbl_ptr; + // This is a only a glass required field, as it is only required when no default action + // is specified, which is impossible for Brig through p4-16 + stage_tbl["default_next_table"] = Stage::end_of_pipe(); + match_attributes["match_type"] = match_type; + add_hash_functions(stage_tbl); + add_action_cfgs(tbl, stage_tbl); + add_result_physical_buses(stage_tbl); + MatchTable::gen_idletime_tbl_cfg(stage_tbl); + merge_context_json(tbl, stage_tbl); + add_all_reference_tables(tbl); + return stage_tbl_ptr; +} + +int SRamMatchTable::find_problematic_vpn_offset() const { + // Any single word of a match that contains 3 or more groups whose min and max vpn_offset + // differs by more than 5 is going to be a problem. 
We need to permute the offsets so that + // does not happen + if (group_info.size() <= 6) return -1; // can't differ by more than 5 + for (auto &word : word_info) { + if (word.size() <= 2) continue; // can't be a problem + int minvpn = -1, maxvpn = -1, avg = 0; + for (auto group : word) { + int vpn_offset = group_info[group].vpn_offset; + if (minvpn < 0) + minvpn = maxvpn = vpn_offset; + else if (minvpn > vpn_offset) + minvpn = vpn_offset; + else if (maxvpn < vpn_offset) + maxvpn = vpn_offset; + avg += vpn_offset; + } + if (maxvpn - minvpn > 5) { + if (minvpn + maxvpn > (2 * avg) / word.size()) + minvpn = maxvpn; // look for the max to move, instead of min + for (auto group : word) { + if (group_info[group].vpn_offset == minvpn) return group; + } + BUG("failed to find the group vpn we just saw"); + } + } + return -1; // no problem found +} + +void SRamMatchTable::alloc_vpns() { + if (error_count > 0 || no_vpns || layout_size() == 0 || layout[0].vpns.size() > 0) return; + int period, width, depth; + const char *period_name; + vpn_params(width, depth, period, period_name); + std::map vpn_for; + for (auto &row : layout) { + row.vpns.resize(row.memunits.size()); + int i = 0; + for (auto &ram : row.memunits) vpn_for[ram] = &row.vpns[i++]; + } + int vpn = 0, word = 0; + for (auto &way : ways) { + for (auto unit : way.rams) { + *vpn_for[unit] = vpn; + if (++word == width) { + word = 0; + vpn += period; + } + } + } + + int fix = find_problematic_vpn_offset(); + if (fix >= 0) { + // Swap it with the middle one. 
That should fix all the cases we've seen + int middle = group_info.size() / 2; + BUG_CHECK(middle != fix, "vpn_offset fix doesn't work"); + std::swap(group_info[fix].vpn_offset, group_info[middle].vpn_offset); + BUG_CHECK(find_problematic_vpn_offset() < 0, "vpn_offset fix did not work"); + } +} diff --git a/backends/tofino/bf-asm/stage.cpp b/backends/tofino/bf-asm/stage.cpp new file mode 100644 index 00000000000..2e80360f1b7 --- /dev/null +++ b/backends/tofino/bf-asm/stage.cpp @@ -0,0 +1,857 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" + +#include + +#include + +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/target.h" +#include "deparser.h" +#include "input_xbar.h" +#include "lib/range.h" +#include "misc.h" +#include "parser.h" +#include "phv.h" +#include "sections.h" +#include "top_level.h" + +extern std::string asmfile_name; + +unsigned char Stage::action_bus_slot_map[ACTION_DATA_BUS_BYTES]; +unsigned char Stage::action_bus_slot_size[ACTION_DATA_BUS_SLOTS]; + +AsmStage AsmStage::singleton_object; + +#include "jbay/stage.cpp" // NOLINT(build/include) +#include "tofino/stage.cpp" // NOLINT(build/include) + +AsmStage::AsmStage() : Section("stage") { + int slot = 0, byte = 0; + for (int i = 0; i < ACTION_DATA_8B_SLOTS; i++) { + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_size[slot++] = 8; + } + for (int i = 0; i < ACTION_DATA_16B_SLOTS; i++) { + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_size[slot++] = 16; + } + for (int i = 0; i < ACTION_DATA_32B_SLOTS; i++) { + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_map[byte++] = slot; + Stage::action_bus_slot_size[slot++] = 32; + } + BUG_CHECK(byte == ACTION_DATA_BUS_BYTES); + BUG_CHECK(slot == ACTION_DATA_BUS_SLOTS); +} + +void AsmStage::start(int lineno, VECTOR(value_t) args) { + while (int(pipe.size()) < Target::NUM_MAU_STAGES()) pipe.emplace_back(pipe.size(), false); + if (args.size != 2 || args[0].type != tINT || + (args[1] != "ingress" && args[1] != "egress" && + (args[1] != "ghost" || options.target < JBAY))) { + error(lineno, "stage must specify number and ingress%s or egress", + options.target >= JBAY ? 
", ghost" : ""); + } else if (args[0].i < 0) { + error(lineno, "invalid stage number"); + } else if ((unsigned)args[0].i >= pipe.size()) { + while ((unsigned)args[0].i >= pipe.size()) pipe.emplace_back(pipe.size(), false); + } +} + +void AsmStage::input(VECTOR(value_t) args, value_t data) { + if (!CHECKTYPE(data, tMAP)) return; + int stageno = args[0].i; + gress_t gress = + args[1] == "ingress" ? INGRESS + : args[1] == "egress" ? EGRESS + : args[1] == "ghost" && options.target >= JBAY + ? GHOST + : (error(args[1].lineno, "Invalid thread %s", value_desc(args[1])), INGRESS); + auto &stage = stages(gress); + BUG_CHECK(stageno >= 0 && (unsigned)stageno < stage.size()); + if (stages_seen[gress][stageno]) + error(args[0].lineno, "Duplicate stage %d %s", stageno, to_string(gress).c_str()); + stages_seen[gress][stageno] = 1; + for (auto &kv : MapIterChecked(data.map, true)) { + if (kv.key == "dependency") { + if (stageno == 0) warning(kv.key.lineno, "Stage dependency in stage 0 will be ignored"); + if (gress == GHOST) { + error(kv.key.lineno, + "Can't specify dependency in ghost thread; it is " + "locked to ingress"); + } else if (kv.value == "concurrent") { + stage[stageno].stage_dep[gress] = Stage::CONCURRENT; + if (stageno == Target::NUM_MAU_STAGES() / 2 && options.target == TOFINO) + error(kv.value.lineno, "stage %d must be match dependent", stageno); + else if (!Target::SUPPORT_CONCURRENT_STAGE_DEP()) + error(kv.value.lineno, "no concurrent execution on %s", Target::name()); + } else if (kv.value == "action") { + stage[stageno].stage_dep[gress] = Stage::ACTION_DEP; + if (stageno == Target::NUM_MAU_STAGES() / 2 && options.target == TOFINO) + error(kv.value.lineno, "stage %d must be match dependent", stageno); + } else if (kv.value == "match") { + stage[stageno].stage_dep[gress] = Stage::MATCH_DEP; + } else { + error(kv.value.lineno, "Invalid stage dependency %s", value_desc(kv.value)); + } + continue; + + } else if (kv.key == "mpr_stage_id") { + 
stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if CHECKTYPE (kv.value, tINT) { + if (kv.value.i > stageno) + error(kv.value.lineno, + "mpr_stage_id value cannot be greater than current stage."); + stage[stageno].mpr_stage_id[gress] = kv.value.i; + + /* Intermediate stage must carry the mpr glob_exec and long_branch bitmap. + * If they have been left off by the compiler, we need to propagate the bits; + * if the compiler has provided them, we assume it did so correctly + * DANGER -- this assumes the stages appear in the .bfa file in order (at + * least for each gress) + */ + if (kv.value.i != stageno) { + for (int inter_stage = kv.value.i + 1; inter_stage < stageno; inter_stage++) { + if (!stages_seen[gress][inter_stage]) { + stage[inter_stage].mpr_bus_dep_glob_exec[gress] |= + stage[kv.value.i].mpr_bus_dep_glob_exec[gress]; + stage[inter_stage].mpr_bus_dep_long_branch[gress] |= + stage[kv.value.i].mpr_bus_dep_long_branch[gress]; + } + } + } + } + continue; + } else if (kv.key == "mpr_always_run") { + stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if CHECKTYPE (kv.value, tINT) { + stage[stageno].mpr_always_run |= kv.value.i; + } + continue; + } else if (kv.key == "mpr_bus_dep_glob_exec") { + stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if CHECKTYPE (kv.value, tINT) { + stage[stageno].mpr_bus_dep_glob_exec[gress] = kv.value.i; + } + continue; + } else if (kv.key == "mpr_bus_dep_long_brch") { + stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if CHECKTYPE (kv.value, tINT) { + stage[stageno].mpr_bus_dep_long_branch[gress] = kv.value.i; + } + continue; + } else if (kv.key == "mpr_next_table_lut") { + stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &lut : kv.value.map) { + if (!CHECKTYPE(lut.key, tINT) || lut.key.i >= LOGICAL_TABLES_PER_STAGE) + error(lut.key.lineno, "Invalid mpr_next_table_lut key."); + if (!CHECKTYPE(lut.value, tINT) || + lut.value.i >= (1 << 
LOGICAL_TABLES_PER_STAGE)) + error(lut.value.lineno, "Invalid mpr_next_table_lut value."); + stage[stageno].mpr_next_table_lut[gress][lut.key.i] = lut.value.i; + } + } + continue; + } else if (kv.key == "mpr_glob_exec_lut") { + stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &lut : kv.value.map) { + if (!CHECKTYPE(lut.key, tINT) || lut.key.i >= LOGICAL_TABLES_PER_STAGE) + error(lut.key.lineno, "Invalid mpr_glob_exec_lut key."); + if (!CHECKTYPE(lut.value, tINT) || + lut.value.i >= (1 << LOGICAL_TABLES_PER_STAGE)) + error(lut.value.lineno, "Invalid mpr_glob_exec_lut value."); + stage[stageno].mpr_glob_exec_lut[lut.key.i] |= lut.value.i; + } + } + continue; + } else if (kv.key == "mpr_long_brch_lut") { + stage[stageno].verify_have_mpr(kv.key.s, kv.key.lineno); + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &lut : kv.value.map) { + if (!CHECKTYPE(lut.key, tINT) || lut.key.i >= MAX_LONGBRANCH_TAGS) + error(lut.key.lineno, "Invalid mpr_long_brch_lut key."); + if (!CHECKTYPE(lut.value, tINT) || + lut.value.i >= (1 << LOGICAL_TABLES_PER_STAGE)) + error(lut.value.lineno, "Invalid mpr_long_brch_lut value."); + stage[stageno].mpr_long_brch_lut[lut.key.i] |= lut.value.i; + } + } + continue; + } else if (kv.key == "error_mode") { + if (gress == GHOST) + error(kv.key.lineno, "Can't specify error mode in ghost thread"); + else + stage[stageno].error_mode[gress].input(kv.value); + continue; + } else if (Target::SUPPORT_ALWAYS_RUN() && kv.key == "always_run_action") { + if (gress == GHOST) + error(kv.key.lineno, "No always run action for ghost thread, must use ingress"); + else + stage[stageno].tables.push_back(new AlwaysRunTable(gress, &stage[stageno], kv)); + continue; + } + if (!CHECKTYPEM(kv.key, tCMD, "table declaration")) continue; + if (!CHECKTYPE(kv.value, tMAP)) continue; + auto tt = Table::Type::get(kv.key[0].s); + if (!tt) { + error(kv.key[0].lineno, "Unknown table type '%s'", kv.key[0].s); + continue; + } + if 
(kv.key.vec.size < 2) { + error(kv.key.lineno, "Need table name"); + continue; + } + if (!CHECKTYPE(kv.key[1], tSTR)) continue; + if (kv.key.vec.size > 2 && !CHECKTYPE(kv.key[2], tINT)) continue; + if (kv.key.vec.size > 3) warning(kv.key[3].lineno, "Ignoring extra stuff after table"); + if (auto old = ::get(Table::all, kv.key[1].s)) { + error(kv.key[1].lineno, "Table %s already defined", kv.key[1].s); + warning(old->lineno, "previously defined here"); + continue; + } + if (Table *table = tt->create(kv.key.lineno, kv.key[1].s, gress, &stage[stageno], + kv.key.vec.size > 2 ? kv.key[2].i : -1, kv.value.map)) { + stage[stageno].tables.push_back(table); + } + } +} + +void AsmStage::process() { + for (auto &stage : pipe) { + stage.pass1_logical_id = stage.pass1_tcam_id = -1; + for (auto table : stage.tables) table->pass0(); + } + for (auto &stage : pipe) { + for (auto table : stage.tables) table->pass1(); + if (options.target == TOFINO) { + if (&stage - &pipe[0] == Target::NUM_MAU_STAGES() / 2) { + /* to turn the corner, the middle stage must always be match dependent */ + for (gress_t gress : Range(INGRESS, EGRESS)) + stage.stage_dep[gress] = Stage::MATCH_DEP; + } + } + if (options.match_compiler || 1) { + /* FIXME -- do we really want to do this? 
In theory different stages could + * FIXME -- use the same PHV slots differently, but the compiler always uses them + * FIXME -- consistently, so we need this to get bit-identical results + * FIXME -- we also don't correctly determine liveness, so need this */ + for (gress_t gress : Range(INGRESS, GHOST)) { + Phv::setuse(gress, stage.match_use[gress]); + Phv::setuse(gress, stage.action_use[gress]); + Phv::setuse(gress, stage.action_set[gress]); + } + } + } + for (auto &stage : pipe) { + for (auto table : stage.tables) table->pass2(); + std::sort(stage.tables.begin(), stage.tables.end(), + [](Table *a, Table *b) { return a->logical_id < b->logical_id; }); + } + for (auto &stage : pipe) { + for (auto table : stage.tables) table->pass3(); + } +} + +void AsmStage::output(json::map &ctxt_json) { + if (int(pipe.size()) > Target::NUM_MAU_STAGES()) { + auto lineno = pipe.back().tables.empty() ? 0 : pipe.back().tables[0]->lineno; + error(lineno, "%s supports up to %d stages, using %zd", Target::name(), + Target::NUM_MAU_STAGES(), pipe.size()); + } + + // If we encounter errors, no binary is generated, however we still proceed + // to generate the context.json with whatever info is provided in the .bfa. + // This can be inspected in p4i for debugging. 
+ if (error_count > 0) { + options.binary = NO_BINARY; + error(0, "Due to errors, no binary will be generated"); + } + if (pipe.empty()) return; + + /* Allow to set any stage as match dependent based on a pattern - Should never be used for + * normal compilation */ + if (options.target != TOFINO && !options.stage_dependency_pattern.empty()) { + for (gress_t gress : Range(INGRESS, EGRESS)) { + auto &stage = stages(gress); + unsigned i = 0; + for (auto ch : options.stage_dependency_pattern) { + if (ch == '1') { + LOG1("explicitly setting stage " << i << " " << gress + << " as match dependent on previous stage"); + stage[i].stage_dep[gress] = Stage::MATCH_DEP; + } + if (++i >= stage.size()) break; + } + } + } + + for (gress_t gress : Range(INGRESS, EGRESS)) { + auto &stage = stages(gress); + bitvec set_regs = stage[0].action_set[gress]; + for (unsigned i = 1; i < stage.size(); i++) { + if (!stage[i].stage_dep[gress]) { + if (stage[i].match_use[gress].intersects(set_regs)) { + LOG1("stage " << i << " " << gress << " is match dependent on previous stage"); + stage[i].stage_dep[gress] = Stage::MATCH_DEP; + } else if (stage[i].action_use[gress].intersects(set_regs)) { + LOG1("stage " << i << " " << gress << " is action dependent on previous stage"); + stage[i].stage_dep[gress] = Stage::ACTION_DEP; + } else { + LOG1("stage " << i << " " << gress << " is concurrent with previous stage"); + if (!Target::SUPPORT_CONCURRENT_STAGE_DEP()) + stage[i].stage_dep[gress] = Stage::ACTION_DEP; + else + stage[i].stage_dep[gress] = Stage::CONCURRENT; + } + } + if (stage[i].stage_dep[gress] == Stage::MATCH_DEP) + set_regs = stage[i].action_set[gress]; + else + set_regs |= stage[i].action_set[gress]; + } + } + + // Propagate group_table_use so we can estimate latencies. + propagate_group_table_use(); + + // In Tofino, add match-dependent stages if latency is not the minimum + // egress latency. 
There is no such requirement for JBAY - COMPILER-757 + if (options.target == TOFINO) { + // Compute Egress Latency + auto total_cycles = compute_latency(EGRESS); + if (!options.disable_egress_latency_padding) { + // Get non match dependent stages + bitvec non_match_dep; + for (unsigned i = 1; i < pipe.size(); i++) { + auto stage_dep = pipe[i].stage_dep[EGRESS]; + if (stage_dep != Stage::MATCH_DEP) non_match_dep.setbit(i); + } + // Add match-dependent stages and re-evaluate latency + while (total_cycles < Target::Tofino::MINIMUM_REQUIRED_EGRESS_PIPELINE_LATENCY) { + if (non_match_dep == bitvec(0)) break; + auto non_match_dep_stage = non_match_dep.min().index(); + pipe[non_match_dep_stage].stage_dep[EGRESS] = Stage::MATCH_DEP; + LOG3("Converting egress stage " + << non_match_dep_stage + << " to match dependent to meet minimum egress pipeline latency requirement"); + non_match_dep.clrbit(non_match_dep_stage); + total_cycles = compute_latency(EGRESS); + } + } else { + if (total_cycles < Target::Tofino::MINIMUM_REQUIRED_EGRESS_PIPELINE_LATENCY) { + warning(0, + "User disabled adding latency to the egress MAU pipeline " + "to meet its minimum requirements. This may result in under " + "run in certain port speed configurations."); + } + } + } + + // Re-propagate group_table_use to account for any stages that may now be match dependent. 
+ propagate_group_table_use(); + + for (auto &stage : pipe) SWITCH_FOREACH_TARGET(options.target, stage.output(ctxt_json);) + + if (options.log_hashes) { + std::ofstream hash_out; + std::string fname = options.output_dir + "/logs/mau.hashes.log"; + hash_out.open(fname.c_str()); + if (hash_out) { + for (auto &stage : pipe) stage.log_hashes(hash_out); + hash_out.close(); + } + } +} + +void AsmStage::propagate_group_table_use() { + for (gress_t gress : Range(INGRESS, EGRESS)) { + auto &stage = stages(gress); + stage[0].group_table_use[gress] = stage[0].table_use[gress]; + for (unsigned i = 1; i < stage.size(); i++) { + stage[i].group_table_use[gress] = stage[i].table_use[gress]; + if (stage[i].stage_dep[gress] != Stage::MATCH_DEP) + stage[i].group_table_use[gress] |= stage[i - 1].group_table_use[gress]; + } + for (int i = stage.size() - 1; i > 0; i--) + if (stage[i].stage_dep[gress] != Stage::MATCH_DEP) + stage[i - 1].group_table_use[gress] |= stage[i].group_table_use[gress]; + } +} + +unsigned AsmStage::compute_latency(gress_t gress) { + // FIXME -- this is Tofino1 only, so should be in target specific code somewhere + auto total_cycles = 4; // There are 4 extra cycles between stages 5 & 6 of the MAU + for (unsigned i = 1; i < pipe.size(); i++) { + auto stage_dep = pipe[i].stage_dep[gress]; + auto contribute = 0; + if (stage_dep == Stage::MATCH_DEP) { + contribute = pipe[i].pipelength(gress); + } else if (stage_dep == Stage::ACTION_DEP) { + contribute = 2; + } else if (stage_dep == Stage::CONCURRENT) { + contribute = 1; + } + total_cycles += contribute; + } + return total_cycles; +} + +static FakeTable invalid_rams("RAMS NOT PRESENT"); + +std::map> Stage_data::teop = { + {0, {false, INT_MAX}}, {1, {false, INT_MAX}}, {2, {false, INT_MAX}}, {3, {false, INT_MAX}}}; + +Stage::Stage(int stage, bool egress_only) : Stage_data(stage, egress_only) { + static_assert(sizeof(Stage_data) == sizeof(Stage), + "All non-static Stage fields must be in Stage_data"); + table_use[0] = 
table_use[1] = NONE; + stage_dep[0] = stage_dep[1] = NONE; + error_mode[0] = error_mode[1] = DefaultErrorMode::get(); + for (int i = 0; i < Target::SRAM_ROWS(egress_only ? EGRESS : INGRESS); i++) + for (int j = 0; j < Target::SRAM_REMOVED_COLUMNS(); j++) sram_use[i][j] = &invalid_rams; +} + +Stage::~Stage() { + for (auto *ref : all_refs) *ref = nullptr; +} + +int Stage::first_table(gress_t gress) { + for (auto &st : AsmStage::stages(gress)) { + int min_logical_id = INT_MAX; + for (auto tbl : st.tables) { + if (tbl->gress != gress) continue; + if (tbl->logical_id < 0) continue; // ignore phase 0 + if (tbl->logical_id < min_logical_id) min_logical_id = tbl->logical_id; + } + if (min_logical_id != INT_MAX) { + BUG_CHECK((min_logical_id & ~0xf) == 0); + return (st.stageno << 4) + min_logical_id; + } + } + return -1; +} + +Stage *Stage::stage(gress_t gress, int stageno) { + if (stageno < 0 || stageno >= AsmStage::stages(gress).size()) return nullptr; + return &AsmStage::stages(gress).at(stageno); +} + +Stage::Stage(Stage &&a) : Stage_data(std::move(a)) { + for (auto *ref : all_refs) *ref = this; +} + +bitvec Stage::imem_use_all() const { + bitvec rv; + for (auto &u : imem_use) rv |= u; + return rv; +} + +int Stage::tcam_delay(gress_t gress) const { + if (group_table_use[timing_thread(gress)] & Stage::USE_TCAM) return 2; + if (group_table_use[timing_thread(gress)] & Stage::USE_WIDE_SELECTOR) return 2; + return 0; +} + +int Stage::adr_dist_delay(gress_t gress) const { + if (group_table_use[timing_thread(gress)] & Stage::USE_SELECTOR) + return 8; + else if (group_table_use[timing_thread(gress)] & Stage::USE_STATEFUL_DIVIDE) + return 6; + else if (group_table_use[timing_thread(gress)] & Stage::USE_STATEFUL) + return 4; + else if (group_table_use[timing_thread(gress)] & Stage::USE_METER_LPF_RED) + return 4; + else + return 0; +} + +/* Calculate the meter_alu delay for a meter/stateful ALU based on both things + * used globally in the current stage group, and whether this ALU 
uses a divmod + * (in which case it will already have an extra 2-cycle delay */ +int Stage::meter_alu_delay(gress_t gress, bool uses_divmod) const { + if (group_table_use[timing_thread(gress)] & Stage::USE_SELECTOR) + return uses_divmod ? 2 : 4; + else if (group_table_use[timing_thread(gress)] & Stage::USE_STATEFUL_DIVIDE) + return uses_divmod ? 0 : 2; + else + return 0; +} + +int Stage::cycles_contribute_to_latency(gress_t gress) { + if (stage_dep[gress] == MATCH_DEP || stageno == 0) + return pipelength(gress); + else if (stage_dep[gress] == CONCURRENT && options.target == TOFINO) + return 1; + else + return 2; // action dependency +} + +int Stage::pipelength(gress_t gress) const { + return Target::MAU_BASE_DELAY() + tcam_delay(gress) + adr_dist_delay(gress); +} + +int Stage::pred_cycle(gress_t gress) const { + return Target::MAU_BASE_PREDICATION_DELAY() + tcam_delay(gress); +} + +void Stage::verify_have_mpr(std::string key, int line_number) { + if (!Target::HAS_MPR()) + error(line_number, "%s is not available on target %s.", key.c_str(), Target::name()); +} + +template +void Stage::write_common_regs(typename TARGET::mau_regs ®s) { + /* FIXME -- most of the values set here are 'placeholder' constants copied + * from build_pipeline_output_2.py in the compiler */ + auto &merge = regs.rams.match.merge; + auto &adrdist = regs.rams.match.adrdist; + // merge.exact_match_delay_config.exact_match_delay_ingress = tcam_delay(INGRESS); + // merge.exact_match_delay_config.exact_match_delay_egress = tcam_delay(EGRESS); + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (tcam_delay(gress) > 0) { + merge.exact_match_delay_thread[0] |= 1U << gress; + merge.exact_match_delay_thread[1] |= 1U << gress; + merge.exact_match_delay_thread[2] |= 1U << gress; + } + regs.rams.match.adrdist.adr_dist_pipe_delay[gress][0] = + regs.rams.match.adrdist.adr_dist_pipe_delay[gress][1] = adr_dist_delay(gress); + regs.dp.action_output_delay[gress] = pipelength(gress) - 3; + 
regs.dp.pipelength_added_stages[gress] = pipelength(gress) - TARGET::MAU_BASE_DELAY; + if (stageno > 0 && stage_dep[gress] == MATCH_DEP) + regs.dp.match_ie_input_mux_sel |= 1 << gress; + } + + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (stageno == 0) { + /* Credit is set to 2 - Every 512 cycles the credit is reset and every + * bubble request decrements this credit. Acts like a filter to cap bubble + * requests */ + adrdist.bubble_req_ctl[gress].bubble_req_fltr_crd = 0x2; + adrdist.bubble_req_ctl[gress].bubble_req_fltr_en = 0x1; + } + adrdist.bubble_req_ctl[gress].bubble_req_interval = 0x100; + adrdist.bubble_req_ctl[gress].bubble_req_en = 0x1; + adrdist.bubble_req_ctl[gress].bubble_req_interval_eop = 0x100; + adrdist.bubble_req_ctl[gress].bubble_req_en_eop = 0x1; + adrdist.bubble_req_ctl[gress].bubble_req_ext_fltr_en = 0x1; + } + + regs.dp.phv_fifo_enable.phv_fifo_ingress_action_output_enable = + stage_dep[INGRESS] != ACTION_DEP; + regs.dp.phv_fifo_enable.phv_fifo_egress_action_output_enable = stage_dep[EGRESS] != ACTION_DEP; + if (stageno != AsmStage::numstages() - 1) { + regs.dp.phv_fifo_enable.phv_fifo_ingress_final_output_enable = + this[1].stage_dep[INGRESS] == ACTION_DEP; + regs.dp.phv_fifo_enable.phv_fifo_egress_final_output_enable = + this[1].stage_dep[EGRESS] == ACTION_DEP; + } + + /* Error handling related */ + for (gress_t gress : Range(INGRESS, EGRESS)) error_mode[gress].write_regs(regs, this, gress); + + /*-------------------- + * Since a stats ALU enable bit is missing from mau_cfg_stats_alu_lt, need to make sure that for + * unused stats ALUs, they are programmed to point to a logical table that is either unused or + * to one that does not use a stats table. 
*/ + + bool unused_stats_alus = false; + for (auto &salu : regs.cfg_regs.mau_cfg_stats_alu_lt) + if (!salu.modified()) unused_stats_alus = true; + if (unused_stats_alus) { + unsigned avail = 0xffff; + int no_stats = -1; + /* odd pattern of tests to replicate what the old compiler does */ + for (auto tbl : tables) { + avail &= ~(1U << tbl->logical_id); + if (no_stats < 0 && (!tbl->get_attached() || tbl->get_attached()->stats.empty())) + no_stats = tbl->logical_id; + } + if (avail) { + for (int i = 15; i >= 0; --i) + if ((avail >> i) & 1) { + no_stats = i; + break; + } + } + for (auto &salu : regs.cfg_regs.mau_cfg_stats_alu_lt) + if (!salu.modified()) salu = no_stats; + } +} + +void Stage::log_hashes(std::ofstream &out) const { + out << "+-----------------------------------------------------------+" << std::endl; + out << " Stage " << stageno << std::endl; + out << "+-----------------------------------------------------------+" << std::endl; + bool logged = false; + for (auto xbar : ixbar_use) { + if (xbar.first.type == InputXbar::Group::EXACT) { + for (auto use : xbar.second) { + if (use) logged |= use->log_hashes(out); + } + } + } + if (!logged) { + out << " Unused" << std::endl; + } + // Need to use other variables? 
+ out << std::endl; +} + +template +void Stage::gen_gfm_json_info(REGS ®s, std::ostream &out) { + auto &hash = regs.dp.xbar_hash.hash; + auto &gfm = hash.galois_field_matrix; + out << &gfm << "\n"; + out << "Col : "; + for (auto c = 0; c < GALOIS_FIELD_MATRIX_COLUMNS; c++) { + out << std::setw(3) << c; + } + out << " | Row Parity \n"; + for (auto r = 0; r < gfm.size(); r++) { + out << "Row " << std::dec << r << ": \n"; + out << " Byte 0 :"; + unsigned byte0_parity = 0; + unsigned byte1_parity = 0; + for (auto c = 0; c < GALOIS_FIELD_MATRIX_COLUMNS; c++) { + out << std::setw(3) << std::hex << gfm[r][c].byte0; + byte0_parity ^= gfm[r][c].byte0; + } + out << " | " << std::setw(3) << parity(byte0_parity) << "\n"; + out << " Byte 1 :"; + for (auto c = 0; c < GALOIS_FIELD_MATRIX_COLUMNS; c++) { + out << std::setw(3) << std::hex << gfm[r][c].byte1; + byte1_parity ^= gfm[r][c].byte1; + } + out << " | " << std::setw(3) << parity(byte1_parity) << "\n"; + } + + out << "\n"; + auto &grp_enable = regs.dp.hashout_ctl.hash_parity_check_enable; + for (int grp = 0; grp < 8; grp++) { + out << "Hash Group : " << grp << "\n"; + out << "Hash Seed : "; + int seed_parity = 0; + bitvec hash_seed; + for (int bit = 51; bit >= 0; bit--) { + auto seed_bit = (hash.hash_seed[bit] >> grp) & 0x1; + hash_seed[bit] = seed_bit; + out << seed_bit; + seed_parity ^= seed_bit; + } + out << " (" << hash_seed << ")"; + out << "\n"; + auto seed_parity_enable = ((grp_enable >> grp) & 0x1) ? "True" : "False"; + out << "Hash Seed Parity Enable : " << seed_parity_enable; + out << "\n"; + out << "Hash Seed Parity : " << (seed_parity ? 
"Odd" : "Even"); + out << "\n"; + out << "\n"; + } +} + +template +void Stage::fixup_regs(REGS ®s) { + if (options.condense_json) { + // if any part of the gf matrix is enabled, we can't elide any part of it when + // generating .cfg.json, as otherwise walle will generate an invalid block write + if (options.gen_json && !regs.dp.xbar_hash.hash.galois_field_matrix.disabled()) + regs.dp.xbar_hash.hash.galois_field_matrix.enable(); + } + // Enable mapram_config and imem regs - + // These are cached by the driver, so if they are disabled they wont go + // into tofino.bin as dma block writes and driver will complain + // The driver needs the regs to do parity error correction at runtime and it + // checks for the base address of the register blocks to do a block DMA + // during tofino.bin download + regs.dp.imem.enable(); + for (int row = 0; row < SRAM_ROWS; row++) + for (int col = 0; col < MAPRAM_UNITS_PER_ROW; col++) + regs.rams.map_alu.row[row].adrmux.mapram_config[col].enable(); +} + +template +void Stage::output(json::map &ctxt_json, bool egress_only) { + auto *regs = new typename TARGET::mau_regs(); + declare_registers(regs, egress_only, stageno); + json::vector &ctxt_tables = ctxt_json["tables"]; + for (auto table : tables) { + table->write_regs(*regs); + table->gen_tbl_cfg(ctxt_tables); + if (auto gw = table->get_gateway()) gw->gen_tbl_cfg(ctxt_tables); + } + write_regs(*regs, egress_only); + + // Output GFM + if (gfm_out) gen_gfm_json_info(*regs, *gfm_out); + + if (options.condense_json) regs->disable_if_reset_value(); + + fixup_regs(*regs); + char buf[64]; + snprintf(buf, sizeof(buf), "regs.match_action_stage%s.%02x", egress_only ? ".egress" : "", + stageno); + if (error_count == 0 && options.gen_json) + regs->emit_json(*open_output("%s.cfg.json", buf), stageno); + auto NUM_STAGES = egress_only ? 
Target::NUM_EGRESS_STAGES() : Target::NUM_MAU_STAGES(); + if (stageno < NUM_STAGES) TopLevel::all->set_mau_stage(stageno, buf, regs, egress_only); + gen_mau_stage_characteristics(*regs, ctxt_json["mau_stage_characteristics"]); + gen_configuration_cache(*regs, ctxt_json["configuration_cache"]); + if (stageno == NUM_STAGES - 1 && Target::OUTPUT_STAGE_EXTENSION()) + gen_mau_stage_extension(*regs, ctxt_json["mau_stage_extension"]); +} + +template +void Stage::gen_mau_stage_characteristics(REGS ®s, json::vector &stg_characteristics) { + for (gress_t gress : Range(INGRESS, EGRESS)) { + json::map anon; + anon["stage"] = stageno; + anon["gress"] = P4Table::direction_name(gress); + anon["match_dependent"] = (regs.dp.cur_stage_dependency_on_prev[gress] == 0) ? true : false; + anon["clock_cycles"] = pipelength(gress); + anon["predication_cycle"] = pred_cycle(gress); + anon["cycles_contribute_to_latency"] = cycles_contribute_to_latency(gress); + stg_characteristics.push_back(std::move(anon)); + } +} + +template +void Stage::gen_configuration_cache(REGS ®s, json::vector &cfg_cache) { + BUG(); // Must be specialized for target -- no generic implementation +} + +template +void Stage::gen_configuration_cache_common(REGS ®s, json::vector &cfg_cache) { + std::string reg_fqname; + std::string reg_name; + unsigned reg_value; + std::string reg_value_str; + unsigned reg_width = 8; // this means number of hex characters + + // meter_sweep_ctl + auto &meter_sweep_ctl = regs.rams.match.adrdist.meter_sweep_ctl; + for (int i = 0; i < 4; i++) { + reg_fqname = "mau[" + std::to_string(stageno) + "].rams.match.adrdist.meter_sweep_ctl[" + + std::to_string(i) + "]"; + if (options.match_compiler) { // FIXME: Temp fix to match glass typo + reg_fqname = "mau[" + std::to_string(stageno) + + "].rams.match.adrdist.meter_sweep_ctl.meter_sweep_ctl[" + + std::to_string(i) + "]"; + } + reg_name = "stage_" + std::to_string(stageno) + "_meter_sweep_ctl_" + std::to_string(i); + reg_value = 
(meter_sweep_ctl[i].meter_sweep_en & 0x00000001) | + ((meter_sweep_ctl[i].meter_sweep_offset & 0x0000003F) << 1) | + ((meter_sweep_ctl[i].meter_sweep_size & 0x0000003F) << 7) | + ((meter_sweep_ctl[i].meter_sweep_remove_hole_pos & 0x00000003) << 13) | + ((meter_sweep_ctl[i].meter_sweep_remove_hole_en & 0x00000001) << 16) | + ((meter_sweep_ctl[i].meter_sweep_interval & 0x0000001F) << 17); + if ((reg_value != 0) || (options.match_compiler)) { + reg_value_str = int_to_hex_string(reg_value, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + } + + // meter_ctl is different for Tofino and Tofino2, so it is added in + // specialized functions. + + // statistics_ctl + auto &statistics_ctl = regs.rams.map_alu.stats_wrap; + for (int i = 0; i < 4; i++) { + reg_fqname = "mau[" + std::to_string(stageno) + "].rams.map_alu.stats_wrap[" + + std::to_string(i) + "]" + ".stats.statistics_ctl"; + reg_name = "stage_" + std::to_string(stageno) + "_statistics_ctl_" + std::to_string(i); + reg_value = + (statistics_ctl[i].stats.statistics_ctl.stats_entries_per_word & 0x00000007) | + ((statistics_ctl[i].stats.statistics_ctl.stats_process_bytes & 0x00000001) << 3) | + ((statistics_ctl[i].stats.statistics_ctl.stats_process_packets & 0x00000001) << 4) | + ((statistics_ctl[i].stats.statistics_ctl.lrt_enable & 0x00000001) << 5) | + ((statistics_ctl[i].stats.statistics_ctl.stats_alu_egress & 0x00000001) << 6) | + ((statistics_ctl[i].stats.statistics_ctl.stats_bytecount_adjust & 0x00003FFF) << 7) | + ((statistics_ctl[i].stats.statistics_ctl.stats_alu_error_enable & 0x00000001) << 21); + if ((reg_value != 0) || (options.match_compiler)) { + reg_value_str = int_to_hex_string(reg_value, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + } + + // match_input_xbar_din_power_ctl + auto &mixdpctl = regs.dp.match_input_xbar_din_power_ctl; + reg_value_str = ""; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 16; j++) { + reg_value = 
mixdpctl[i][j]; + reg_value_str = reg_value_str + int_to_hex_string(reg_value, reg_width); + } + } + if (!check_zero_string(reg_value_str) || options.match_compiler) { + reg_fqname = "mau[" + std::to_string(stageno) + "].dp.match_input_xbar_din_power_ctl"; + reg_name = "stage_" + std::to_string(stageno) + "_match_input_xbar_din_power_ctl"; + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + + // hash_seed + auto &hash_seed = regs.dp.xbar_hash.hash.hash_seed; + reg_value_str = ""; + for (int i = 0; i < 52; i++) { + reg_value = hash_seed[i]; + reg_value_str = reg_value_str + int_to_hex_string(reg_value, reg_width); + } + if (!check_zero_string(reg_value_str) || options.match_compiler) { + reg_fqname = "mau[" + std::to_string(stageno) + "].dp.xbar_hash.hash.hash_seed"; + reg_name = "stage_" + std::to_string(stageno) + "_hash_seed"; + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + + // parity_group_mask + auto &parity_group_mask = regs.dp.xbar_hash.hash.parity_group_mask; + reg_value_str = ""; + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 2; j++) { + reg_value = parity_group_mask[i][j]; + reg_value_str = reg_value_str + int_to_hex_string(reg_value, reg_width); + } + } + if (!check_zero_string(reg_value_str) || options.match_compiler) { + reg_fqname = "mau[" + std::to_string(stageno) + "].dp.xbar_hash.hash.parity_group_mask"; + reg_name = "stage_" + std::to_string(stageno) + "_parity_group_mask"; + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } +} + +template +void Stage::write_teop_regs(REGS ®s) { + BUG_CHECK(Target::SUPPORT_TRUE_EOP(), "teop not supported on target"); + // Set teop bus delay regs on current stage if previous stage is driving teop + for (auto t : teop) { + if (t.second.first && t.second.second < stageno) { + auto delay_en = (stage_dep[EGRESS] != Stage::ACTION_DEP); + if (delay_en) { + auto delay = pipelength(EGRESS) - 4; + auto &adrdist = regs.rams.match.adrdist; + 
adrdist.teop_bus_ctl[t.first].teop_bus_ctl_delay = delay; + adrdist.teop_bus_ctl[t.first].teop_bus_ctl_delay_en = delay_en; + } + } + } +} diff --git a/backends/tofino/bf-asm/stage.h b/backends/tofino/bf-asm/stage.h new file mode 100644 index 00000000000..f5f2cdae2c8 --- /dev/null +++ b/backends/tofino/bf-asm/stage.h @@ -0,0 +1,227 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_STAGE_H_ +#define BACKENDS_TOFINO_BF_ASM_STAGE_H_ + +#include +#include + +#include "alloc.h" +#include "backends/tofino/bf-asm/tables.h" +#include "error_mode.h" +#include "input_xbar.h" +#include "lib/bitvec.h" + +class Stage_data { + /* we encapsulate all the Stage non-static fields in a base class to automate the + * generation of the move construtor properly */ + public: + int stageno; + std::vector
tables; + std::set all_refs; + BFN::Alloc2Dbase
sram_use; + BFN::Alloc2D
sram_search_bus_use; + BFN::Alloc3Dbase
stm_hbus_use; + BFN::Alloc2D
match_result_bus_use; + BFN::Alloc2D
mapram_use; + BFN::Alloc2Dbase
tcam_use; + BFN::Alloc2Dbase
tcam_match_bus_use; + BFN::Alloc2D, TCAM_ROWS, 2> tcam_byte_group_use; + BFN::Alloc1Dbase
local_tind_use; + BFN::Alloc2D
tcam_indirect_bus_use; + BFN::Alloc2D gw_unit_use; + BFN::Alloc2D gw_payload_use; + BFN::Alloc1D
logical_id_use; + BFN::Alloc1D
physical_id_use; + BFN::Alloc1D
tcam_id_use; + ordered_map> ixbar_use; + BFN::Alloc1D
tcam_ixbar_input; + BFN::Alloc1Dbase> hash_table_use; + BFN::Alloc1Dbase> hash_group_use; + BFN::Alloc1D, 6> hash_dist_use; + BFN::Alloc1Dbase action_unit_use; + BFN::Alloc1Dbase dp_unit_use; + BFN::Alloc1D
action_bus_use; + BFN::Alloc1D
action_data_use, meter_bus_use, stats_bus_use, + selector_adr_bus_use, overflow_bus_use; + BFN::Alloc1D
idletime_bus_use; + bitvec action_bus_use_bit_mask; + BFN::Alloc2D imem_addr_use; + bitvec imem_use[ACTION_IMEM_SLOTS]; + BFN::Alloc1D long_branch_use; + unsigned long_branch_thread[3] = {0}; + unsigned long_branch_terminate = 0; + + // for timing, ghost thread is tied to ingress, so we track ghost as ingress here + enum { + USE_TCAM = 1, + USE_STATEFUL = 4, + USE_METER = 8, + USE_METER_LPF_RED = 16, + USE_SELECTOR = 32, + USE_WIDE_SELECTOR = 64, + USE_STATEFUL_DIVIDE = 128 + }; + int /* enum */ table_use[2], group_table_use[2]; + + enum { NONE = 0, CONCURRENT = 1, ACTION_DEP = 2, MATCH_DEP = 3 } stage_dep[2]; + bitvec match_use[3], action_use[3], action_set[3]; + + // there's no error mode registers for ghost thread, so we don't allow it to be set + ErrorMode error_mode[2]; + + // MPR stage config + int mpr_stage_id[3] = {0}; // per-gress + int mpr_always_run = 0; + int mpr_bus_dep_glob_exec[3] = {0}; + int mpr_bus_dep_long_branch[3] = {0}; + // per gress, per logical table + BFN::Alloc2D mpr_next_table_lut; + // per global execute bit + BFN::Alloc1D mpr_glob_exec_lut; + // per long branch tag + BFN::Alloc1D mpr_long_brch_lut; + + int pass1_logical_id = -1, pass1_tcam_id = -1; + + // True egress accounting (4 buses) Tofino2 + static std::map> teop; + + protected: + Stage_data(int stage, bool egress_only) + : stageno(stage), + sram_use(Target::SRAM_ROWS(egress_only ? EGRESS : INGRESS), Target::SRAM_UNITS_PER_ROW()), + stm_hbus_use(Target::SRAM_ROWS(egress_only ? 
EGRESS : INGRESS), + Target::SRAM_HBUS_SECTIONS_PER_STAGE(), Target::SRAM_HBUSSES_PER_ROW()), + tcam_use(Target::TCAM_ROWS(), Target::TCAM_UNITS_PER_ROW()), + tcam_match_bus_use(Target::TCAM_ROWS(), Target::TCAM_MATCH_BUSSES()), + local_tind_use(Target::LOCAL_TIND_UNITS()), + hash_table_use(Target::EXACT_HASH_TABLES()), + hash_group_use(Target::EXACT_HASH_GROUPS()), + action_unit_use(Target::ARAM_UNITS_PER_STAGE()), + dp_unit_use(Target::DP_UNITS_PER_STAGE()) {} + Stage_data(const Stage_data &) = delete; + Stage_data(Stage_data &&) = default; + ~Stage_data() {} +}; + +class Stage : public Stage_data { + public: + static unsigned char action_bus_slot_map[ACTION_DATA_BUS_BYTES]; + static unsigned char action_bus_slot_size[ACTION_DATA_BUS_SLOTS]; // size in bits + + explicit Stage(int stageno, bool egress_only); + Stage(const Stage &) = delete; + Stage(Stage &&); + ~Stage(); + template + void output(json::map &ctxt_json, bool egress_only = false); + template + void fixup_regs(REGS ®s); + template + void gen_configuration_cache_common(REGS ®s, json::vector &cfg_cache); + template + void gen_configuration_cache(REGS ®s, json::vector &cfg_cache); + template + void gen_gfm_json_info(REGS ®s, std::ostream &out); + template + void gen_mau_stage_characteristics(REGS ®s, json::vector &stg_characteristics); + template + void gen_mau_stage_extension(REGS ®s, json::map &extend); + template + void write_regs(REGS ®s, bool egress_only); + template + void write_common_regs(typename TARGET::mau_regs ®s); + template + void write_teop_regs(REGS ®s); + int adr_dist_delay(gress_t gress) const; + int meter_alu_delay(gress_t gress, bool uses_divmod) const; + int pipelength(gress_t gress) const; + int pred_cycle(gress_t gress) const; + int tcam_delay(gress_t gress) const; + int cycles_contribute_to_latency(gress_t gress); + void verify_have_mpr(std::string key, int line_number); + static int first_table(gress_t gress); + static unsigned end_of_pipe() { return Target::END_OF_PIPE(); } + 
static Stage *stage(gress_t gress, int stageno); + void log_hashes(std::ofstream &out) const; + bitvec imem_use_all() const; +}; + +class AsmStage : public Section { + void start(int lineno, VECTOR(value_t) args); + void input(VECTOR(value_t) args, value_t data); + void output(json::map &); + + /// Propagates group_table_use to adjacent stages that are not match-dependent. + void propagate_group_table_use(); + + unsigned compute_latency(gress_t gress); + AsmStage(); + ~AsmStage() {} + std::vector pipe; + static AsmStage singleton_object; + bitvec stages_seen[NUM_GRESS_T]; + + public: + void process(); + static int numstages() { return singleton_object.pipe.size(); } + static std::vector &stages(gress_t gress) { return singleton_object.pipe; } + + // for gtest + void reset_stage(Stage &stage) { + for (auto &tbl : stage.tables) tbl->all->clear(); + stage.tables.clear(); + stage.all_refs.clear(); + stage.sram_use.clear(); + stage.sram_search_bus_use.clear(); + stage.stm_hbus_use.clear(); + stage.match_result_bus_use.clear(); + stage.mapram_use.clear(); + stage.tcam_use.clear(); + stage.tcam_match_bus_use.clear(); + stage.tcam_byte_group_use.clear(); + stage.gw_unit_use.clear(); + stage.gw_payload_use.clear(); + stage.logical_id_use.clear(); + stage.physical_id_use.clear(); + stage.tcam_id_use.clear(); + stage.ixbar_use.clear(); + stage.tcam_ixbar_input.clear(); + stage.hash_table_use.clear(); + stage.hash_group_use.clear(); + stage.hash_dist_use.clear(); + stage.action_bus_use.clear(); + stage.action_data_use.clear(); + stage.meter_bus_use.clear(); + stage.stats_bus_use.clear(); + stage.selector_adr_bus_use.clear(); + stage.overflow_bus_use.clear(); + stage.idletime_bus_use.clear(); + stage.imem_addr_use.clear(); + stage.long_branch_use.clear(); + } + + void reset() { + stages_seen[INGRESS].clear(); + stages_seen[EGRESS].clear(); + for (auto &stage : pipe) reset_stage(stage); + } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_STAGE_H_ */ diff --git 
a/backends/tofino/bf-asm/stateful.cpp b/backends/tofino/bf-asm/stateful.cpp new file mode 100644 index 00000000000..ab125df0abc --- /dev/null +++ b/backends/tofino/bf-asm/stateful.cpp @@ -0,0 +1,676 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "tofino/stateful.h" + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "data_switchbox.h" +#include "input_xbar.h" +#include "instruction.h" +#include "jbay/stateful.h" +#include "lib/algorithm.h" +#include "misc.h" + +void StatefulTable::parse_register_params(int idx, const value_t &val) { + if (idx < 0 || idx > Target::STATEFUL_REGFILE_ROWS()) + error(lineno, + "Index out of range of the number of the register file rows (%d). 
" + "Reduce the number of large constants or RegisterParams.", + Target::STATEFUL_REGFILE_ROWS()); + if (const_vals.size() <= size_t(idx)) const_vals.resize(idx + 1); + if (CHECKTYPE(val, tMAP) && val.map.size == 1) + if (CHECKTYPE(val.map.data->key, tSTR) && CHECKTYPE(val.map.data->value, tINT)) + const_vals[idx] = std::move(const_info_t(val.lineno, val.map.data->value.i, true, + std::string(val.map.data->key.s))); +} + +void StatefulTable::setup(VECTOR(pair_t) & data) { + common_init_setup(data, false, P4Table::Stateful); + if (!format) error(lineno, "No format specified in table %s", name()); + for (auto &kv : MapIterChecked(data, true)) { + if (common_setup(kv, data, P4Table::Stateful)) { + } else if (kv.key == "initial_value") { + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &v : kv.value.map) { + if (v.key == "lo") { + if (CHECKTYPE2(v.value, tINT, tBIGINT)) { + if (v.value.type == tINT) { + initial_value_lo = v.value.i; + } else { + initial_value_lo = v.value.bigi.data[0]; + if (v.value.bigi.size > 1) initial_value_hi = v.value.bigi.data[1]; + } + } + } else if (v.key == "hi") { + if (CHECKTYPE(v.value, tINT)) initial_value_hi = v.value.i; + } + } + } + } else if (kv.key == "input_xbar") { + if (CHECKTYPE(kv.value, tMAP)) + input_xbar.emplace_back(InputXbar::create(this, false, kv.key, kv.value.map)); + } else if (kv.key == "data_bytemask") { + if (CHECKTYPE(kv.value, tINT)) data_bytemask = kv.value.i; + } else if (kv.key == "hash_bytemask") { + if (CHECKTYPE(kv.value, tINT)) hash_bytemask = kv.value.i; + } else if (kv.key == "hash_dist") { + /* parsed in common_init_setup */ + } else if (kv.key == "actions") { + if (CHECKTYPE(kv.value, tMAP)) actions.reset(new Actions(this, kv.value.map)); + } else if (kv.key == "selection_table") { + bound_selector = kv.value; + } else if (kv.key == "register_params") { + if (!CHECKTYPE2(kv.value, tVEC, tMAP)) continue; + if (kv.value.type == tVEC) { + for (auto &v : kv.value.vec) parse_register_params(const_vals.size(), 
v); + } else { + for (auto &v : kv.value.map) + if (CHECKTYPE(v.key, tINT)) parse_register_params(v.key.i, v.value); + } + } else if (kv.key == "math_table") { + if (!CHECKTYPE(kv.value, tMAP)) continue; + math_table.lineno = kv.value.lineno; + for (auto &v : kv.value.map) { + if (v.key == "data") { + if (!CHECKTYPE2(v.value, tVEC, tMAP)) continue; + if (v.value.type == tVEC) { + parse_vector(math_table.data, v.value); + } else { + math_table.data.resize(16); + for (auto &d : v.value.map) + if (CHECKTYPE(d.key, tINT) && CHECKTYPE(d.value, tINT)) { + if (d.key.i < 0 || d.key.i >= 16) + error(v.key.lineno, "invalid index for math_table"); + else + math_table.data[v.key.i] = v.value.i; + } + } + } else if (v.key == "invert") { + math_table.invert = get_bool(v.value); + } else if (v.key == "shift") { + if (CHECKTYPE(v.value, tINT)) math_table.shift = v.value.i; + } else if (v.key == "scale") { + if (CHECKTYPE(v.value, tINT)) math_table.scale = v.value.i; + } else if (v.key.type == tINT && v.key.i >= 0 && v.key.i < 16) { + if (CHECKTYPE(v.value, tINT)) math_table.data[v.key.i] = v.value.i; + } else { + error(v.key.lineno, "Unknow item %s in math_table", value_desc(kv.key)); + } + } + } else if (options.target >= JBAY && setup_jbay(kv)) { + /* jbay specific extensions done in setup_jbay */ + // FIXME -- these should probably be based on individual Target::FEATURE() queries + } else if (kv.key == "log_vpn") { + logvpn_lineno = kv.value.lineno; + if (CHECKTYPE2(kv.value, tINT, tRANGE)) { + if (kv.value.type == tINT) { + logvpn_min = logvpn_max = kv.value.i; + } else { + logvpn_min = kv.value.range.lo; + logvpn_max = kv.value.range.hi; + } + } + } else if (kv.key == "pred_shift") { + if (CHECKTYPE(kv.value, tINT)) + if ((pred_shift = kv.value.i) < 0 || pred_shift >= 32 || (pred_shift & 3) != 0) + error(kv.value.lineno, "Invalid pred_shift value %d: %s", pred_shift, + pred_shift < 0 ? "negative" + : pred_shift >= 32 ? 
"too large" + : "must be a mulitple of 4"); + } else if (kv.key == "pred_comb_shift") { + if (CHECKTYPE(kv.value, tINT)) + if ((pred_comb_shift = kv.value.i) < 0 || pred_comb_shift >= 32) + error(kv.value.lineno, "Invalid pred_comb_shift value %d: %s", pred_comb_shift, + pred_comb_shift < 0 ? "negative" : "too large"); + } else if (kv.key == "busy_value" && Target::SUPPORT_SALU_FAST_CLEAR()) { + if (CHECKTYPE(kv.value, tINT)) busy_value = kv.value.i; + } else if (kv.key == "clear_value" && Target::SUPPORT_SALU_FAST_CLEAR()) { + if (CHECKTYPE2(kv.value, tINT, tBIGINT)) + clear_value = get_bitvec(kv.value, 128, "Value too large for 128 bits"); + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } +} + +bool match_table_layouts(Table *t1, Table *t2) { + if (t1->layout.size() != t2->layout.size()) return false; + auto it = t2->layout.begin(); + for (auto &row : t1->layout) { + if (row.row != it->row) return false; + if (row.memunits != it->memunits) return false; + if (row.maprams.empty()) row.maprams = it->maprams; + if (row.maprams != it->maprams) return false; + ++it; + } + return true; +} + +void StatefulTable::MathTable::check() { + if (data.size() > 16) error(lineno, "math table only has 16 data entries"); + data.resize(16); + for (auto &v : data) + if (v < 0 || v >= 256) error(lineno, "%d out of range for math_table data", v); + if (shift < -1 || shift > 1) error(lineno, "%d out of range for math_table shift", shift); + if (scale < -32 || scale >= 32) error(lineno, "%d out of range for math_table scale", scale); +} + +void StatefulTable::pass1() { + LOG1("### Stateful table " << name() << " pass1 " << loc()); + if (!p4_table) + p4_table = P4Table::alloc(P4Table::Stateful, this); + else + p4_table->check(this); + alloc_vpns(); + if (bound_selector.check()) { + if (layout.empty()) + layout = bound_selector->layout; + else if (!match_table_layouts(this, bound_selector)) + error(layout[0].lineno, "Layout 
in %s does not match selector %s", name(), + bound_selector->name()); + // Add a back reference to this table + if (!bound_selector->get_stateful()) bound_selector->set_stateful(this); + if (logical_id < 0) logical_id = bound_selector->logical_id; + } else { + alloc_maprams(); + if (Target::SRAM_GLOBAL_ACCESS()) + alloc_global_srams(); + else + alloc_rams(true, stage->sram_use); + } + std::sort(layout.begin(), layout.end(), + [](const Layout &a, const Layout &b) -> bool { return a.row > b.row; }); + stage->table_use[timing_thread(gress)] |= Stage::USE_STATEFUL; + for (auto &hd : hash_dist) hd.pass1(this, HashDistribution::OTHER, false); + for (auto &ixb : input_xbar) ixb->pass1(); + int prev_row = -1; + for (auto &row : layout) { + if (prev_row >= 0) + need_bus(lineno, stage->overflow_bus_use, row.row, "Overflow"); + else + need_bus(lineno, stage->meter_bus_use, row.row, "Meter data"); + for (int r = (row.row + 1) | 1; r < prev_row; r += 2) + need_bus(lineno, stage->overflow_bus_use, r, "Overflow"); + prev_row = row.row; + } + unsigned idx = 0, size = 0; + for (auto &fld : *format) { + switch (idx++) { + case 0: + if ((size = fld.second.size) != 1 && size != 8 && size != 16 && size != 32 && + ((size != 64 && size != 128) || options.target == TOFINO)) { + error(format->lineno, "invalid size %d for stateful format field %s", size, + fld.first.c_str()); + break; + } + break; + case 1: + if (size != fld.second.size) + error(format->lineno, "stateful fields must be the same size"); + else if (size == 1) + error(format->lineno, "one bit stateful tables can only have a single field"); + break; + default: + error(format->lineno, "only two fields allowed in a stateful table"); + } + } + if ((idx == 2) && (format->size == 2 * size)) dual_mode = true; + if (actions) { + actions->pass1(this); + bool stop = false; + for (auto &act : *actions) { + for (auto &inst : act.instr) { + if (inst->salu_output()) { + need_bus(layout.at(0).lineno, stage->action_data_use, home_row(), + 
"action data"); + stop = true; + break; + } + } + if (stop) break; + } + } else { + error(lineno, "No actions in stateful table %s", name()); + } + if (math_table) math_table.check(); + for (auto &r : sbus_learn) + if (r.check() && (r->table_type() != STATEFUL || r->stage != stage)) + error(r.lineno, "%s is not a stateful table in the same stage as %s", r->name(), + name()); + for (auto &r : sbus_match) + if (r.check() && (r->table_type() != STATEFUL || r->stage != stage)) + error(r.lineno, "%s is not a stateful table in the same stage as %s", r->name(), + name()); + Synth2Port::pass1(); + if (underflow_action.set() && (!actions || !actions->exists(underflow_action.name))) + error(underflow_action.lineno, "No action %s in table %s", underflow_action.name.c_str(), + name()); + if (overflow_action.set() && (!actions || !actions->exists(overflow_action.name))) + error(overflow_action.lineno, "No action %s in table %s", overflow_action.name.c_str(), + name()); +} + +int StatefulTable::get_const(int lineno, int64_t v) { + size_t rv; + for (rv = 0; rv < const_vals.size(); rv++) { + // Skip constants allocated for RegisterParams as they cannot be shared + // as they are subject to change. + if (const_vals[rv].is_param) continue; + if (const_vals[rv].value == v) break; + } + if (rv == const_vals.size()) { + if (rv >= Target::STATEFUL_REGFILE_ROWS()) + error(lineno, + "Out of the number of register file rows (%d). 
Reduce the number" + " of large constants or RegisterParams.", + Target::STATEFUL_REGFILE_ROWS()); + const_vals.push_back(std::move(const_info_t(lineno, v))); + } + return rv; +} + +void StatefulTable::pass2() { + LOG1("### Stateful table " << name() << " pass2 " << loc()); + for (auto &ixb : input_xbar) ixb->pass2(); + if (actions) actions->stateful_pass2(this); + if (stateful_counter_mode) { + if (logvpn_min < 0) { + layout_vpn_bounds(logvpn_min, logvpn_max, true); + } else if (!offset_vpn) { + int min, max; + layout_vpn_bounds(min, max, true); + if (logvpn_min < min || logvpn_max > max) + error(logvpn_lineno, "log_vpn out of range (%d..%d)", min, max); + } + } + + for (auto &ixb : input_xbar) { + if (!data_bytemask && !hash_bytemask) { + hash_bytemask = bitmask2bytemask(ixb->hash_group_bituse()) & phv_byte_mask; + // should we also mask off bits not set in the ixbar of this table? + // as long as the compiler explicitly zeroes everything in the hash + // that needs to be zero, it should be ok. + data_bytemask = phv_byte_mask & ~hash_bytemask; + } + } + if (input_xbar.empty()) { + if (data_bytemask || hash_bytemask) { + error(lineno, "No input_xbar in %s, but %s is present", name(), + data_bytemask ? "data_bytemask" : "hash_bytemask"); + } else if (phv_byte_mask) { + error(lineno, "No input_xbar in %s, but raw phv_%s use is present", name(), + (phv_byte_mask & 1) ? 
"lo" : "hi"); + } + } + for (auto &hd : hash_dist) hd.pass2(this); +} + +void StatefulTable::pass3() { LOG1("### Stateful table " << name() << " pass3 " << loc()); } + +int StatefulTable::direct_shiftcount() const { + return 64 + METER_ADDRESS_ZERO_PAD - address_shift(); +} + +int StatefulTable::indirect_shiftcount() const { return METER_ADDRESS_ZERO_PAD - address_shift(); } + +int StatefulTable::address_shift() const { return ceil_log2(format->size) - meter_adr_shift; } + +unsigned StatefulTable::per_flow_enable_bit(MatchTable *match) const { + if (!per_flow_enable) + return METER_ADDRESS_BITS - METER_TYPE_BITS - 1; + else + return AttachedTable::per_flow_enable_bit(match); +} + +unsigned StatefulTable::determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) const { + return determine_meter_shiftcount(call, group, word, tcam_shift); +} + +/** Determine which stateful action a given table action invokes (if any) + * In theory, the stateful action to run could be an action data param or even come from + * hash_dist (so the action could run any stateful action), but currently the compiler will + * never geneate such code. If we add that ability, we'll need to revisit this, and need + * to revise the context.json appropriately. Currently, this code will return a nullptr + * for such bfa code (meter_type_arg would be a Field or HashDist) + */ +Table::Actions::Action *StatefulTable::action_for_table_action(const MatchTable *tbl, + const Actions::Action *act) const { + // Check for action args to determine which stateful action is + // called. 
If no args are present skip as the action does not + // invoke stateful + if (indirect) { + for (auto att : act->attached) { + if (att != this) continue; + if (att.args.size() == 0) continue; + auto meter_type_arg = att.args[0]; + if (meter_type_arg.type == Call::Arg::Name) { + // Check if stateful has this called action + return actions->action(meter_type_arg.name()); + } else if (meter_type_arg.type == Call::Arg::Const) { + int index = -1; + switch (meter_type_arg.value()) { + case STATEFUL_INSTRUCTION_0: + index = 0; + break; + case STATEFUL_INSTRUCTION_1: + index = 1; + break; + case STATEFUL_INSTRUCTION_2: + index = 2; + break; + case STATEFUL_INSTRUCTION_3: + index = 3; + break; + } + if (index == -1) continue; + auto it = actions->begin(); + while (it != actions->end() && index > 0) { + --index; + ++it; + } + if (it != actions->end()) return &(*it); + } + } + } else { + // If stateful is direct, all user defined actions will + // invoke stateful except for the miss action. This is + // defined as 'default_only' in p4, if not the compiler + // generates a default_only action and adds it + // FIXME: Brig should add these generated actions as + // default_only in assembly + if (!((act->name == tbl->default_action) && tbl->default_only_action)) { + // Direct has only one action + if (actions) return &*actions->begin(); + } + } + return nullptr; +} + +template +void StatefulTable::write_action_regs_vt(REGS ®s, const Actions::Action *act) { + int meter_alu = layout[0].row / 4U; + auto &stateful_regs = regs.rams.map_alu.meter_group[meter_alu].stateful; + auto &salu_instr_common = stateful_regs.salu_instr_common[act->code]; + if (act->minmax_use) { + salu_instr_common.salu_datasize = 7; + salu_instr_common.salu_op_dual = is_dual_mode(); + } else if (is_dual_mode() || p4c_5192_workaround(act)) { + salu_instr_common.salu_datasize = format->log2size - 1; + salu_instr_common.salu_op_dual = 1; + } else { + salu_instr_common.salu_datasize = format->log2size; + } +} + 
+template +void StatefulTable::write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args) { + auto &merge = regs.rams.match.merge; + unsigned adr_mask = 0U; + unsigned per_entry_en_mux_ctl = 0U; + unsigned adr_default = 0U; + unsigned meter_type_position = 0U; + METER_ACCESS_TYPE default_type = match->default_meter_access_type(true); + AttachedTable::determine_meter_merge_regs(match, type, bus, args, default_type, adr_mask, + per_entry_en_mux_ctl, adr_default, + meter_type_position); + merge.mau_meter_adr_default[type][bus] = adr_default; + merge.mau_meter_adr_mask[type][bus] = adr_mask; + merge.mau_meter_adr_per_entry_en_mux_ctl[type][bus] = per_entry_en_mux_ctl; + merge.mau_meter_adr_type_position[type][bus] = meter_type_position; +} + +template +void StatefulTable::write_regs_vt(REGS ®s) { + LOG1("### Stateful table " << name() << " write_regs " << loc()); + // FIXME -- factor common AttachedTable::write_regs + // FIXME -- factor common Synth2Port::write_regs + // FIXME -- factor common CounterTable::write_regs + // FIXME -- factor common MeterTable::write_regs + for (auto &ixb : input_xbar) ixb->write_regs(regs); + Layout *home = &layout[0]; + bool push_on_overflow = false; + auto &map_alu = regs.rams.map_alu; + auto &merge = regs.rams.match.merge; + auto &adrdist = regs.rams.match.adrdist; + DataSwitchboxSetup swbox(regs, this); + int minvpn, maxvpn; + layout_vpn_bounds(minvpn, maxvpn, true); + if (!bound_selector) { + for (Layout &logical_row : layout) { + unsigned row = logical_row.row / 2U; + unsigned side = logical_row.row & 1; /* 0 == left 1 == right */ + BUG_CHECK(side == 1); /* no map rams or alus on left side anymore */ + auto vpn = logical_row.vpns.begin(); + auto mapram = logical_row.maprams.begin(); + auto &map_alu_row = map_alu.row[row]; + LOG2("# DataSwitchbox.setup(" << row << ") home=" << home->row / 2U); + swbox.setup_row(row); + for (auto &memunit : logical_row.memunits) { + BUG_CHECK(memunit.stage == INT_MIN 
&& memunit.row == logical_row.row, + "bogus %s in logical row %d", memunit.desc(), logical_row.row); + unsigned col = memunit.col + 6 * side; + swbox.setup_row_col(row, col, *vpn); + write_mapram_regs(regs, row, *mapram, *vpn, MapRam::STATEFUL); + if (gress) + regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row); + ++mapram, ++vpn; + } + /* FIXME -- factor with selector/meter? */ + if (&logical_row == home) { + auto &vh_adr_xbar = regs.rams.array.row[row].vh_adr_xbar; + auto &data_ctl = regs.rams.array.row[row].vh_xbar[side].stateful_meter_alu_data_ctl; + for (auto &ixb : input_xbar) { + if (hash_bytemask != 0U) { + vh_adr_xbar.alu_hashdata_bytemask.alu_hashdata_bytemask_right = + hash_bytemask; + setup_muxctl(vh_adr_xbar.exactmatch_row_hashadr_xbar_ctl[2 + side], + ixb->hash_group()); + } + if (data_bytemask != 0) { + data_ctl.stateful_meter_alu_data_bytemask = data_bytemask; + data_ctl.stateful_meter_alu_data_xbar_ctl = 8 | ixb->match_group(); + } + } + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_base = minvpn; + map_alu_row.i2portctl.synth2port_vpn_ctl.synth2port_vpn_limit = maxvpn; + int meter_group_index = row / 2U; + auto &delay_ctl = map_alu.meter_alu_group_data_delay_ctl[meter_group_index]; + delay_ctl.meter_alu_right_group_delay = + Target::METER_ALU_GROUP_DATA_DELAY() + row / 4 + stage->tcam_delay(gress); + delay_ctl.meter_alu_right_group_enable = + meter_alu_fifo_enable_from_mask(regs, phv_byte_mask); + auto &error_ctl = map_alu.meter_alu_group_error_ctl[meter_group_index]; + error_ctl.meter_alu_group_ecc_error_enable = 1; + if (output_used) { + auto &action_ctl = map_alu.meter_alu_group_action_ctl[meter_group_index]; + action_ctl.right_alu_action_enable = 1; + action_ctl.right_alu_action_delay = stage->meter_alu_delay(gress, divmod_used); + auto &switch_ctl = regs.rams.array.switchbox.row[row].ctl; + switch_ctl.r_action_o_mux_select.r_action_o_sel_action_rd_r_i = 1; + // disable action data address huffman decoding, on 
the assumtion we're not + // trying to combine this with an action data table on the same home row. + // Otherwise, the huffman decoding will think this is an 8-bit value and + // replicate it. + regs.rams.array.row[row] + .action_hv_xbar.action_hv_xbar_disable_ram_adr + .action_hv_xbar_disable_ram_adr_right = 1; + } + } else { + auto &adr_ctl = map_alu_row.vh_xbars.adr_dist_oflo_adr_xbar_ctl[side]; + if (home->row >= 8 && logical_row.row < 8) { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = 0; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::OVERFLOW; + push_on_overflow = true; + BUG_CHECK(options.target == TOFINO); + } else { + adr_ctl.adr_dist_oflo_adr_xbar_source_index = home->row % 8; + adr_ctl.adr_dist_oflo_adr_xbar_source_sel = AdrDist::METER; + } + adr_ctl.adr_dist_oflo_adr_xbar_enable = 1; + } + } + } + if (actions) actions->write_regs(regs, this); + unsigned meter_group = home->row / 4U; + for (MatchTable *m : match_tables) { + adrdist.mau_ad_meter_virt_lt[meter_group] |= 1U << m->logical_id; + adrdist.adr_dist_meter_adr_icxbar_ctl[m->logical_id] |= 1 << meter_group; + } + if (!bound_selector) { + bool first_match = true; + for (MatchTable *m : match_tables) { + adrdist.adr_dist_meter_adr_icxbar_ctl[m->logical_id] |= 1 << meter_group; + adrdist.movereg_ad_meter_alu_to_logical_xbar_ctl[m->logical_id / 8U].set_subfield( + 4 | meter_group, 3 * (m->logical_id % 8U), 3); + if (first_match) + adrdist.movereg_meter_ctl[meter_group].movereg_meter_ctl_lt = m->logical_id; + if (direct) { + if (first_match) + adrdist.movereg_meter_ctl[meter_group].movereg_meter_ctl_direct = 1; + adrdist.movereg_ad_direct[MoveReg::METER] |= 1U << m->logical_id; + } + first_match = false; + } + adrdist.movereg_meter_ctl[meter_group].movereg_ad_meter_shift = format->log2size; + if (push_on_overflow) { + adrdist.oflo_adr_user[0] = adrdist.oflo_adr_user[1] = AdrDist::METER; + adrdist.deferred_oflo_ctl = 1 << (home->row - 8) / 2U; + } + 
adrdist.packet_action_at_headertime[1][meter_group] = 1; + } + write_logging_regs(regs); + for (auto &hd : hash_dist) hd.write_regs(regs, this); + if (gress == INGRESS || gress == GHOST) { + merge.meter_alu_thread[0].meter_alu_thread_ingress |= 1U << meter_group; + merge.meter_alu_thread[1].meter_alu_thread_ingress |= 1U << meter_group; + } else if (gress == EGRESS) { + merge.meter_alu_thread[0].meter_alu_thread_egress |= 1U << meter_group; + merge.meter_alu_thread[1].meter_alu_thread_egress |= 1U << meter_group; + } + auto &salu = regs.rams.map_alu.meter_group[meter_group].stateful; + salu.stateful_ctl.salu_enable = 1; + salu.stateful_ctl.salu_output_pred_shift = pred_shift / 4; + salu.stateful_ctl.salu_output_pred_comb_shift = pred_comb_shift; + // The reset value for the CMP opcode is enabled by default -- we want to disable + // any unused cmp units + for (auto &inst : salu.salu_instr_cmp_alu) { + for (auto &alu : inst) { + if (!alu.salu_cmp_opcode.modified()) { + alu.salu_cmp_opcode = 2; + } + } + } + if (gress == EGRESS) { + regs.rams.map_alu.meter_group[meter_group].meter.meter_ctl.meter_alu_egress = 1; + } + if (math_table) { + for (size_t i = 0; i < math_table.data.size(); ++i) + salu.salu_mathtable[i / 4U].set_subfield(math_table.data[i], 8 * (i % 4U), 8); + salu.salu_mathunit_ctl.salu_mathunit_output_scale = math_table.scale & 0x3fU; + salu.salu_mathunit_ctl.salu_mathunit_exponent_invert = math_table.invert; + switch (math_table.shift) { + case -1: + salu.salu_mathunit_ctl.salu_mathunit_exponent_shift = 2; + break; + case 0: + salu.salu_mathunit_ctl.salu_mathunit_exponent_shift = 0; + break; + case 1: + salu.salu_mathunit_ctl.salu_mathunit_exponent_shift = 1; + break; + } + } +} + +void StatefulTable::gen_tbl_cfg(json::vector &out) const { + // FIXME -- factor common Synth2Port stuff + int size = (layout_size() - 1) * 1024 * (128U / format->size); + json::map &tbl = *base_tbl_cfg(out, "stateful", size); + unsigned alu_width = format->size / (dual_mode ? 
2 : 1); + tbl["initial_value_lo"] = initial_value_lo; + tbl["initial_value_hi"] = initial_value_hi; + tbl["alu_width"] = alu_width; + tbl["dual_width_mode"] = dual_mode; + json::vector &act_to_sful_instr_slot = tbl["action_to_stateful_instruction_slot"]; + if (actions) { + for (auto &a : *actions) { + for (auto &i : a.instr) { + if ((i->name() == "set_bit_at") || (i->name() == "set_bitc_at")) + tbl["set_instr_adjust_total"] = a.code; + if ((i->name() == "set_bit") || (i->name() == "set_bitc")) + tbl["set_instr"] = a.code; + if ((i->name() == "clr_bit_at") || (i->name() == "clr_bitc_at")) + tbl["clr_instr_adjust_total"] = a.code; + if ((i->name() == "clr_bit") || (i->name() == "clr_bitc")) + tbl["clr_instr"] = a.code; + } + } + } + // Add action handle and instr slot for action which references stateful + for (auto *m : match_tables) { + if (auto *acts = m->get_actions()) { + for (auto &a : *acts) { + Actions::Action *stful_action = action_for_table_action(m, &a); + if (!stful_action) continue; + bool act_present = false; + // Do not add handle if already present, if stateful spans + // multiple stages this can happen as action handles are unique + // and this code will get called again + for (auto &s : act_to_sful_instr_slot) { + auto s_handle = s->to()["action_handle"]; + if (*s_handle->as_number() == a.handle) { + act_present = true; + break; + } + } + if (act_present) continue; + json::map instr_slot; + instr_slot["action_handle"] = a.handle; + instr_slot["instruction_slot"] = stful_action->code; + act_to_sful_instr_slot.push_back(std::move(instr_slot)); + } + } + } + json::vector ®ister_file = tbl["register_params"]; + for (size_t i = 0; i < const_vals.size(); i++) { + if (!const_vals[i].is_param) continue; + json::map register_file_row; + register_file_row["register_file_index"] = i; + register_file_row["initial_value"] = const_vals[i].value; + register_file_row["name"] = const_vals[i].param_name; + register_file_row["handle"] = const_vals[i].param_handle; + 
register_file.push_back(std::move(register_file_row));
+    }
+    if (bound_selector) tbl["bound_to_selection_table_handle"] = bound_selector->handle();
+    json::map &stage_tbl = *add_stage_tbl_cfg(tbl, "stateful", size);
+    add_alu_index(stage_tbl, "meter_alu_index");
+    gen_tbl_cfg(tbl, stage_tbl);
+    if (context_json) stage_tbl.merge(*context_json);
+}
+
+DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(StatefulTable, TARGET_CLASS)
+FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void StatefulTable::write_action_regs,
+                      (mau_regs & regs, const Actions::Action *act),
+                      { write_action_regs_vt(regs, act); })
+FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void StatefulTable::write_merge_regs,
+                      (mau_regs & regs, MatchTable *match, int type, int bus,
+                       const std::vector<Call::Arg> &args),
+                      { write_merge_regs_vt(regs, match, type, bus, args); })
diff --git a/backends/tofino/bf-asm/synth2port.cpp b/backends/tofino/bf-asm/synth2port.cpp
new file mode 100644
index 00000000000..f86d33e5254
--- /dev/null
+++ b/backends/tofino/bf-asm/synth2port.cpp
@@ -0,0 +1,177 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "data_switchbox.h" +#include "input_xbar.h" +#include "lib/algorithm.h" +#include "misc.h" + +void Synth2Port::common_init_setup(const VECTOR(pair_t) & data, bool, P4Table::type p4type) { + setup_layout(layout, data); + if (auto *fmt = get(data, "format")) { + if (CHECKTYPEPM(*fmt, tMAP, fmt->map.size > 0, "non-empty map")) + format.reset(new Format(this, fmt->map)); + } +} + +bool Synth2Port::common_setup(pair_t &kv, const VECTOR(pair_t) & data, P4Table::type p4type) { + if (kv.key == "vpns") { + if (kv.value == "null") { + no_vpns = true; + } else if (CHECKTYPE(kv.value, tVEC)) { + setup_vpns(layout, &kv.value.vec, true); + } + } else if (kv.key == "maprams") { + setup_maprams(kv.value); + } else if (kv.key == "global_binding") { + global_binding = get_bool(kv.value); + } else if (kv.key == "per_flow_enable") { + if (CHECKTYPE(kv.value, tSTR)) { + per_flow_enable = 1; + per_flow_enable_param = kv.value.s; + } + } else if (kv.key == "p4") { + if (CHECKTYPE(kv.value, tMAP)) p4_table = P4Table::get(p4type, kv.value.map); + } else if (kv.key == "context_json") { + setup_context_json(kv.value); + } else if (kv.key == "format" || kv.key == "row" || kv.key == "logical_row" || + kv.key == "column" || kv.key == "bus") { + /* already done in setup_layout */ + } else if (kv.key == "logical_bus") { + if (CHECKTYPE2(kv.value, tSTR, tVEC)) { + if (kv.value.type == tSTR) { + if (*kv.value.s != 'A' && *kv.value.s != 'O' && *kv.value.s != 'S') + error(kv.value.lineno, "Invalid logical bus %s", kv.value.s); + } else { + for (auto &v : kv.value.vec) { + if (CHECKTYPE(v, tSTR)) { + if (*v.s != 'A' && *v.s != 'O' && *v.s != 'S') + error(v.lineno, "Invalid logical bus %s", v.s); + } + } + } + } + } else if (kv.key == "home_row") { + home_lineno = kv.value.lineno; + if (CHECKTYPE2(kv.value, tINT, tVEC)) { + if (kv.value.type == tINT) { 
+                if (kv.value.i >= 0 && kv.value.i < LOGICAL_SRAM_ROWS)
+                    home_rows.insert(kv.value.i);
+                else
+                    error(kv.value.lineno, "Invalid home row %" PRId64 "", kv.value.i);
+            } else {
+                for (auto &v : kv.value.vec) {
+                    if (CHECKTYPE(v, tINT)) {
+                        if (v.i >= 0 && v.i < LOGICAL_SRAM_ROWS)
+                            home_rows.insert(v.i);
+                        else
+                            error(v.lineno, "Invalid home row %" PRId64 "", v.i);
+                    }
+                }
+            }
+        }
+    } else {
+        return false;
+    }
+    return true;
+}
+
+void Synth2Port::pass1() {
+    LOG1("### Synth2Port table " << name() << " pass1 " << loc());
+    AttachedTable::pass1();
+}
+
+void Synth2Port::alloc_vpns(Target::Tofino) { AttachedTable::alloc_vpns(); }
+
+void Synth2Port::pass2() { LOG1("### Synth2Port table " << name() << " pass2 " << loc()); }
+
+void Synth2Port::pass3() { LOG1("### Synth2Port table " << name() << " pass3 " << loc()); }
+
+json::map *Synth2Port::add_stage_tbl_cfg(json::map &tbl, const char *type, int size) const {
+    json::map &stage_tbl = *AttachedTable::add_stage_tbl_cfg(tbl, type, size);
+    std::string hr = how_referenced();
+    if (hr.empty()) hr = direct ? "direct" : "indirect";
+    tbl["how_referenced"] = hr;
+    int entries = 1;
+    if (format) {
+        BUG_CHECK(format->log2size <= 7);
+        if (format->groups() > 1) {
+            BUG_CHECK(format->log2size == 7);
+            entries = format->groups();
+        } else {
+            entries = 128U >> format->log2size;
+        }
+    }
+    add_pack_format(stage_tbl, 128, 1, entries);
+    stage_tbl["memory_resource_allocation"] =
+        gen_memory_resource_allocation_tbl_cfg("sram", layout, true);
+    return &stage_tbl;
+}
+
+void Synth2Port::add_alu_indexes(json::map &stage_tbl, std::string alu_indexes) const {
+    json::vector home_alu;
+
+    for (auto row : home_rows) home_alu.push_back(row / 4U);
+
+    stage_tbl[alu_indexes] = home_alu.clone();
+}
+
+std::vector<int> Synth2Port::determine_spare_bank_memory_units(Target::Tofino) const {
+    std::vector<int> spare_mem;
+    int vpn_ctr = 0;
+    int minvpn, spare_vpn;
+
+    // Retrieve the Spare VPN
+    layout_vpn_bounds(minvpn, spare_vpn, false);
+    for (auto &row : layout) {
+        auto vpn_itr = row.vpns.begin();
+        for (auto &ram : row.memunits) {
+            BUG_CHECK(ram.stage == INT_MIN && ram.row == row.row, "bogus %s in row %d", ram.desc(),
+                      row.row);
+            if (vpn_itr != row.vpns.end()) vpn_ctr = *vpn_itr++;
+            if (spare_vpn == vpn_ctr) {
+                spare_mem.push_back(json_memunit(ram));
+                if (table_type() == SELECTION || table_type() == COUNTER || table_type() == METER ||
+                    table_type() == STATEFUL)
+                    continue;
+            }
+        }
+    }
+    return spare_mem;
+}
+
+int Synth2Port::get_home_row_for_row(int row) const {
+    for (int home_row : home_rows) {
+        // Tofino1 have an overflow bus in the middle of the SRAM array
+        if (options.target == TOFINO)
+            return home_row;
+        else if (row / 8 == home_row / 8)
+            return home_row;
+    }
+    BUG();
+    return -1;
+}
+
+template <class REGS>
+void Synth2Port::write_regs_vt(REGS &regs) {
+    // FIXME move common Counter/Meter/StatefulTable::write_regs_vt stuff here
+}
+
+REGSETS_IN_CLASS(Tofino, TARGET_OVERLOAD, void Synth2Port::write_regs, (mau_regs & regs),
+                 { write_regs_vt(regs); })
diff --git a/backends/tofino/bf-asm/tables.cpp 
b/backends/tofino/bf-asm/tables.cpp new file mode 100644 index 00000000000..5d915918f05 --- /dev/null +++ b/backends/tofino/bf-asm/tables.cpp @@ -0,0 +1,3357 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tables.h" + +#include +#include + +#include "action_bus.h" +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/stage.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" +#include "misc.h" + +// template specialization declarations + +const char *MemUnit::desc() const { + static char buffer[256], *p = buffer; + char *end = buffer + sizeof(buffer), *rv; + do { + if (end - p < 7) p = buffer; + rv = p; + if (stage != INT_MIN) + p += snprintf(p, end - p, "Mem %d,%d,%d", stage, row, col); + else if (row >= 0) + p += snprintf(p, end - p, "Mem %d,%d", row, col); + else + p += snprintf(p, end - p, "Mem %d", col); + } while (p++ >= end); + return rv; +} + +bool Table::Layout::operator==(const Table::Layout &a) const { + return row == a.row && bus == a.bus && word == a.word && memunits == a.memunits; + // ignoring other fields as if the above are all the same, will use the same resources +} + +unsigned StatefulTable::const_info_t::unique_register_param_handle = REGISTER_PARAM_HANDLE_START; + +std::map *Table::all; +std::vector
*Table::by_uid; +std::map *Table::Type::all; + +Table::Table(int line, std::string &&n, gress_t gr, Stage *s, int lid) + : // NOLINT(whitespace/operators) + name_(n), + stage(s), + gress(gr), + lineno(line), + logical_id(lid) { + if (!all) all = new std::map; + if (!by_uid) by_uid = new std::vector
; + uid = by_uid->size(); + by_uid->push_back(this); + if (all->count(name_)) { + error(lineno, "Duplicate table %s", name()); + error(all->at(name_)->lineno, "previously defined here"); + } + all->emplace(name_, this); + if (stage) stage->all_refs.insert(&stage); +} +Table::~Table() { + BUG_CHECK(by_uid && uid >= 0 && uid < by_uid->size(), "invalid uid %d in table", uid); + all->erase(name_); + (*by_uid)[uid] = nullptr; + if (stage) stage->all_refs.erase(&stage); + if (all->empty()) { + delete all; + delete by_uid; + all = nullptr; + by_uid = nullptr; + } +} + +Table::Type::Type(std::string &&name) { // NOLINT(whitespace/operators) + if (!all) all = new std::map(); + if (get(name)) { + fprintf(stderr, "Duplicate table type %s\n", name.c_str()); + exit(1); + } + self = all->emplace(name, this).first; +} + +Table::Type::~Type() { + all->erase(self); + if (all->empty()) { + delete all; + all = nullptr; + } +} + +Table::NextTables::NextTables(value_t &v) : lineno(v.lineno) { + if (v.type == tVEC && (Target::LONG_BRANCH_TAGS() > 0 || v.vec.size == 0)) { + for (auto &el : v.vec) + if (CHECKTYPE(el, tSTR)) next.emplace(el); + } else if (CHECKTYPE(v, tSTR)) { + if (v != "END") next.emplace(v); + } +} + +bool Table::NextTables::can_use_lb(int stage, const NextTables &lbrch) { + if (options.disable_long_branch) return false; + if (!lbrch.subset_of(*this)) return false; + return true; +} + +void Table::NextTables::resolve_long_branch(const Table *tbl, + const std::map &lbrch) { + if (resolved) return; + resolved = true; + for (auto &lb : lbrch) { + if (can_use_lb(tbl->stage->stageno, lb.second)) { + lb_tags |= 1U << lb.first; + } + } + for (auto &lb : tbl->long_branch) { + if (can_use_lb(tbl->stage->stageno, lb.second)) { + lb_tags |= 1U << lb.first; + } + } + for (auto &n : next) { + if (!n) continue; + if (Target::LONG_BRANCH_TAGS() > 0 && !options.disable_long_branch) { + if (n->stage->stageno <= tbl->stage->stageno + 1) // local or global exec + continue; + auto 
lb_covers = [this, n](const std::pair &lb) -> bool { + return ((lb_tags >> lb.first) & 1) && lb.second.next.count(n); + }; + if (std::any_of(lbrch.begin(), lbrch.end(), lb_covers)) continue; + if (std::any_of(tbl->long_branch.begin(), tbl->long_branch.end(), lb_covers)) continue; + } + if (next_table_) { + error(n.lineno, "Can't have multiple next tables for table %s", tbl->name()); + break; + } + next_table_ = n; + } +} + +unsigned Table::NextTables::next_in_stage(int stage) const { + unsigned rv = 0; + for (auto &n : next) + if (n->stage->stageno == stage) rv |= 1U << n->logical_id; + return rv; +} + +bool Table::NextTables::need_next_map_lut() const { + BUG_CHECK(resolved); + return next.size() > 1 || (next.size() == 1 && !next_table_); +} + +void Table::NextTables::force_single_next_table() { + BUG_CHECK(resolved); // must be resolved already + if (next.size() > 1) + error(lineno, + "Can't support multiple next tables; next is directly in overhead " + "without using 8-entry lut"); + if (next.size() == 1) next_table_ = *next.begin(); +} + +int Table::table_id() const { return (stage->stageno << 4) + logical_id; } + +void Table::Call::setup(const value_t &val, Table *tbl) { + if (!CHECKTYPE2(val, tSTR, tCMD)) return; + if (val.type == tSTR) { + Ref::operator=(val); + return; + } + Ref::operator=(val[0]); + for (int i = 1; i < val.vec.size; i++) { + int mode; + if (val[i].type == tINT) { + args.emplace_back(val[i].i); + } else if (val[i].type == tCMD && val[i] == "hash_dist") { + if (PCHECKTYPE(val[i].vec.size > 1, val[i][1], tINT)) { + if (auto hd = tbl->find_hash_dist(val[i][1].i)) + args.emplace_back(hd); + else + error(val[i].lineno, "hash_dist %" PRId64 " not defined in table %s", + val[i][1].i, tbl->name()); + } + } else if ((mode = StatefulTable::parse_counter_mode(val[i])) >= 0) { + args.emplace_back(Arg::Counter, mode); + } else if (!CHECKTYPE(val[i], tSTR)) { + // syntax error message emit by CHEKCTYPE + } else if (auto arg = tbl->lookup_field(val[i].s)) 
{ + if (arg->bits.size() != 1) error(val[i].lineno, "arg fields can't be split in format"); + args.emplace_back(arg); + } else { + args.emplace_back(val[i].s); + } + } + lineno = val.lineno; +} + +unsigned Table::Call::Arg::size() const { + switch (type) { + case Field: + return fld ? fld->size : 0; + case HashDist: + return hd ? hd->expand >= 0 ? 23 : 16 : 0; + case Counter: + return 23; + case Const: + case Name: + return 0; + default: + BUG(); + } + return -1; +} + +static void add_row(int lineno, std::vector &layout, int row) { + layout.push_back(Table::Layout(lineno, row)); +} + +static int add_rows(std::vector &layout, const value_t &rows) { + if (!CHECKTYPE2(rows, tINT, tRANGE)) return 1; + if (rows.type == tINT) { + add_row(rows.lineno, layout, rows.i); + } else { + int step = rows.range.lo > rows.range.hi ? -1 : 1; + for (int i = rows.range.lo; i != rows.range.hi; i += step) add_row(rows.lineno, layout, i); + add_row(rows.lineno, layout, rows.range.hi); + } + return 0; +} + +static int add_col(int lineno, int stage, Table::Layout &row, int col) { + for (auto &mu : row.memunits) { + if (mu.stage == stage && mu.col == col) { + error(lineno, "column %d duplicated", col); + return 1; + } + } + row.memunits.emplace_back(stage, row.row, col); + return 0; +} + +static int add_cols(int stage, Table::Layout &row, const value_t &cols) { + int rv = 0; + if (cols.type == tVEC) { + if (cols.vec.size == 1) return add_cols(stage, row, cols.vec[0]); + for (auto &col : cols.vec) { + if (col.type == tVEC) { + error(col.lineno, "Column shape doesn't match rows"); + rv |= 1; + } else { + rv |= add_cols(stage, row, col); + } + } + return rv; + } + if (cols.type == tMAP && Target::SRAM_GLOBAL_ACCESS()) { + bitvec stages_seen; + for (auto &kv : cols.map) { + if (kv.key == "stage" && kv.key.type == tCMD && kv.key[1].type == tINT) + stage = kv.key[1].i; + else { + error(kv.key.lineno, "syntax error, expecting a stage number"); + continue; + } + if (stage < 0 || stage > 
Target::NUM_MAU_STAGES()) { + error(kv.key.lineno, "stage %d out of range", stage); + } else if (stages_seen[stage]) { + error(kv.key.lineno, "duplicate stage %d", stage); + } else { + rv |= add_cols(stage, row, kv.value); + } + } + return rv; + } + if (!CHECKTYPE2(cols, tINT, tRANGE)) return 1; + if (cols.type == tINT) return add_col(cols.lineno, stage, row, cols.i); + int step = cols.range.lo > cols.range.hi ? -1 : 1; + for (int i = cols.range.lo; i != cols.range.hi; i += step) + rv |= add_col(cols.lineno, stage, row, i); + rv |= add_col(cols.lineno, stage, row, cols.range.hi); + return rv; +} + +static int add_stages(Table::Layout &row, const value_t &stages) { + int rv = 0; + if (stages.type == tVEC) { + if (stages.vec.size == 1) return add_stages(row, stages.vec[0]); + for (auto &stg : stages.vec) { + if (stg.type == tVEC) { + error(stg.lineno, "Stages shape doesn't match rows"); + rv |= 1; + } else { + rv |= add_stages(row, stg); + } + } + return rv; + } + if (!CHECKTYPE2(stages, tINT, tRANGE)) return 1; + if (stages.type == tINT) return add_col(stages.lineno, stages.i, row, 0); + int step = stages.range.lo > stages.range.hi ? 
-1 : 1; + for (int i = stages.range.lo; i != stages.range.hi; i += step) + rv |= add_col(stages.lineno, i, row, 0); + rv |= add_col(stages.lineno, stages.range.hi, row, 0); + return rv; +} + +std::ostream &operator<<(std::ostream &out, const Table::Layout::bus_type_t type) { + switch (type) { + case Table::Layout::SEARCH_BUS: + return out << "search_bus"; + case Table::Layout::RESULT_BUS: + return out << "result_bus"; + case Table::Layout::TIND_BUS: + return out << "tind_bus"; + case Table::Layout::IDLE_BUS: + return out << "idle_bus"; + case Table::Layout::L2R_BUS: + return out << "l2r bus"; + case Table::Layout::R2L_BUS: + return out << "r2l bus"; + default: + return out << "[bus_t " << static_cast(type) << "]"; + } +} + +std::ostream &operator<<(std::ostream &out, const Table::Layout &l) { + if (l.home_row) out << "home_"; + out << "row=" << l.row; + for (auto [type, idx] : l.bus) out << " " << type << "=" << idx; + if (l.word >= 0) out << " word=" << l.word; + if (!l.memunits.empty()) { + const char *sep = ""; + out << " ["; + for (auto &unit : l.memunits) { + out << sep << unit; + sep = ", "; + } + out << ']'; + } + if (!l.vpns.empty()) { + const char *sep = ""; + out << " vpns=["; + for (auto vpn : l.vpns) { + out << sep << vpn; + sep = ", "; + } + out << ']'; + } + if (!l.maprams.empty()) { + const char *sep = ""; + out << " maprams=["; + for (auto mr : l.maprams) { + out << sep << mr; + sep = ", "; + } + out << ']'; + } + return out; +} + +int Table::setup_layout_attrib(std::vector &layout, const value_t &data, const char *what, + int Layout::*attr) { + if (!CHECKTYPE2(data, tINT, tVEC)) { + return 1; + } else if (data.type == tVEC) { + if (data.vec.size != static_cast(layout.size())) { + error(data.lineno, "%s shape doesn't match rows", what); + return 1; + } else { + for (int i = 0; i < data.vec.size; i++) { + if (CHECKTYPE(data.vec[i], tINT)) + layout[i].*attr = data.vec[i].i; + else + return 1; + } + } + } else { + for (auto &lrow : layout) lrow.*attr = 
data.i; + } + return 0; +} + +int Table::setup_layout_bus_attrib(std::vector &layout, const value_t &data, + const char *what, Layout::bus_type_t type) { + int limit = Target::NUM_BUS_OF_TYPE(type); + int err = 0; + if (limit <= 0) { + error(data.lineno, "No %s on target %s", to_string(type).c_str(), Target::name()); + return 1; + } else if (!CHECKTYPE2(data, tINT, tVEC)) { + return 1; + } else if (data.type == tVEC) { + if (data.vec.size != static_cast(layout.size())) { + error(data.lineno, "%s shape doesn't match rows", what); + return 1; + } else { + for (int i = 0; i < data.vec.size; i++) { + if (!CHECKTYPE(data.vec[i], tINT)) return 1; + if (data.vec[i].i >= limit) { + error(data.vec[i].lineno, "%" PRId64 " to large for %s", data.vec[i].i, + to_string(type).c_str()); + err = 1; + } + if (data.vec[i].i >= 0) layout[i].bus[type] = data.vec[i].i; + } + } + } else if (data.i < 0) { + error(data.lineno, "%s value %" PRId64 " invalid", what, data.i); + err = 1; + } else if (data.i >= limit) { + error(data.lineno, "%" PRId64 " to large for %s", data.i, to_string(type).c_str()); + err = 1; + } else { + for (auto &lrow : layout) lrow.bus[type] = data.i; + } + return err; +} + +void Table::setup_layout(std::vector &layout, const VECTOR(pair_t) & data, + const char *subname) { + auto *row = get(data, "row"); + if (!row && this->to()) row = get(data, "logical_row"); + if (!row) { + if (table_type() != TERNARY && Target::TABLES_REQUIRE_ROW()) + error(lineno, "No 'row' attribute in table %s%s", name(), subname); + return; + } + int err = 0; + if (row->type == tVEC) + for (value_t &r : row->vec) err |= add_rows(layout, r); + else + err |= add_rows(layout, *row); + if (err) return; + bool global_access = + (table_type() == TERNARY) ? 
Target::TCAM_GLOBAL_ACCESS() : Target::SRAM_GLOBAL_ACCESS(); + if (global_access && table_type() == TERNARY && Target::TCAM_UNITS_PER_ROW() == 1) { + if (auto *stg = get(data, "stages")) { + if (stg->type == tVEC && stg->vec.size == static_cast(layout.size())) { + for (int i = 0; i < stg->vec.size; i++) err |= add_stages(layout[i], stg->vec[i]); + } else if (layout.size() == 1) + err |= add_stages(layout[0], *stg); + } else { + for (auto &lrow : layout) err |= add_col(lineno, this->stage->stageno, lrow, 0); + } + } else if (auto *col = get(data, "column")) { + int stage = global_access ? this->stage->stageno : INT_MIN; + if (col->type == tMAP && global_access) { + bitvec stages_seen; + for (auto &kv : col->map) { + if (kv.key.type == tINT) + stage = kv.key.i; + else if (kv.key == "stage" && kv.key.type == tCMD && kv.key[1].type == tINT) + stage = kv.key[1].i; + else { + error(kv.key.lineno, "syntax error, expecting a stage number"); + continue; + } + if (stage < 0 || stage > Target::NUM_STAGES(gress)) { + error(kv.key.lineno, "stage %d out of range", stage); + } else if (stages_seen[stage]) { + error(kv.key.lineno, "duplicate stage %d", stage); + } else { + if (kv.value.type == tVEC && kv.value.vec.size + 0U == layout.size()) { + for (int i = 0; i < kv.value.vec.size; i++) + err |= add_cols(stage, layout[i], kv.value.vec[i]); + } else { + for (auto &lrow : layout) + if ((err |= add_cols(stage, lrow, kv.value))) break; + } + } + } + } else if (col->type == tVEC && col->vec.size == static_cast(layout.size())) { + for (int i = 0; i < col->vec.size; i++) err |= add_cols(stage, layout[i], col->vec[i]); + } else { + for (auto &lrow : layout) + if ((err |= add_cols(stage, lrow, *col))) break; + } + } else if (layout.size() > 1) { + error(lineno, "No 'column' attribute in table %s%s", name(), subname); + return; + } + if (auto *bus = get(data, "bus")) + err |= Table::setup_layout_bus_attrib(layout, *bus, "Bus", default_bus_type()); + else if (auto *bus = get(data, 
"search_bus")) + err |= Table::setup_layout_bus_attrib(layout, *bus, "Bus", Layout::SEARCH_BUS); + if (auto *bus = get(data, "lhbus")) + err |= Table::setup_layout_bus_attrib(layout, *bus, "R2L hbus", Layout::R2L_BUS); + if (auto *bus = get(data, "rhbus")) + err |= Table::setup_layout_bus_attrib(layout, *bus, "L2R hbus", Layout::L2R_BUS); + if (auto *bus = get(data, "result_bus")) + err |= Table::setup_layout_bus_attrib(layout, *bus, "Bus", Layout::RESULT_BUS); + if (auto *word = get(data, "word")) + err |= Table::setup_layout_attrib(layout, *word, "Word", &Layout::word); + if (err) return; + for (auto i = layout.begin(); i != layout.end(); i++) + for (auto j = i + 1; j != layout.end(); j++) + if (*i == *j) { + std::stringstream bus; + if (!i->bus.empty()) + bus << " " << i->bus.begin()->first << " " << i->bus.begin()->second; + error(i->lineno, "row %d%s duplicated in table %s%s", i->row, bus.str().c_str(), + name(), subname); + } +} + +void Table::setup_logical_id() { + if (logical_id >= 0) { + if (Table *old = stage->logical_id_use[logical_id]) { + error(lineno, "table %s wants logical id %d:%d", name(), stage->stageno, logical_id); + error(old->lineno, "already in use by %s", old->name()); + } + stage->logical_id_use[logical_id] = this; + } +} + +void Table::setup_maprams(value_t &v) { + if (!CHECKTYPE2(v, tINT, tVEC)) return; + VECTOR(value_t) *rams = &v.vec, single_ram; + if (v.type == tINT) { + // treat as a vector of length 1 + rams = &single_ram; + single_ram.size = single_ram.capacity = 1; + single_ram.data = &v; + } + auto r = rams->begin(); + for (auto &row : layout) { + if (r == rams->end()) { + error(r->lineno, "Mapram layout doesn't match table layout"); + break; + } + auto &maprow = *r++; + VECTOR(value_t) * maprow_rams, tmp; + if (maprow.type == tINT) { + if (layout.size() == 1) { + maprow_rams = rams; + } else { + // treat as a vector of length 1 + maprow_rams = &tmp; + tmp.size = tmp.capacity = 1; + tmp.data = &maprow; + } + } else if 
(CHECKTYPE(maprow, tVEC)) { + maprow_rams = &maprow.vec; + } else { + continue; + } + if (maprow_rams->size != static_cast(row.memunits.size())) { + error(r->lineno, "Mapram layout doesn't match table layout"); + continue; + } + for (auto mapcol : *maprow_rams) + if (CHECKTYPE(mapcol, tINT)) { + if (mapcol.i < 0 || mapcol.i >= MAPRAM_UNITS_PER_ROW) + error(mapcol.lineno, "Invalid mapram column %" PRId64 "", mapcol.i); + else + row.maprams.push_back(mapcol.i); + } + } +} + +/** + * Guarantees that the instruction call provided to the table has valid entries, and that + * if multiple choices are required, the compiler can make that choices. + * + * The instruction address is a two piece address. The first argument is the address bits + * location. The second argument is a per flow enable bit location. These are both required. + * Additionally, the keyword $DEFAULT means that that particular portion of the address comes + * from the default register. + * + * FIXME -- this code is a messy hack -- various target-specific special cases. Should try + * to figure out a better way to organize this. 
+ */ +bool Table::validate_instruction(Table::Call &call) const { + if (call.args.size() != 2) { + error(call.lineno, "Instruction call has invalid number of arguments"); + return false; + } + + bool field_address = false; + + if (call.args[0].name()) { + if (Target::GATEWAY_INHIBIT_INDEX() && call.args[0] == "$GATEWAY_IDX") { + field_address = true; + } else if (call.args[0] != "$DEFAULT") { + error(call.lineno, "Index %s for %s cannot be found", call.args[0].name(), + call->name()); + return false; + } + } else if (!call.args[0].field()) { + error(call.lineno, "Index for %s cannot be understood", call->name()); + return false; + } else { + field_address = true; + } + + if (call.args[1].name()) { + if (call.args[1] != "$DEFAULT") { + error(call.lineno, "Per flow enable %s for %s cannot be found", call.args[1].name(), + call->name()); + return false; + } + } else if (!call.args[1].field()) { + error(call.lineno, "Per flow enable for %s cannot be understood", call->name()); + return false; + } + + if (actions->hit_actions_count() > 1 && !field_address) + error(lineno, "No field to select between multiple action in table %s format", name()); + + return true; +} + +static bool column_match(const std::vector &a, const std::vector &b) { + auto it = b.begin(); + for (auto &u : a) { + if (it == b.end()) return false; + if (u.col != it->col) return false; + ++it; + } + return it == b.end(); +} + +void Table::setup_vpns(std::vector &layout, VECTOR(value_t) * vpn, bool allow_holes) { + int period, width, depth; + const char *period_name; + vpn_params(width, depth, period, period_name); + int word = width; + Layout *firstrow = 0; + auto vpniter = vpn ? 
vpn->begin() : 0; + int *vpn_ctr = new int[period]; + std::fill_n(vpn_ctr, period, get_start_vpn()); + std::vector used_vpns(period); + bool on_repeat = false; + for (auto &row : layout) { + if (++word < width) { + BUG_CHECK(firstrow); + if (!column_match(row.memunits, firstrow->memunits)) + error(row.lineno, "Columns across wide rows don't match in table %s", name()); + row.vpns = firstrow->vpns; + continue; + } + word = 0; + firstrow = &row; + row.vpns.resize(row.memunits.size()); + value_t *vpncoliter = 0; + for (int &el : row.vpns) { + // If VPN's are provided by the compiler, they need to match each + // element in the specified columns. Below code checks if all + // elements are present and errors out if there is any discrepancy. + if (vpniter) { + if (vpniter == vpn->end()) { + on_repeat = true; + vpniter = vpn->begin(); + } + if (CHECKTYPE2(*vpniter, tVEC, tINT)) { + if (vpniter->type == tVEC) { + if (!vpncoliter) { + if (static_cast(row.vpns.size()) != vpniter->vec.size) { + error(vpniter->lineno, + "Vpn entries for row %d is %d not equal to column " + "entries %d", + row.row, vpniter->vec.size, + static_cast(row.vpns.size())); + continue; + } else { + vpncoliter = vpniter->vec.begin(); + } + } + el = vpncoliter->i; + if (++vpncoliter == &*vpniter->vec.end()) ++vpniter; + continue; + } else if (vpniter->type == tINT) { + el = vpniter->i; + } + ++vpniter; + } + // Error out if VPN's are repeated in a table. For wide words, + // each individual word can have the same vpn + if (!on_repeat && used_vpns[period - 1][el].set(true)) + error(vpniter->lineno, "Vpn %d used twice in table %s", el, name()); + } else { + // If there is no word information provided in assembly (Ternary + // Indirect/Stats) tables, the allocation is always a single + // word. 
+ // For SRamMatchTables, this should be handled by SRamMatchTable::alloc_vpns(), + // so this code will never be hit + // FIXME -- move this to Table::alloc_vpns and only call setup_vpns when + // there's a vpn specified in the bfa? + if (row.word < 0) row.word = word; + el = vpn_ctr[row.word]; + if ((vpn_ctr[row.word] += period) == depth) vpn_ctr[row.word] = 0; + } + } + } + delete[] vpn_ctr; +} + +void Table::common_init_setup(const VECTOR(pair_t) & data, bool, P4Table::type) { + setup_layout(layout, data); + if (auto *fmt = get(data, "format")) { + if (CHECKTYPEPM(*fmt, tMAP, fmt->map.size > 0, "non-empty map")) + format.reset(new Format(this, fmt->map)); + } + if (auto *hd = get(data, "hash_dist")) HashDistribution::parse(hash_dist, *hd); +} + +bool Table::common_setup(pair_t &kv, const VECTOR(pair_t) & data, P4Table::type p4type) { + bool global_access = + (table_type() == TERNARY) ? Target::TCAM_GLOBAL_ACCESS() : Target::SRAM_GLOBAL_ACCESS(); + if (kv.key == "format" || kv.key == "row" || kv.key == "column" || kv.key == "bus") { + /* done in Table::common_init_setup */ + } else if (global_access && (kv.key == "stages" || kv.key == "lhbus" || kv.key == "rhbus")) { + /* done in Table::common_init_setup */ + } else if (kv.key == "action") { + action.setup(kv.value, this); + } else if (kv.key == "instruction") { + instruction.setup(kv.value, this); + } else if (kv.key == "action_enable") { + if (CHECKTYPE(kv.value, tINT)) action_enable = kv.value.i; + if (get(data, "action")) enable_action_data_enable = true; + enable_action_instruction_enable = true; + } else if (kv.key == "enable_action_data_enable") { + enable_action_data_enable = get_bool(kv.value); + } else if (kv.key == "enable_action_instruction_enable") { + enable_action_instruction_enable = get_bool(kv.value); + } else if (kv.key == "actions") { + if (CHECKTYPE(kv.value, tMAP)) actions.reset(new Actions(this, kv.value.map)); + } else if (kv.key == "action_bus") { + if (CHECKTYPE(kv.value, tMAP)) 
action_bus = ActionBus::create(this, kv.value.map); + } else if ((kv.key == "default_action") || (kv.key == "default_only_action")) { + if (kv.key == "default_only_action") default_only_action = true; + default_action_lineno = kv.value.lineno; + if (CHECKTYPE2(kv.value, tSTR, tCMD)) + if (CHECKTYPE(kv.value, tSTR)) default_action = kv.value.s; + } else if (kv.key == "default_action_parameters") { + if (CHECKTYPE(kv.value, tMAP)) + for (auto &v : kv.value.map) + if (CHECKTYPE(v.key, tSTR) && CHECKTYPE(v.value, tSTR)) + default_action_parameters[v.key.s] = v.value.s; + } else if (kv.key == "default_action_handle") { + default_action_handle = kv.value.i; + } else if (kv.key == "hit") { + if (!hit_next.empty()) { + error(kv.key.lineno, "Specifying both 'hit' and 'next' in table %s", name()); + } else if (kv.value.type == tVEC) { + for (auto &v : kv.value.vec) hit_next.emplace_back(v); + } else { + hit_next.emplace_back(kv.value); + } + } else if (kv.key == "miss") { + if (miss_next.set()) { + error(kv.key.lineno, "Specifying both 'miss' and 'next' in table %s", name()); + } else { + miss_next = kv.value; + } + } else if (kv.key == "next") { + if (!hit_next.empty()) { + error(kv.key.lineno, "Specifying both 'hit' and 'next' in table %s", name()); + } else if (miss_next.set()) { + error(kv.key.lineno, "Specifying both 'miss' and 'next' in table %s", name()); + } else { + miss_next = kv.value; + hit_next.emplace_back(miss_next); + } + } else if (kv.key == "long_branch" && Target::LONG_BRANCH_TAGS() > 0) { + if (options.disable_long_branch) error(kv.key.lineno, "long branches disabled"); + if (CHECKTYPE(kv.value, tMAP)) { + for (auto &lb : kv.value.map) { + if (lb.key.type != tINT || lb.key.i < 0 || lb.key.i >= Target::LONG_BRANCH_TAGS()) + error(lb.key.lineno, "Invalid long branch tag %s", value_desc(lb.key)); + else if (long_branch.count(lb.key.i)) + error(lb.key.lineno, "Duplicate long branch tag %" PRId64, lb.key.i); + else + long_branch.emplace(lb.key.i, lb.value); + 
} + } + } else if (kv.key == "vpns") { + if (CHECKTYPESIZE(kv.value, tVEC)) setup_vpns(layout, &kv.value.vec); + } else if (kv.key == "p4") { + if (CHECKTYPE(kv.value, tMAP)) p4_table = P4Table::get(p4type, kv.value.map); + } else if (kv.key == "p4_param_order") { + if (CHECKTYPE(kv.value, tMAP)) { + unsigned position = 0; + for (auto &v : kv.value.map) { + if ((CHECKTYPE(v.key, tSTR)) && (CHECKTYPE(v.value, tMAP))) { + p4_param p(v.key.s); + for (auto &w : v.value.map) { + if (!CHECKTYPE(w.key, tSTR)) continue; + + if (w.key == "type" && CHECKTYPE(w.value, tSTR)) + p.type = w.value.s; + else if (w.key == "size" && CHECKTYPE(w.value, tINT)) + p.bit_width = w.value.i; + else if (w.key == "full_size" && CHECKTYPE(w.value, tINT)) + p.bit_width_full = w.value.i; + else if (w.key == "mask") + p.mask = get_bitvec(w.value); + else if (w.key == "alias" && CHECKTYPE(w.value, tSTR)) + p.alias = w.value.s; + else if (w.key == "key_name" && CHECKTYPE(w.value, tSTR)) + p.key_name = w.value.s; + else if (w.key == "start_bit" && CHECKTYPE(w.value, tINT)) + p.start_bit = w.value.i; + else if (w.key == "context_json" && CHECKTYPE(w.value, tMAP)) + p.context_json = toJson(w.value.map); + else + error(lineno, "Incorrect param type %s in p4_param_order", w.key.s); + } + // Determine position in p4_param_order. Repeated fields get + // the same position which is set on first occurrence. + // Driver relies on position to order fields. The case when + // we have multiple slices of same field based on position + // only one location is assigned for the entire field. + // However if the field has a name annotation (key_name) + // this overrides the position even if the field belongs to + // the same slice. 
+ bool ppFound = false; + for (auto &pp : p4_params_list) { + if ((pp.name == p.name) && (pp.key_name == p.key_name)) { + ppFound = true; + p.position = pp.position; + break; + } + } + if (!ppFound) p.position = position++; + p4_params_list.emplace_back(std::move(p)); + } + } + } + } else if (kv.key == "context_json") { + setup_context_json(kv.value); + } else { + return false; + } + return true; +} + +void Table::setup_context_json(value_t &v) { + if (!CHECKTYPE(v, tMAP)) return; + + auto map = toJson(v.map); + if (context_json) + context_json->merge(*map); + else + context_json = std::move(map); +} + +/** check two tables to see if they can share ram. + * FIXME -- for now we just allow a STATEFUL and a SELECTION to share -- we should + * FIXME -- check to make sure they're mutually exclusive and use the memory in + * FIXME -- a compatible way + */ +bool Table::allow_ram_sharing(const Table *t1, const Table *t2) { + if (t1->table_type() == STATEFUL && t2->table_type() == SELECTION && + t1->to()->bound_selector == t2) + return true; + if (t2->table_type() == STATEFUL && t1->table_type() == SELECTION && + t2->to()->bound_selector == t1) + return true; + return false; +} + +/** check two tables to see if they can share action bus + * Two ATCAM tables or their action tables can occur in the same stage and share + * bytes on the action bus which is valid as they are always mutually exclusive + */ +bool Table::allow_bus_sharing(Table *t1, Table *t2) { + if (!t1 || !t2) return false; + if ((t1->table_type() == ATCAM) && (t2->table_type() == ATCAM) && + (t1->p4_name() == t2->p4_name())) + return true; + if ((t1->table_type() == ACTION) && (t2->table_type() == ACTION) && + (t1->p4_name() == t2->p4_name())) { + // Check if action tables are attached to atcam's + auto *m1 = t1->to()->get_match_table(); + auto *m2 = t2->to()->get_match_table(); + if (m1 && m2) { + if ((m1->table_type() == ATCAM) && (m2->table_type() == ATCAM)) return true; + } + } + return false; +} + +void 
Table::alloc_rams(bool logical, BFN::Alloc2Dbase
&use, + BFN::Alloc2Dbase
*bus_use, Layout::bus_type_t bus_type) { + for (auto &row : layout) { + for (auto &memunit : row.memunits) { + BUG_CHECK(memunit.stage == INT_MIN && memunit.row == row.row, "memunit fail"); + int r = row.row, c = memunit.col; + if (logical) { + c += 6 * (r & 1); + r >>= 1; + } + try { + if (Table *old = use[r][c]) { + if (!allow_ram_sharing(this, old)) { + error(lineno, + "Table %s trying to use (%d,%d) which is already in use " + "by table %s", + name(), r, c, old->name()); + } + } else { + use[r][c] = this; + } + } catch (std::out_of_range & /*e*/) { + error(lineno, "Table %s using out-of-bounds (%d,%d)", name(), r, c); + } + } + if (bus_use && row.bus.count(bus_type)) { + int bus = row.bus.at(bus_type); + if (Table *old = (*bus_use)[row.row][bus]) { + if (old != this && old->p4_name() != p4_name()) + error(lineno, + "Table %s trying to use bus %d on row %d which is already in " + "use by table %s", + name(), bus, row.row, old->name()); + } else { + (*bus_use)[row.row][bus] = this; + } + } + } +} + +void Table::alloc_global_busses() { BUG(); } +void Table::alloc_global_srams() { BUG(); } +void Table::alloc_global_tcams() { BUG(); } + +void Table::alloc_busses(BFN::Alloc2Dbase
&bus_use, Layout::bus_type_t bus_type) { + for (auto &row : layout) { + // If row.memunits is empty, we don't really need a bus here (won't use it + // for anything). + // E.g. An exact match table with 4 or less static entries (JBay) or 1 + // static entry (Tofino) + // In these examples compiler does gateway optimization where static + // entries are encoded in the gateway and no RAM's are used. We skip bus + // allocation in these cases. + if (!row.bus.count(bus_type) && !row.memunits.empty()) { + // FIXME -- iterate over bus_use[row.row] rather than assuming 2 rows + if (bus_use[row.row][0] == this) + row.bus[bus_type] = 0; + else if (bus_use[row.row][1] == this) + row.bus[bus_type] = 1; + else if (!bus_use[row.row][0]) + bus_use[row.row][row.bus[bus_type] = 0] = this; + else if (!bus_use[row.row][1]) + bus_use[row.row][row.bus[bus_type] = 1] = this; + else + error(lineno, "No bus available on row %d for table %s", row.row, name()); + } + } +} + +void Table::alloc_id(const char *idname, int &id, int &next_id, int max_id, bool order, + BFN::Alloc1Dbase
&use) { + if (id >= 0) { + next_id = id; + return; + } + while (++next_id < max_id && use[next_id]) { + } + if (next_id >= max_id && !order) { + next_id = -1; + while (++next_id < max_id && use[next_id]) { + } + } + if (next_id < max_id) + use[id = next_id] = this; + else + error(lineno, "Can't pick %s id for table %s (ran out)", idname, name()); +} + +void Table::alloc_maprams() { + if (!Target::SYNTH2PORT_NEED_MAPRAMS()) return; + for (auto &row : layout) { + int sram_row = row.row / 2; + if ((row.row & 1) == 0) { + error(row.lineno, "Can only use 2-port rams on right side srams (odd logical rows)"); + continue; + } + if (row.maprams.empty()) { + int use = 0; + for (unsigned i = 0; i < row.memunits.size(); i++) { + while (use < MAPRAM_UNITS_PER_ROW && stage->mapram_use[sram_row][use]) use++; + if (use >= MAPRAM_UNITS_PER_ROW) { + error(row.lineno, "Ran out of maprams on row %d in stage %d", sram_row, + stage->stageno); + break; + } + row.maprams.push_back(use); + stage->mapram_use[sram_row][use++] = this; + } + } else { + for (auto mapcol : row.maprams) { + if (auto *old = stage->mapram_use[sram_row][mapcol]) { + if (!allow_ram_sharing(this, old)) + error(lineno, + "Table %s trying to use mapram %d,%d which is use by " + "table %s", + name(), sram_row, mapcol, old->name()); + } else { + stage->mapram_use[sram_row][mapcol] = this; + } + } + } + } +} + +void Table::alloc_vpns() { + if (no_vpns || layout_size() == 0 || layout[0].vpns.size() > 0) return; + setup_vpns(layout, 0); +} + +void Table::check_next(const Table::Ref &n) { + if (n.check()) { + if (logical_id >= 0 && n->logical_id >= 0 ? 
table_id() > n->table_id() + : stage->stageno > n->stage->stageno) + error(n.lineno, "Next table %s comes before %s", n->name(), name()); + if (gress != n->gress) + error(n.lineno, "Next table %s in %s when %s is in %s", n->name(), + P4Table::direction_name(n->gress).c_str(), name(), + P4Table::direction_name(gress).c_str()); + // Need to add to the predication map + Table *tbl = get_match_table(); + if (!tbl) tbl = this; // standalone gateway + if (tbl != n) { + n->pred[tbl]; // ensure that its in the map, even as an empty set + } + } +} + +void Table::for_all_next(std::function fn) { + for (auto &n1 : hit_next) + for (auto &n2 : n1) fn(n2); + for (auto &n : miss_next) fn(n); +} + +void Table::check_next(NextTables &next) { + for (auto &n : next) check_next(n); + Table *tbl = get_match_table(); + if (!tbl) tbl = this; + next.resolve_long_branch(tbl, long_branch); +} + +void Table::check_next() { + for (auto &lb : long_branch) { + for (auto &t : lb.second) { + if (t.check()) { + if (t->stage->stageno <= stage->stageno) + error(t.lineno, "Long branch table %s is not in a later stage than %s", + t->name(), name()); + else if (stage->stageno + 1 == t->stage->stageno) + warning(t.lineno, "Long branch table %s is the next stage after %s", t->name(), + name()); + if (gress != t->gress) + error(t.lineno, "Long branch table %s in %s when %s is in %s", t->name(), + P4Table::direction_name(t->gress).c_str(), name(), + P4Table::direction_name(gress).c_str()); + } + } + } + for (auto &hn : hit_next) check_next(hn); + for (auto &hn : extra_next_lut) check_next(hn); + check_next(miss_next); +} + +void Table::set_pred() { + if (actions == nullptr) return; + for (auto &act : *actions) { + if (!act.default_only) + for (auto &n : act.next_table_ref) n->pred[this].insert(&act); + for (auto &n : act.next_table_miss_ref) n->pred[this].insert(&act); + } +} + +bool Table::choose_logical_id(const slist
*work) { + if (logical_id >= 0) return true; + if (work && find(*work, this) != work->end()) { + error(lineno, "Logical table loop with table %s", name()); + for (auto *tbl : *work) { + if (tbl == this) break; + warning(tbl->lineno, "loop involves table %s", tbl->name()); + } + return false; + } + slist
local(this, work); + for (auto *p : Keys(pred)) + if (!p->choose_logical_id(&local)) return false; + int min_id = 0, max_id = LOGICAL_TABLES_PER_STAGE - 1; + for (auto *p : Keys(pred)) + if (p->stage->stageno == stage->stageno && p->logical_id >= min_id) + min_id = p->logical_id + 1; + for_all_next([&max_id, this](const Ref &n) { + if (n && n->stage->stageno == stage->stageno && n->logical_id >= 0 && + n->logical_id <= max_id) { + max_id = n->logical_id - 1; + } + }); + for (int id = min_id; id <= max_id; ++id) { + if (!stage->logical_id_use[id]) { + logical_id = id; + stage->logical_id_use[id] = this; + return true; + } + } + error(lineno, "Can't find a logcial id for table %s", name()); + return false; +} + +void Table::need_bus(int lineno, BFN::Alloc1Dbase
&use, int idx, const char *busname) { + if (use[idx] && use[idx] != this) { + error(lineno, "%s bus conflict on row %d between tables %s and %s", busname, idx, name(), + use[idx]->name()); + error(use[idx]->lineno, "%s defined here", use[idx]->name()); + } else { + use[idx] = this; + } +} + +bitvec Table::compute_reachable_tables() { + reachable_tables_[uid] = 1; + for_all_next([this](const Ref &t) { + if (t) { + reachable_tables_ |= t->reachable_tables(); + } + }); + return reachable_tables_; +} + +std::string Table::loc() const { + std::stringstream ss; + ss << "(" << gress << ", stage=" << stage->stageno << ")"; + return ss.str(); +} + +void Table::pass1() { + alloc_vpns(); + check_next(); + if (auto att = get_attached()) att->pass1(get_match_table()); + if (action_bus) action_bus->pass1(this); + + if (actions) { + if (instruction) { + validate_instruction(instruction); + } else { + // Phase0 has empty actions which list param order + if (table_type() != PHASE0) { + error(lineno, "No instruction call provided, but actions provided"); + } + } + actions->pass1(this); + } + set_pred(); + + if (action) { + auto reqd_args = 2; + action->validate_call(action, get_match_table(), reqd_args, + HashDistribution::ACTION_DATA_ADDRESS, action); + } + for (auto &lb : long_branch) { + int last_stage = -1; + for (auto &n : lb.second) { + if (!n) continue; // already output error about invalid table + last_stage = std::max(last_stage, n->stage->stageno); + if (n->long_branch_input >= 0 && n->long_branch_input != lb.first) + error(lb.second.lineno, "Conflicting long branch input (%d and %d) for table %s", + lb.first, n->long_branch_input, n->name()); + n->long_branch_input = lb.first; + } + // we track the long branch as being 'live' from the stage it is set until the stage + // before it is terminated; it can still be use to trigger a table in that stage, even + // though it is not 'live' there. It can also be reused (set) in that stage for use in + // later stages. 
This matches the range of stages we need to set timing regs for. + for (int st = stage->stageno; st < last_stage; ++st) { + auto stg = Stage::stage(gress, st); + BUG_CHECK(stg); + auto &prev = stg->long_branch_use[lb.first]; + if (prev && *prev != lb.second) { + error(lb.second.lineno, "Conflicting use of long_branch tag %d", lb.first); + error(prev->lineno, "previous use"); + } else { + prev = &lb.second; + } + stg->long_branch_thread[gress] |= 1U << lb.first; + } + auto last_stg = Stage::stage(gress, last_stage); + BUG_CHECK(last_stg); + last_stg->long_branch_thread[gress] |= 1U << lb.first; + last_stg->long_branch_terminate |= 1U << lb.first; + } +} + +static void overlap_test(int lineno, unsigned a_bit, + ordered_map::iterator a, unsigned b_bit, + ordered_map::iterator b) { + if (b_bit <= a->second.hi(a_bit)) { + if (a->second.group || b->second.group) + error(lineno, "Field %s(%d) overlaps with %s(%d)", a->first.c_str(), a->second.group, + b->first.c_str(), b->second.group); + else + error(lineno, "Field %s overlaps with %s", a->first.c_str(), b->first.c_str()); + } +} + +static void append_bits(std::vector &vec, int lo, int hi) { + /* split any chunks that cross a word (128-bit) boundary */ + while (lo < hi && lo / 128U != hi / 128U) { + vec.emplace_back(lo, lo | 127); + lo = (lo | 127) + 1; + } + vec.emplace_back(lo, hi); +} + +bool Table::Format::equiv(const ordered_map &a, + const ordered_map &b) { + if (a.size() != b.size()) return false; + for (auto &el : a) + if (!b.count(el.first) || b.at(el.first) != el.second) return false; + return true; +} + +Table::Format::Format(Table *t, const VECTOR(pair_t) & data, bool may_overlap) : tbl(t) { + unsigned nextbit = 0; + fmt.resize(1); + for (auto &kv : data) { + if (lineno < 0) lineno = kv.key.lineno; + if (!CHECKTYPE2M(kv.key, tSTR, tCMD, "expecting field desc")) continue; + value_t &name = kv.key.type == tSTR ? 
kv.key : kv.key[0]; + unsigned idx = 0; + if (kv.key.type == tCMD && + (kv.key.vec.size != 2 || !CHECKTYPE(kv.key[1], tINT) || (idx = kv.key[1].i) > 15)) { + error(kv.key.lineno, "Invalid field group"); + continue; + } + if (kv.value.type != tVEC && + !(CHECKTYPE2(kv.value, tINT, tRANGE) && VALIDATE_RANGE(kv.value))) + continue; + if (idx >= fmt.size()) fmt.resize(idx + 1); + if (fmt[idx].count(name.s) > 0) { + if (kv.key.type == tCMD) + error(name.lineno, "Duplicate key %s(%d) in format", name.s, idx); + else + error(name.lineno, "Duplicate key %s in format", name.s); + continue; + } + Field *f = &fmt[idx].emplace(name.s, Field(this)).first->second; + f->group = idx; + if (kv.value.type == tINT) { + if (kv.value.i <= 0) + error(kv.value.lineno, "invalid size %" PRId64 " for format field %s", kv.value.i, + name.s); + f->size = kv.value.i; + append_bits(f->bits, nextbit, nextbit + f->size - 1); + } else if (kv.value.type == tRANGE) { + if (kv.value.range.lo > kv.value.range.hi) + error(kv.value.lineno, "invalid range %d..%d", kv.value.range.lo, + kv.value.range.hi); + append_bits(f->bits, kv.value.range.lo, kv.value.range.hi); + f->size = kv.value.range.hi - kv.value.range.lo + 1; + } else if (kv.value.type == tVEC) { + f->size = 0; + for (auto &c : kv.value.vec) + if (CHECKTYPE(c, tRANGE) && VALIDATE_RANGE(c)) { + append_bits(f->bits, c.range.lo, c.range.hi); + f->size += c.range.hi - c.range.lo + 1; + if ((size_t)c.range.hi + 1 > size) size = c.range.hi + 1; + } + } + nextbit = f->bits.back().hi + 1; + if (nextbit > size) size = nextbit; + } + if (!may_overlap) { + for (auto &grp : fmt) { + for (auto it = grp.begin(); it != grp.end(); ++it) { + for (auto &piece : it->second.bits) { + auto p = byindex.upper_bound(piece.lo); + if (p != byindex.end()) overlap_test(lineno, piece.lo, it, p->first, p->second); + if (p != byindex.begin()) { + --p; + overlap_test(lineno, p->first, p->second, piece.lo, it); + if (p->first == piece.lo && piece.hi <= 
p->second->second.hi(piece.lo)) + continue; + } + byindex[piece.lo] = it; + } + } + } + } + for (size_t i = 1; i < fmt.size(); i++) + if (!equiv(fmt[0], fmt[i])) + error(data[0].key.lineno, "Format group %zu doesn't match group 0", i); + for (log2size = 0; (1U << log2size) < size; log2size++) { + } + if (error_count > 0) return; + for (auto &f : fmt[0]) { + f.second.by_group = new Field *[fmt.size()]; + f.second.by_group[0] = &f.second; + } + for (size_t i = 1; i < fmt.size(); i++) + for (auto &f : fmt[i]) { + Field &f0 = fmt[0].at(f.first); + f.second.by_group = f0.by_group; + f.second.by_group[i] = &f.second; + } +} + +Table::Format::~Format() { + for (auto &f : fmt[0]) delete[] f.second.by_group; +} + +void Table::Format::pass1(Table *tbl) { + std::map immed_fields; + unsigned lo = INT_MAX, hi = 0; + for (auto &f : fmt[0]) { + if (!(f.second.flags & Field::USED_IMMED)) continue; + if (f.second.bits.size() > 1) + error(lineno, "Immmediate action data %s cannot be split", f.first.c_str()); + immed_fields[f.second.bits[0].lo] = &f.second; + if (f.second.bits[0].lo < lo) { + immed = &f.second; + lo = immed->bits[0].lo; + } + if (f.second.bits[0].hi > hi) hi = f.second.bits[0].hi; + } + if (immed_fields.empty()) { + LOG2("table " << tbl->name() << " has no immediate data"); + } else { + LOG2("table " << tbl->name() << " has " << immed_fields.size() + << " immediate data fields " + "over " + << (hi + 1 - lo) << " bits"); + if (hi - lo >= Target::MAX_IMMED_ACTION_DATA()) { + error(lineno, "Immediate data for table %s spread over more than %d bits", tbl->name(), + Target::MAX_IMMED_ACTION_DATA()); + return; + } + immed_size = hi + 1 - lo; + for (unsigned i = 1; i < fmt.size(); i++) { + int delta = static_cast(immed->by_group[i]->bits[0].lo) - + static_cast(immed->bits[0].lo); + for (auto &f : fmt[0]) { + if (!(f.second.flags & Field::USED_IMMED)) continue; + if (delta != static_cast(f.second.by_group[i]->bits[0].lo) - + static_cast(f.second.bits[0].lo)) { + 
error(lineno, + "Immediate data field %s for table %s does not match across " + "ways in a ram", + f.first.c_str(), tbl->name()); + break; + } + } + } + } + lo = INT_MAX, hi = 0; + for (auto &[name, field] : fmt[0]) { + // FIXME -- should use a flag rather than names here? Someone would need to set the flag + if (name == "match" || name == "version" || name == "valid") continue; + lo = std::min(lo, field.bit(0)); + hi = std::max(hi, field.bit(field.size - 1)); + } + overhead_size = hi > lo ? hi - lo + 1 : 0; + overhead_start = hi > lo ? lo : 0; +} + +void Table::Format::pass2(Table *tbl) { + int byte[4] = {-1, -1, -1, -1}; + int half[2] = {-1, -1}; + int word = -1; + bool err = false; + for (auto &f : fmt[0]) { + int byte_slot = tbl->find_on_actionbus(&f.second, 0, 8 * f.second.size - 1, f.second.size); + if (byte_slot < 0) continue; + int slot = Stage::action_bus_slot_map[byte_slot]; + unsigned off = f.second.immed_bit(0); + switch (Stage::action_bus_slot_size[slot]) { + case 8: + for (unsigned b = off / 8; b <= (off + f.second.size - 1) / 8; b++) { + if (b >= 4 || (b & 3) != (slot & 3) || (byte[b] >= 0 && byte[b] != slot) || + (byte[b ^ 1] >= 0 && byte[b ^ 1] != (slot ^ 1)) || + Stage::action_bus_slot_size[slot] != 8) { + err = true; + break; + } + byte[b] = slot++; + } + break; + case 16: + for (unsigned w = off / 16; w <= (off + f.second.size - 1) / 16; w++) { + if (w >= 2 || (w & 1) != (slot & 1) || (half[w] >= 0 && half[w] != slot) || + Stage::action_bus_slot_size[slot] != 16) { + err = true; + break; + } + half[w] = slot++; + } + break; + case 32: + if (word >= 0 && word != slot) err = true; + word = slot; + break; + default: + BUG(); + } + if (err) error(lineno, "Immediate data misaligned for action bus byte %d", byte_slot); + } +} + +std::ostream &operator<<(std::ostream &out, const Table::Format::Field &f) { + out << "(size = " << f.size << " "; + for (auto b : f.bits) out << "[" << b.lo << ".." 
<< b.hi << "]"; + out << ")"; + return out; +} + +bool Table::Actions::Action::equiv(Action *a) { + if (instr.size() != a->instr.size()) return false; + for (unsigned i = 0; i < instr.size(); i++) + if (!instr[i]->equiv(a->instr[i])) return false; + if (attached.size() != a->attached.size()) return false; + for (unsigned i = 0; i < attached.size(); i++) + if (attached[i] != a->attached[i]) return false; + return true; +} + +bool Table::Actions::Action::equivVLIW(Action *a) { + if (instr.size() != a->instr.size()) return false; + for (unsigned i = 0; i < instr.size(); i++) + if (!instr[i]->equiv(a->instr[i])) return false; + return true; +} + +std::map> +Table::Actions::Action::reverse_alias() const { + std::map> rv; + for (auto &a : alias) rv[a.second.name].push_back(&a); + return rv; +} + +std::string Table::Actions::Action::alias_lookup(int lineno, std::string name, int &lo, + int &hi) const { + bool err = false; + bool found = false; + while (alias.count(name) && !found) { + for (auto &a : ValuesForKey(alias, name)) { + // FIXME -- need better handling of multiple aliases... + if (lo >= 0 && a.name != "hash_dist") { + if (a.lo >= 0) { + if (a.hi >= 0 && hi + a.lo > a.hi) { + err = true; + continue; + } + lo += a.lo; + hi += a.lo; + name = a.name; + found = true; + } + } else { + lo = a.lo; + hi = a.hi; + name = (alias.count(a.name)) ? 
alias_lookup(lineno, a.name, lo, hi) : a.name; + } + lineno = a.lineno; + err = false; + break; + } + if (err) { + error(lineno, "invalid bitslice of %s", name.c_str()); + break; + } + } + return name; +} + +Table::Actions::Action::alias_t::alias_t(value_t &data) { + lineno = data.lineno; + if (CHECKTYPE3(data, tSTR, tCMD, tINT)) { + if (data.type == tSTR) { + name = data.s; + lo = 0; + hi = -1; + } else if (data.type == tCMD) { + name = data.vec[0].s; + if (CHECKTYPE2(data.vec[1], tINT, tRANGE)) { + if (data.vec[1].type == tINT) { + lo = hi = data.vec[1].i; + } else { + lo = data.vec[1].range.lo; + hi = data.vec[1].range.hi; + } + } + } else { + is_constant = true; + } + value = data.i; + } +} + +/** + * Builds a map of conditional variable to which bits in the action data format that they + * control. Used for JSON later. + * + * @sa asm_output::EmitAction::mod_cond_value + */ +void Table::Actions::Action::setup_mod_cond_values(value_t &map) { + for (auto &kv : map.map) { + if (CHECKTYPE(kv.key, tSTR) && CHECKTYPE(kv.value, tVEC)) { + mod_cond_values[kv.key.s].resize(2, bitvec()); + for (auto &v : kv.value.vec) { + if (CHECKTYPEPM(v, tCMD, v.vec.size == 2, "action data or immediate slice")) { + int array_index = -1; + if (v[0] == "action_data_table") { + array_index = MC_ADT; + } else if (v[0] == "immediate") { + array_index = MC_IMMED; + } else { + error(map.lineno, + "A non action_data_table or immediate value in the " + "mod_con_value map: %s", + v[0].s); + continue; + } + int lo = -1; + int hi = -1; + if (v[1].type == tINT) { + lo = hi = v[1].i; + } else if (v[1].type == tRANGE) { + lo = v[1].range.lo; + hi = v[1].range.hi; + } + mod_cond_values.at(kv.key.s).at(array_index).setrange(lo, hi - lo + 1); + } + } + } + } +} + +Table::Actions::Action::Action(Table *tbl, Actions *actions, pair_t &kv, int pos) { + lineno = kv.key.lineno; + position_in_assembly = pos; + if (kv.key.type == tCMD) { + name = kv.key[0].s; + if (CHECKTYPE(kv.key[1], tINT)) code = 
kv.key[1].i; + if (kv.key.vec.size > 2 && CHECKTYPE(kv.key[2], tINT)) { + if ((addr = kv.key[2].i) < 0 || addr >= ACTION_IMEM_ADDR_MAX) + error(kv.key[2].lineno, "Invalid instruction address %d", addr); + } + } else if (kv.key.type == tINT) { + name = std::to_string((code = kv.key.i)); + } else { + name = kv.key.s; + } + if (code >= 0) { + if (actions->code_use[code]) { + if (!equivVLIW(actions->by_code[code])) + error(kv.key.lineno, "Duplicate action code %d", code); + } else { + actions->by_code[code] = this; + actions->code_use[code] = true; + } + } + for (auto &i : kv.value.vec) { + if (i.type == tINT && instr.empty()) { + if ((addr = i.i) < 0 || i.i >= ACTION_IMEM_ADDR_MAX) + error(i.lineno, "Invalid instruction address %" PRId64 "", i.i); + } else if (i.type == tMAP) { + for (auto &a : i.map) + if (CHECKTYPE(a.key, tSTR)) { + if (a.key == "p4_param_order") { + if (!CHECKTYPE(a.value, tMAP)) continue; + + unsigned position = 0; + for (auto &v : a.value.map) { + if (!(CHECKTYPE(v.key, tSTR) && CHECKTYPE2(v.value, tINT, tMAP))) + continue; + + if (v.value.type == tINT) { + p4_params_list.emplace_back(v.key.s, position++, v.value.i); + } else { + p4_param p(v.key.s, position++); + for (auto &w : v.value.map) { + if (!CHECKTYPE(w.key, tSTR)) continue; + if (w.key == "width" && CHECKTYPE(w.value, tINT)) + p.bit_width = w.value.i; + else if (w.key == "context_json" && CHECKTYPE(w.value, tMAP)) + p.context_json = toJson(w.value.map); + else + error(lineno, "Incorrect param type %s in p4_param_order", + w.key.s); + } + + p4_params_list.emplace_back(std::move(p)); + } + } + } else if (a.key == "hit_allowed") { + if CHECKTYPE (a.value, tMAP) { + for (auto &p : a.value.map) { + if (CHECKTYPE(p.key, tSTR) && CHECKTYPE(p.value, tSTR)) { + if (p.key == "allowed") + hit_allowed = get_bool(p.value); + else if (p.key == "reason") + hit_disallowed_reason = p.value.s; + } + } + } + } else if (a.key == "default_action" || a.key == "default_only_action") { + if CHECKTYPE (a.value, 
tMAP) { + for (auto &p : a.value.map) { + if (CHECKTYPE(p.key, tSTR) && CHECKTYPE(p.value, tSTR)) { + if (p.key == "allowed") + default_allowed = get_bool(p.value); + else if (p.key == "is_constant") + is_constant = get_bool(p.value); + else if (p.key == "reason") + default_disallowed_reason = p.value.s; + } + } + } + default_only = a.key == "default_only_action"; + } else if (a.key == "handle") { + if CHECKTYPE (a.value, tINT) { + handle = a.value.i; + } + } else if (a.key == "next_table") { + if (a.value.type == tINT) + next_table_encode = a.value.i; + else + next_table_ref = a.value; + } else if (a.key == "next_table_miss") { + next_table_miss_ref = a.value; + } else if (a.key == "mod_cond_value") { + if (CHECKTYPE(a.value, tMAP)) { + setup_mod_cond_values(a.value); + } + } else if (a.key == "context_json") { + if (CHECKTYPE(a.value, tMAP)) { + context_json = toJson(a.value.map); + } + } else if (CHECKTYPE3(a.value, tSTR, tCMD, tINT)) { + if (a.value.type == tINT) { + auto k = alias.find(a.key.s); + if (k == alias.end()) { + alias.emplace(a.key.s, a.value); + } else { + k->second.is_constant = true; + k->second.value = a.value.i; + } + } else if (a.value.type == tSTR) { + auto k = alias.find(a.value.s); + if (k == alias.end()) { + alias.emplace(a.key.s, a.value); + } else { + auto alias_value = k->second; + alias.erase(k); + alias.emplace(a.key.s, alias_value); + } + } else { + alias.emplace(a.key.s, a.value); + } + } + } + + } else if (CHECKTYPE2(i, tSTR, tCMD)) { + VECTOR(value_t) tmp; + if (i.type == tSTR) { + if (!*i.s) continue; // skip blank line + VECTOR_init1(tmp, i); + } else { + VECTOR_initcopy(tmp, i.vec); + } + if (auto *p = Instruction::decode(tbl, this, tmp)) + instr.emplace_back(p); + else if (tbl->to() || tbl->to() || + tbl->to()) + attached.emplace_back(i, tbl); + else + error(i.lineno, "Unknown instruction %s", tmp[0].s); + VECTOR_fini(tmp); + } + } +} + +Table::Actions::Action::Action(const char *n, int l) : name(n), lineno(l) {} 
+Table::Actions::Action::~Action() {} + +Table::Actions::Actions(Table *tbl, VECTOR(pair_t) & data) { + table = tbl; + int pos = 0; + for (auto &kv : data) { + if ((kv.key.type != tINT && !CHECKTYPE2M(kv.key, tSTR, tCMD, "action")) || + !CHECKTYPE(kv.value, tVEC)) + continue; + std::string name = kv.key.type == tINT ? std::to_string(kv.key.i) + : kv.key.type == tSTR ? kv.key.s + : kv.key[0].s; + if (actions.count(name)) { + error(kv.key.lineno, "Duplicate action %s", name.c_str()); + continue; + } + actions.emplace(name, tbl, this, kv, pos++); + } +} + +int Table::Actions::hit_actions_count() const { + int cnt = 0; + for (auto &a : actions) { + if (a.second.hit_allowed) ++cnt; + } + return cnt; +} + +int Table::Actions::default_actions_count() const { + int cnt = 0; + for (auto &a : actions) { + if (a.second.default_allowed) ++cnt; + } + return cnt; +} + +AlwaysRunTable::AlwaysRunTable(gress_t gress, Stage *stage, pair_t &init) + : Table(init.key.lineno, + "always run " + to_string(gress) + " stage " + to_string(stage->stageno), gress, + stage) { + VECTOR(pair_t) tmp = {1, 1, &init}; + actions.reset(new Actions(this, tmp)); + if (actions->count() == 1) { // unless there was an error parsing the action... 
+ auto &act = *actions->begin(); + if (act.addr >= 0) error(act.lineno, "always run action address is fixed"); + act.addr = ACTION_ALWAYS_RUN_IMEM_ADDR; + } +} + +void Table::Actions::Action::check_next_ref(Table *tbl, const Table::Ref &ref) const { + if (ref.check() && ref->table_id() >= 0 && ref->table_id() < tbl->table_id()) { + error(lineno, "Next table %s for action %s before containing table %s", ref->name(), + name.c_str(), tbl->name()); + return; + } + + if (ref->table_id() > (1U << NEXT_TABLE_MAX_RAM_EXTRACT_BITS) - 1 && + tbl->get_hit_next().size() == 0) { + error(lineno, "Next table cannot properly be saved on the RAM line for this action %s", + name.c_str()); + } +} + +/** + * By the end of this function, both next_table and next_table_miss_ref will have been created + * and validated. + * + * Each action must have at least next_table or a next_table_miss from the node. + * - next_table: The next table to run on hit + * - next_table_miss: The next table to run on miss + * + * The next_table_encode is the entry into the next_table_hitmap, if a next_table hit map is + * provided. If the next_table hit map is empty, then the next_table_encode won't have been + * set. If the action can be used on a hit, then either a next_table_ref/next_table_encode + * would be provided. 
+ * + * The next_table_ref could come from the next_table as an int value, which would be on offset + * into the hit_map + */ +void Table::Actions::Action::check_next(Table *tbl) { + if (next_table_encode >= 0) { + int idx = next_table_encode; + if (idx < tbl->get_hit_next().size()) { + next_table_ref = tbl->get_hit_next().at(idx); + } else if ((idx -= tbl->get_hit_next().size()) < tbl->extra_next_lut.size()) { + next_table_ref = tbl->extra_next_lut.at(idx); + } else { + error(lineno, + "The encoding on action %s is outside the range of the hitmap in " + "table %s", + name.c_str(), tbl->name()); + } + } + + if (!next_table_miss_ref.set() && !next_table_ref.set()) { + if (tbl->get_hit_next().size() != 1) { + error(lineno, + "Either next_table or next_table_miss must be required on action %s " + "if the next table cannot be determined", + name.c_str()); + } else { + next_table_ref = tbl->get_hit_next()[0]; + next_table_miss_ref = next_table_ref; + next_table_encode = 0; + } + } else if (!next_table_ref.set()) { + if (!default_only) { + error(lineno, + "Action %s on table %s that can be programmed on hit must have " + "a next_table encoding", + name.c_str(), tbl->name()); + } + next_table_ref = next_table_miss_ref; + } else if (!next_table_miss_ref.set()) { + next_table_miss_ref = next_table_ref; + } + tbl->check_next(next_table_ref); + tbl->check_next(next_table_miss_ref); + if (next_table_encode < 0 && !default_only) next_table_ref.force_single_next_table(); + for (auto &n : next_table_ref) check_next_ref(tbl, n); + for (auto &n : next_table_miss_ref) check_next_ref(tbl, n); +} + +void Table::Actions::Action::pass1(Table *tbl) { + // The compiler generates all action handles which must be specified in the + // assembly, if not we throw an error. 
+ if ((handle == 0) && tbl->needs_handle()) { + error(lineno, "No action handle specified for table - %s, action - %s", tbl->name(), + name.c_str()); + } + + if (tbl->needs_next()) { + check_next(tbl); + } + + if (tbl->get_default_action() == name) { + if (!tbl->default_action_handle) tbl->default_action_handle = handle; + if (tbl->default_only_action) default_only = true; + } + /* SALU actions always have addr == -1 (so iaddr == -1) */ + int iaddr = -1; + bool shared_VLIW = false; + for (auto &inst : instr) { + inst.reset(inst.release()->pass1(tbl, this)); + if (inst->slot >= 0) { + if (slot_use[inst->slot]) + error(inst->lineno, "instruction slot %d used multiple times in action %s", + inst->slot, name.c_str()); + slot_use[inst->slot] = 1; + } + } + if (addr >= 0) { + if (auto old = tbl->stage->imem_addr_use[imem_thread(tbl->gress)][addr]) { + if (equivVLIW(old)) { + shared_VLIW = true; + } else { + error(lineno, "action instruction addr %d in use elsewhere", addr); + warning(old->lineno, "also defined here"); + } + } + tbl->stage->imem_addr_use[imem_thread(tbl->gress)][addr] = this; + iaddr = addr / ACTION_IMEM_COLORS; + } + if (!shared_VLIW) { + for (auto &inst : instr) { + if (inst->slot >= 0 && iaddr >= 0) { + if (tbl->stage->imem_use[iaddr][inst->slot]) + error(lineno, "action instruction slot %d.%d in use elsewhere", iaddr, + inst->slot); + tbl->stage->imem_use[iaddr][inst->slot] = 1; + } + } + } + for (auto &a : alias) { + while (alias.count(a.second.name) >= 1) { + // the alias refers to something else in the alias list + auto &rec = alias.find(a.second.name)->second; + if (rec.name == a.first) { + error(a.second.lineno, "recursive alias %s", a.first.c_str()); + break; + } + if (rec.lo > 0) { + a.second.lo += rec.lo; + if (a.second.hi >= 0) a.second.hi += rec.lo; + } + if (rec.hi > 0 && a.second.hi < 0) a.second.hi = rec.hi; + if (a.second.lo < rec.lo || (rec.hi >= 0 && a.second.hi > rec.hi)) { + error(a.second.lineno, + "alias for %s:%s(%d:%d) has out of 
range index from allowed %s:%s(%d:%d)", + a.first.c_str(), a.second.name.c_str(), a.second.lo, a.second.hi, + a.second.name.c_str(), rec.name.c_str(), rec.lo, rec.hi); + break; + } + a.second.name = rec.name; + } + if (auto *f = tbl->lookup_field(a.second.name, name)) { + if (a.second.hi < 0) a.second.hi = f->size - 1; + } else if (a.second.name == "hash_dist" && a.second.lo >= 0) { + // nothing to be done for now. lo..hi is the hash dist index rather than + // a bit index, which will cause problems if we want to later slice the alias + // to access only some bits of it. + } else { + error(a.second.lineno, "No field %s in table %s", a.second.to_string().c_str(), + tbl->name()); + } + } + // Update default value for params if default action parameters present + for (auto &p : p4_params_list) { + if (auto def_act_params = tbl->get_default_action_parameters()) { + if (def_act_params->count(p.name) > 0) { + p.default_value = (*def_act_params)[p.name]; + p.defaulted = true; + } + } + } + for (auto &c : attached) { + if (!c) { + error(c.lineno, "Unknown instruction or table %s", c.name.c_str()); + continue; + } + if (c->table_type() != COUNTER && c->table_type() != METER && c->table_type() != STATEFUL) { + error(c.lineno, "%s is not a counter, meter or stateful table", c.name.c_str()); + continue; + } + } +} + +/** + * Determines if the field, which has a particular range of bits in the format, is controlled + * by a conditional variable. 
This is required for context JSON information on parameters in + * the action data table pack format, or in the immediate fields: + * + * -is_mod_field_conditionally_value + * -mod_field_conditionally_mask_field_name + * + * @sa asm_output::EmitAction::mod_cond_value + */ +void Table::Actions::Action::check_conditional(Table::Format::Field &field) const { + bool found = false; + std::string condition; + for (auto kv : mod_cond_values) { + for (auto br : field.bits) { + auto overlap = kv.second[MC_ADT].getslice(br.lo, br.size()); + if (overlap.empty()) { + BUG_CHECK(!found || (found && condition != kv.first)); + } else if (overlap.popcount() == br.size()) { + if (found) { + BUG_CHECK(condition == kv.first); + } else { + found = true; + condition = kv.first; + } + } else { + BUG(); + } + } + } + if (found) { + field.conditional_value = true; + field.condition = condition; + } +} + +/** + * @sa Table::Actions::Action::check_conditional + */ +bool Table::Actions::Action::immediate_conditional(int lo, int sz, std::string &condition) const { + bool found = false; + for (auto kv : mod_cond_values) { + auto overlap = kv.second[MC_IMMED].getslice(lo, sz); + if (overlap.empty()) { + BUG_CHECK(!found || (found && condition != kv.first)); + } else { + if (found) { + BUG_CHECK(condition == kv.first); + } else if (overlap.popcount() == sz) { + found = true; + condition = kv.first; + } else { + BUG(); + } + } + } + return found; +} + +void Table::Actions::pass1(Table *tbl) { + for (auto &act : *this) { + act.pass1(tbl); + slot_use |= act.slot_use; + } +} + +std::map
> Table::find_pred_in_stage( + int stageno, const std::set &acts) { + std::map
> rv; + if (stage->stageno < stageno) return rv; + if (stage->stageno == stageno) { + rv[this].insert(acts.begin(), acts.end()); + } + for (auto &p : pred) { + for (auto &kv : p.first->find_pred_in_stage(stageno, p.second)) { + rv[kv.first].insert(kv.second.begin(), kv.second.end()); + } + } + for (auto *mt : get_match_tables()) { + if (mt != this) { + for (auto &kv : mt->find_pred_in_stage(stageno, acts)) { + rv[kv.first].insert(kv.second.begin(), kv.second.end()); + } + } + } + return rv; +} + +void Table::Actions::pass2(Table *tbl) { + /* We do NOT call this for SALU actions, so we can assume VLIW actions here */ + BUG_CHECK(tbl->table_type() != STATEFUL); + int code = tbl->get_gateway() ? 1 : 0; // if there's a gateway, reserve code 0 for a NOP + // to run when the gateway inhibits the table + + /* figure out how many codes we can encode in the match table(s), and if we need a distinct + * code for every action to handle next_table properly */ + int code_limit = 0x10000; + bool use_code_for_next = false; // true iff a table uses the action code for next table + // selection in addition to using it for the action instruction + + for (auto match : tbl->get_match_tables()) { + // action is currently a default keyword for the instruction address + auto instruction = match->instruction_call(); + auto fld = instruction.args[0].field(); + if (fld) { + code_limit = 1 << fld->size; + if (match->hit_next_size() > 1 && !match->lookup_field("next")) + use_code_for_next = true; + } else { + code_limit = code + 1; + } + } + + /* figure out if we need more codes than can fit in the action_instruction_adr_map. + * use code = -1 to signal that condition. */ + int non_nop_actions = by_code.size(); + // Check if a nop action is defined. The action will be empty (no + // instructions). By default we will use code '0' for nop action, unless + // compiler has assigned a different value. 
+ int nop_code = 0; + for (auto &bc : by_code) { + if (bc.second->instr.empty()) nop_code = bc.first; + } + if (by_code.count(nop_code) && by_code.at(nop_code)->instr.empty()) { + --non_nop_actions; // don't count nop code action + code = 1; + } + for (auto &act : *this) { + if (act.default_only) continue; + if (act.instr.empty() && !use_code_for_next) + code = 1; // nop action -- use code 0 unless it needs to be used as next + else if (act.code < 0) + ++non_nop_actions; + } // FIXME -- should combine identical actions? + if (code + non_nop_actions > ACTION_INSTRUCTION_SUCCESSOR_TABLE_DEPTH) code = -1; + bool code0_is_noop = (code != 0); + + for (auto &act : *this) { + for (auto &inst : act.instr) inst->pass2(tbl, &act); + if (act.addr < 0) { + for (int i = 0; i < ACTION_IMEM_ADDR_MAX; i++) { + if (auto old = tbl->stage->imem_addr_use[imem_thread(tbl->gress)][i]) { + if (act.equivVLIW(old)) { + act.addr = i; + break; + } + continue; + } + if (tbl->stage->imem_use[i / ACTION_IMEM_COLORS].intersects(act.slot_use)) continue; + act.addr = i; + tbl->stage->imem_use[i / ACTION_IMEM_COLORS] |= act.slot_use; + tbl->stage->imem_addr_use[imem_thread(tbl->gress)][i] = &act; + break; + } + } + if (act.addr < 0) error(act.lineno, "Can't find an available instruction address"); + if (act.code < 0 && !act.default_only) { + if (code < 0 && !code_use[act.addr]) { + act.code = act.addr; + } else if (act.instr.empty() && !use_code_for_next && code0_is_noop) { + act.code = 0; + } else { + while (code >= 0 && code_use[code]) code++; + act.code = code; + } + } else if (code < 0 && act.code != act.addr && !act.default_only) { + error(act.lineno, + "Action code must be the same as action instruction address " + "when there are more than %d actions", + ACTION_INSTRUCTION_SUCCESSOR_TABLE_DEPTH); + if (act.code < 0) + warning(act.lineno, "Code %d is already in use by another action", act.addr); + } + if (act.code >= 0) { + by_code[act.code] = &act; + code_use[act.code] = true; + } + if 
(act.code >= code_limit) + error(act.lineno, + "Action code %d for %s too large for action specifier in " + "table %s", + act.code, act.name.c_str(), tbl->name()); + if (act.code > max_code) max_code = act.code; + } + actions.sort([](const value_type &a, const value_type &b) -> bool { + return a.second.code < b.second.code; + }); + if (!tbl->default_action.empty()) { + if (!exists(tbl->default_action)) { + error(tbl->default_action_lineno, "no action %s in table %s", + tbl->default_action.c_str(), tbl->name()); + } else { + auto &defact = actions.at(tbl->default_action); + if (!defact.default_allowed) { + // FIXME -- should be an error, but the compiler currently does this? + // FIXME -- see p4_16_programs_tna_lpm_match + warning(tbl->default_action_lineno, + "default action %s in table %s is not allowed " + "to be default?", + tbl->default_action.c_str(), tbl->name()); + defact.default_allowed = true; + } + } + } + auto pred = tbl->find_pred_in_stage(tbl->stage->stageno); + for (auto &p : pred) { + auto *actions = p.first->get_actions(); + if (!actions || actions == this) continue; + if (!slot_use.intersects(actions->slot_use)) continue; + for (auto &a1 : *this) { + bool first = false; + for (auto a2 : p.second) { + if (a1.slot_use.intersects(a2->slot_use)) { + if (!first) + warning(a1.lineno, + "Conflicting instruction slot usage for non-exlusive " + "table %s action %s", + tbl->name(), a1.name.c_str()); + first = true; + warning(a2->lineno, "and table %s action %s", p.first->name(), + a2->name.c_str()); + } + } + } + } +} + +void Table::Actions::stateful_pass2(Table *tbl) { + BUG_CHECK(tbl->table_type() == STATEFUL); + auto *stbl = tbl->to(); + for (auto &act : *this) { + if (act.code >= 4) { + error(act.lineno, "Only 4 actions in a stateful table"); + } else if (act.code >= 0) { + if (code_use[act.code]) { + error(act.lineno, "duplicate use of code %d in SALU", act.code); + warning(by_code[act.code]->lineno, "previous use here"); + } + by_code[act.code] = &act; 
+ code_use[act.code] = true; + } + if (act.code == 3 && stbl->clear_value) + error(act.lineno, "Can't use SALU action 3 with a non-zero clear value"); + for (const auto &inst : act.instr) inst->pass2(tbl, &act); + } + if (stbl->clear_value) code_use[3] = true; + for (auto &act : *this) { + if (act.code < 0) { + if ((act.code = code_use.ffz(0)) >= 4) { + error(act.lineno, "Only 4 actions in a stateful table"); + break; + } + by_code[act.code] = &act; + code_use[act.code] = true; + } + } +} + +template +void Table::Actions::write_regs(REGS ®s, Table *tbl) { + for (auto &act : *this) { + LOG2("# action " << act.name << " code=" << act.code << " addr=" << act.addr); + tbl->write_action_regs(regs, &act); + for (const auto &inst : act.instr) inst->write_regs(regs, tbl, &act); + if (options.fill_noop_slot) { + for (auto slot : Phv::use(tbl->gress) - tbl->stage->imem_use_all()) { + auto tmp = VLIW::genNoopFill(tbl, &act, options.fill_noop_slot, slot); + tmp->pass1(tbl, &act); + tmp->pass2(tbl, &act); + tmp->write_regs(regs, tbl, &act); + } + } + } +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void Table::Actions::write_regs, mau_regs &, + Table *) + +/** + * Indirect Counters, Meters, and Stateful Alus can be addressed in many different ways, e.g. + * Hash Distribution, Overhead Index, Stateful Counter, Constant, etc. + * + * The indexing can be different per individual action. Say one action always uses an indirect + * address, while another one uses a constant. The driver has to know where to put that + * constant into the RAM line. + * + * Also, say an address is from hash, but can have multiple meter types. By using the override + * address of an action, when that action is programmed, the meter type written in overhead will + * be determined by the overhead address. + * + * override_addr - a boolean of whether to use the override value for these parameters. + * This is enabled if the address does not come from overhead. 
+ * + * Override_addr_pfe - Not actually useful, given the override_full_addr contains the per flow + * enable bit + * + * Override_full_addr - the constant value to be written directly into the corresponding bit + * positions in the RAM line + */ +static void gen_override(json::map &cfg, const Table::Call &att) { + auto type = att->table_type(); + // Direct tables currently don't require overrides + // FIXME: Corner cases where miss actions do not use the stateful object should have + // an override of all 0 + if (att->to()->is_direct()) return; + std::string base; + bool override_addr = false; + bool override_addr_pfe = false; + unsigned override_full_addr = 0; + switch (type) { + case Table::COUNTER: + base = "override_stat"; + break; + case Table::METER: + base = "override_meter"; + break; + case Table::STATEFUL: + base = "override_stateful"; + break; + default: + error(att.lineno, "unsupported table type in action call"); + } + // Always true if the call is provided + override_addr_pfe = true; + override_full_addr |= 1U << (type == Table::COUNTER ? STATISTICS_PER_FLOW_ENABLE_START_BIT + : METER_PER_FLOW_ENABLE_START_BIT); + int idx = -1; + for (auto &arg : att.args) { + ++idx; + if (arg.type == Table::Call::Arg::Name) { + if (strcmp(arg.name(), "$hash_dist") == 0 || + strcmp(arg.name(), "$stful_counter") == 0) { + override_addr = true; + } else if (auto *st = att->to()) { + if (auto *act = st->actions->action(arg.name())) { + override_full_addr |= 1 << METER_TYPE_START_BIT; + override_full_addr |= act->code << (METER_TYPE_START_BIT + 1); + } + } + // FIXME -- else assume its a reference to a format field, so doesn't need to + // FIXME -- be in the override. Should check that somewhere, but need access + // FIXME -- to the match_table to do it here. 
+ } else if (arg.type == Table::Call::Arg::Const) { + if (idx == 0 && att.args.size() > 1) { + // The first argument for meters/stateful is the meter type + override_full_addr |= arg.value() << METER_TYPE_START_BIT; + } else { + override_full_addr |= arg.value() << att->address_shift(); + override_addr = true; + } + } else if (arg.type == Table::Call::Arg::Counter) { + // does not affect context json + } else { + error(att.lineno, "argument not a constant"); + } + } + cfg[base + "_addr"] = override_addr; + cfg[base + "_addr_pfe"] = override_addr ? override_addr_pfe : false; + cfg[base + "_full_addr"] = override_addr ? override_full_addr : 0; +} + +bool Table::Actions::Action::is_color_aware() const { + for (auto &att : attached) { + if (att->table_type() != Table::METER) continue; + if (att.args.size() < 2) continue; + auto type_arg = att.args[0]; + if (type_arg.type == Table::Call::Arg::Const && type_arg.value() == METER_COLOR_AWARE) + return true; + } + return false; +} + +void Table::Actions::Action::check_and_add_resource(json::vector &resources, + json::map &resource) const { + // Check if resource already exists in the json::vector. For tables + // spanning multiple stages, the same resource gets added as an attached + // resource for every stage. 
To avoid duplication only add when not + // present in the resource array + bool found = false; + for (auto &r : resources) { + if (resource == r->to()) { + found = true; + break; + } + } + if (!found) resources.push_back(std::move(resource)); +} + +void Table::Actions::Action::add_direct_resources(json::vector &direct_resources, + const Call &att) const { + json::map direct_resource; + direct_resource["resource_name"] = att->p4_name(); + direct_resource["handle"] = att->handle(); + check_and_add_resource(direct_resources, direct_resource); +} + +void Table::Actions::Action::add_indirect_resources(json::vector &indirect_resources, + const Call &att) const { + auto addr_arg = att.args.back(); + json::map indirect_resource; + if (addr_arg.type == Table::Call::Arg::Name) { + auto *p = has_param(addr_arg.name()); + if (p) { + indirect_resource["access_mode"] = "index"; + indirect_resource["parameter_name"] = p->name; + indirect_resource["parameter_index"] = p->position; + } else { + return; + } + } else if (addr_arg.type == Table::Call::Arg::Const) { + indirect_resource["access_mode"] = "constant"; + indirect_resource["value"] = addr_arg.value(); + } else { + return; + } + indirect_resource["resource_name"] = att->p4_name(); + indirect_resource["handle"] = att->handle(); + check_and_add_resource(indirect_resources, indirect_resource); +} + +void Table::Actions::gen_tbl_cfg(json::vector &actions_cfg) const { + for (auto &act : *this) { + // Use action node if it already exists in json + bool act_json_present = false; + json::map *action_ptr = nullptr; + for (auto &_action_o : actions_cfg) { + auto &_action = _action_o->to(); + if (_action["name"] == act.name) { + action_ptr = &_action; + act_json_present = true; + break; + } + } + if (!act_json_present) action_ptr = new json::map(); + json::map &action_cfg = *action_ptr; + + action_cfg["name"] = act.name; + action_cfg["handle"] = act.handle; // FIXME-JSON + if (act.instr.empty() || action_cfg.count("primitives") == 0) + 
action_cfg["primitives"] = json::vector(); + auto &direct_resources = action_cfg["direct_resources"] = json::vector(); + auto &indirect_resources = action_cfg["indirect_resources"] = json::vector(); + for (auto &att : act.attached) { + if (att.is_direct_call()) + act.add_direct_resources(direct_resources, att); + else + act.add_indirect_resources(indirect_resources, att); + } + if (!act.hit_allowed && !act.default_allowed) + error(act.lineno, "Action %s must be allowed to be hit and/or default action.", + act.name.c_str()); + action_cfg["allowed_as_hit_action"] = act.hit_allowed; + // TODO: allowed_as_default_action info is directly passed through assembly + // This will be 'false' for following conditions: + // 1. Action requires hardware in hit path i.e. hash distribution or + // random number generator + // 2. There is a default action declared constant in program which + // implies all other actions cannot be set to default + action_cfg["allowed_as_default_action"] = act.default_allowed; + // TODO: "disallowed_as_default_action" is not used by driver. + // Keeping it here as debugging info. Will be set to "none", + // "has_const_default", "has_hash_dist". Once rng support is added + // to the compiler this must reflect "has_rng" or similar string. 
+ if (!act.default_allowed) + action_cfg["disallowed_as_default_action_reason"] = act.default_disallowed_reason; + // TODO: Need to be set through assembly + action_cfg["is_compiler_added_action"] = false; + action_cfg["constant_default_action"] = act.is_constant; + + // TODO: These will be set to 'true' & "" for a keyless table to + // allow any action to be set as default by the control plane + // Exception is TernaryIndirectTables which dont have params list as they are on the main + // TernaryMatchTable, hence check for match_table to query params list + if (table->get_match_table()->p4_params_list.empty()) { + action_cfg["allowed_as_default_action"] = true; + action_cfg["disallowed_as_default_action_reason"] = ""; + } + + json::vector &p4_params = action_cfg["p4_parameters"] = json::vector(); + act.add_p4_params(p4_params); + action_cfg["override_meter_addr"] = false; + action_cfg["override_meter_addr_pfe"] = false; + action_cfg["override_meter_full_addr"] = 0; + action_cfg["override_stat_addr"] = false; + action_cfg["override_stat_addr_pfe"] = false; + action_cfg["override_stat_full_addr"] = 0; + action_cfg["override_stateful_addr"] = false; + action_cfg["override_stateful_addr_pfe"] = false; + action_cfg["override_stateful_full_addr"] = 0; + for (auto &att : act.attached) gen_override(action_cfg, att); + action_cfg["is_action_meter_color_aware"] = act.is_color_aware(); + if (act.context_json) action_cfg.merge(*act.context_json.get()); + if (!act_json_present) actions_cfg.push_back(std::move(action_cfg)); + } +} + +/** + * For action data tables, the entirety of the action configuration is not necessary, as the + * information is per match table, not per action data table. 
The only required parameters + * are the name, handle, and p4_parameters + * + * Even at some point, even actions that have the different p4_parameters could even share a + * member, if for example, one of the parameters is not stored in the action data table, + * but rather as an index for a counter/meter etc. The compiler/driver do not have support for + * this yet. + */ +void Table::Actions::Action::gen_simple_tbl_cfg(json::vector &actions_cfg) const { + json::map action_cfg; + action_cfg["name"] = name; + action_cfg["handle"] = handle; + json::vector &p4_params = action_cfg["p4_parameters"] = json::vector(); + add_p4_params(p4_params, false); + actions_cfg.push_back(std::move(action_cfg)); +} + +void Table::Actions::Action::add_p4_params(json::vector &cfg, bool include_default) const { + unsigned start_bit = 0; + for (auto &a : p4_params_list) { + json::map param; + param["name"] = a.name; + param["start_bit"] = start_bit; + param["position"] = a.position; + if (include_default && a.defaulted) param["default_value"] = a.default_value; + param["bit_width"] = a.bit_width; + if (a.context_json) param.merge(*a.context_json.get()); + cfg.push_back(std::move(param)); + start_bit += a.bit_width; + } +} + +void Table::Actions::add_p4_params(const Action &act, json::vector &cfg) const { + int index = 0; + unsigned start_bit = 0; + // Add p4 params if present. This will add params even if the action is + // otherwise empty. 
Driver will always generate an action spec if p4_params + // are present for an action + for (auto &a : act.p4_params_list) { + json::map param; + param["name"] = a.name; + param["start_bit"] = start_bit; + param["position"] = a.position; + if (a.defaulted) param["default_value"] = a.default_value; + param["bit_width"] = a.bit_width; + cfg.push_back(std::move(param)); + start_bit += a.bit_width; + } +} + +void Table::Actions::add_action_format(const Table *table, json::map &tbl) const { + json::vector &action_format = tbl["action_format"] = json::vector(); + for (auto &act : *this) { + json::map action_format_per_action; + unsigned next_table = -1; + + std::string next_table_name = "--END_OF_PIPELINE--"; + if (!act.default_only) { + if (act.next_table_encode >= 0) { + next_table = static_cast(act.next_table_encode); + } else { + // The RAM value is only 8 bits, for JBay must be solved by table placement + next_table = act.next_table_ref.next_table_id() & 0xff; + next_table_name = act.next_table_ref.next_table_name(); + if (next_table_name == "END") next_table_name = "--END_OF_PIPELINE--"; + } + } + unsigned next_table_full = act.next_table_miss_ref.next_table_id(); + + /** + * This following few fields are required on a per stage table action basis. + * The following information is: + * + * - next_table - The value that will be written into the next field RAM line on a hit, + * when the entry is specified with this action. This is either an index into + * the next_table_map_en (if that map is enabled), or the 8 bit next table value. + * + * - next_table_full - The value that will be written into the miss register for next + * table (next_table_format_data.match_next_table_adr_miss_value), if this action + * is set as the default action. This is the full 8 bit (9 bit for JBay) next + * table. + * + * - vliw_instruction - The value that will be written into the action instruction RAM + * entry when the entry is specified with this action. 
This is either an index + * into the 8 entry table mau_action_instruction_adr_map_data, if that is + * enabled, or the full word instruction + * + * - vliw_instruction_full - The value that will be written into the miss register for + * action_instruction (mau_action_instruction_adr_miss_value), when this + * action is specified as the default action. The full address with the PFE + * bit enabled. + */ + action_format_per_action["action_name"] = act.name; + action_format_per_action["action_handle"] = act.handle; + action_format_per_action["table_name"] = next_table_name; + action_format_per_action["next_table"] = next_table; + action_format_per_action["next_table_full"] = next_table_full; + if (Target::LONG_BRANCH_TAGS() > 0 && !options.disable_long_branch) { + if (Target::NEXT_TABLE_EXEC_COMBINED()) { + action_format_per_action["next_table_exec"] = + ((act.next_table_miss_ref.next_in_stage(table->stage->stageno) & 0xfffe) + << 15) + + (act.next_table_miss_ref.next_in_stage(table->stage->stageno + 1) & 0xffff); + } else { + action_format_per_action["next_table_local_exec"] = + act.next_table_miss_ref.next_in_stage(table->stage->stageno) >> 1; + action_format_per_action["next_table_global_exec"] = + act.next_table_miss_ref.next_in_stage(table->stage->stageno + 1); + } + action_format_per_action["next_table_long_brch"] = + act.next_table_miss_ref.long_branch_tags(); + } + action_format_per_action["vliw_instruction"] = act.code; + action_format_per_action["vliw_instruction_full"] = + ACTION_INSTRUCTION_ADR_ENABLE | act.addr; + + json::vector &next_tables = action_format_per_action["next_tables"] = json::vector(); + for (auto n : act.next_table_ref) { + auto nP4Name = n->p4_name(); + // Gateway next tables don't have a p4 Name + if (nP4Name == nullptr) { + nP4Name = n.name.c_str(); + } + next_tables.push_back( + json::map{{"next_table_name", json::string(nP4Name)}, + {"next_table_logical_id", json::number(n->logical_id)}, + {"next_table_stage_no", 
json::number(n->stage->stageno)}}); + } + json::vector &action_format_per_action_imm_fields = + action_format_per_action["immediate_fields"] = json::vector(); + for (auto &a : act.alias) { + json::string name = a.first; + int lo = remove_name_tail_range(name); + json::string immed_name = a.second.name; + if (immed_name != "immediate") continue; // output only immediate fields + if (!(act.has_param(name) || a.second.is_constant)) + continue; // and fields that are parameters or constants + json::map action_format_per_action_imm_field; + action_format_per_action_imm_field["param_name"] = name; + action_format_per_action_imm_field["param_type"] = "parameter"; + if (a.second.is_constant) { + action_format_per_action_imm_field["param_type"] = "constant"; + action_format_per_action_imm_field["const_value"] = a.second.value; + action_format_per_action_imm_field["param_name"] = + "constant_" + std::to_string(a.second.value); + } + action_format_per_action_imm_field["param_shift"] = lo; + action_format_per_action_imm_field["dest_start"] = a.second.lo; + action_format_per_action_imm_field["dest_width"] = a.second.size(); + std::string condition; + if (act.immediate_conditional(a.second.lo, a.second.size(), condition)) { + action_format_per_action_imm_field["is_mod_field_conditionally_value"] = true; + action_format_per_action_imm_field["mod_field_conditionally_mask_field_name"] = + condition; + } + action_format_per_action_imm_fields.push_back( + std::move(action_format_per_action_imm_field)); + } + action_format.push_back(std::move(action_format_per_action)); + } +} + +std::ostream &operator<<(std::ostream &out, const Table::Actions::Action::alias_t &a) { + out << "(" << a.name << ", lineno = " << a.lineno << ", lo = " << a.lo << ", hi = " << a.hi + << ", is_constant = " << a.is_constant << ", value = 0x" << std::hex << a.value << std::dec + << ")"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const Table::Actions::Action &a) { + out << a.name << "("; + 
auto indent = a.name.length() + 10; + for (auto &p : a.p4_params_list) out << p << std::endl << std::setw(indent); + out << ")"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const Table::p4_param &p) { + out << p.name << "[ w =" << p.bit_width << ", w_full =" << p.bit_width_full + << ", start_bit =" << p.start_bit << ", mask = 0x" << p.mask << ", position =" << p.position + << ", default_value =" << p.default_value << ", defaulted =" << p.defaulted + << ", is_valid =" << p.is_valid << ", type =" << p.type << ", alias =" << p.alias + << ", key_name =" << p.key_name << "]"; + return out; +} + +void Table::Actions::add_immediate_mapping(json::map &tbl) { + for (auto &act : *this) { + if (act.alias.empty()) continue; + json::vector &map = tbl["action_to_immediate_mapping"][act.name]; + for (auto &a : act.alias) { + json::string name = a.first; + json::string immed_name = a.second.name; + if (immed_name == "immediate") immed_name = "--immediate--"; + int lo = remove_name_tail_range(name); + map.push_back(json::vector{json::map{ + {"name", std::move(name)}, + {"parameter_least_significant_bit", json::number(lo)}, + {"parameter_most_significant_bit", json::number(lo + a.second.hi - a.second.lo)}, + {"immediate_least_significant_bit", json::number(a.second.lo)}, + {"immediate_most_significant_bit", json::number(a.second.hi)}, + {"field_called", std::move(immed_name)}}}); + } + } +} + +template +void Table::write_mapram_regs(REGS ®s, int row, int col, int vpn, int type) { + auto &mapram_config = regs.rams.map_alu.row[row].adrmux.mapram_config[col]; + // auto &mapram_ctl = map_alu_row.adrmux.mapram_ctl[col]; + mapram_config.mapram_type = type; + mapram_config.mapram_logical_table = logical_id; + mapram_config.mapram_vpn_members = 0; + if (!options.match_compiler) // FIXME -- glass doesn't set this? 
+ mapram_config.mapram_vpn = vpn; + if (gress == INGRESS) + mapram_config.mapram_ingress = 1; + else + mapram_config.mapram_egress = 1; + mapram_config.mapram_enable = 1; + mapram_config.mapram_ecc_check = 1; + mapram_config.mapram_ecc_generate = 1; + if (gress) regs.cfg_regs.mau_cfg_mram_thread[col / 3U] |= 1U << (col % 3U * 8U + row); +} +FOR_ALL_REGISTER_SETS(INSTANTIATE_TARGET_TEMPLATE, void Table::write_mapram_regs, mau_regs &, int, + int, int, int) + +HashDistribution *Table::find_hash_dist(int unit) { + for (auto &hd : hash_dist) + if (hd.id == unit) return &hd; + for (auto t : get_match_tables()) + for (auto &hd : t->hash_dist) + if (hd.id == unit) return &hd; + if (auto *a = get_attached()) + for (auto &call : a->meters) + for (auto &hd : call->hash_dist) + if (hd.id == unit) return &hd; + return nullptr; +} + +int Table::find_on_actionbus(const char *name, TableOutputModifier mod, int lo, int hi, int size, + int *len) { + return action_bus ? action_bus->find(name, mod, lo, hi, size, len) : -1; +} + +void Table::need_on_actionbus(Table *att, TableOutputModifier mod, int lo, int hi, int size) { + if (!action_bus) action_bus = ActionBus::create(); + action_bus->need_alloc(this, att, mod, lo, hi, size); +} + +int Table::find_on_actionbus(const ActionBusSource &src, int lo, int hi, int size, int pos) { + return action_bus ? 
action_bus->find(src, lo, hi, size, pos) : -1; +} + +void Table::need_on_actionbus(const ActionBusSource &src, int lo, int hi, int size) { + if (!action_bus) action_bus = ActionBus::create(); + action_bus->need_alloc(this, src, lo, hi, size); +} + +int Table::find_on_ixbar(Phv::Slice sl, InputXbar::Group group, InputXbar::Group *found) { + for (auto &ixb : input_xbar) { + if (auto *i = ixb->find(sl, group, found)) { + unsigned bit = (i->lo + sl.lo - i->what->lo); + BUG_CHECK(bit < 128); + return bit / 8; + } + } + if (group.index >= 0) { + for (auto *in : stage->ixbar_use[group]) { + if (auto *i = in->find(sl, group)) { + unsigned bit = (i->lo + sl.lo - i->what->lo); + BUG_CHECK(bit < 128); + return bit / 8; + } + } + } else { + for (auto &g : Keys(stage->ixbar_use)) { + if (g.type != group.type) continue; + int t; + if ((t = find_on_ixbar(sl, g)) >= 0) { + if (found) *found = g; + return t; + } + } + } + return -1; +} + +int Table::json_memunit(const MemUnit &r) const { + if (r.stage >= 0) { + return r.stage * Target::SRAM_STRIDE_STAGE() + r.row * Target::SRAM_STRIDE_ROW() + + r.col * Target::SRAM_STRIDE_COLUMN(); + } else if (r.row >= 0) { + // per-stage physical sram + return r.row * Target::SRAM_UNITS_PER_ROW() + r.col; + } else { + // lamb + return r.col; + } +} + +std::unique_ptr Table::gen_memory_resource_allocation_tbl_cfg( + const char *type, const std::vector &layout, bool skip_spare_bank) const { + int width, depth, period; + const char *period_name; + // FIXME -- calling vpn_params here is only valid when layout == this->layout, but we also + // FIXME -- get here for color_maprams. 
It works out as we don't use depth or width, only + // FIXME -- period, which will always be 1 for meter layout or color_maprams + vpn_params(width, depth, period, period_name); + json::map mra; + mra["memory_type"] = type; + std::vector> mem_units; + json::vector &mem_units_and_vpns = mra["memory_units_and_vpns"] = json::vector(); + int vpn_ctr = 0; + bool no_vpns = false; + int spare_vpn; + std::vector spare_mem; + + // Retrieve the Spare banks + // skip_spare_bank is only false on tables don't have spare banks, or when building + // memory_units json for map rams + if (skip_spare_bank) { + BUG_CHECK(&layout == &this->layout, "layout not matching"); + spare_mem = determine_spare_bank_memory_units(); + BUG_CHECK(!spare_mem.empty(), "No spare banks in %s?", name()); + // if all the mems are "spare" this is really a DDP table, so we want to + // put the usits/vpns of the spares in the memory_units json + if (spare_mem.size() == layout_size()) skip_spare_bank = false; + } else if (&layout == &this->layout) { + BUG_CHECK(determine_spare_bank_memory_units().empty(), + "%s has spare banks, but we're not skipping them?", name()); + } + + for (auto &row : layout) { + int word = row.word >= 0 ? row.word : 0; + auto vpn_itr = row.vpns.begin(); + for (auto &ram : row.memunits) { + BUG_CHECK(ram.row == row.row, "bogus %s in row %d", ram.desc(), row.row); + if (vpn_itr == row.vpns.end()) + no_vpns = true; + else + vpn_ctr = *vpn_itr++; + if (size_t(vpn_ctr) >= mem_units.size()) mem_units.resize(vpn_ctr + 1); + // Create a vector indexed by vpn no where each element is a map + // having a RAM entry indexed by word number + // VPN WORD RAM + // 0 -> 0 90 + // 1 91 + // 1 -> 0 92 + // 1 93 + // E.g. 
VPN 0 has Ram 90 with word 0 and Ram 91 with word 1 + int unit = json_memunit(ram); + if (skip_spare_bank && + std::find(spare_mem.begin(), spare_mem.end(), unit) != spare_mem.end()) + continue; + mem_units[vpn_ctr][word] = json_memunit(ram); + } + } + if (mem_units.size() == 0) return nullptr; + int vpn = 0; + for (auto &mem_unit : mem_units) { + json::vector mem; + // Below for loop orders the mem unit as { .., word1, word0 } which is + // assumed to be what driver expects. + for (int word = mem_unit.size() - 1; word >= 0; word--) { + for (auto m : mem_unit) { + if (m.first == word) { + mem.push_back(m.second); + break; + } + } + } + if (mem.size() != 0) { + json::map tmp; + tmp["memory_units"] = std::move(mem); + json::vector vpns; + if (no_vpns) + vpns.push_back(nullptr); + else + vpns.push_back(vpn); + tmp["vpns"] = std::move(vpns); + mem_units_and_vpns.push_back(std::move(tmp)); + } + vpn++; + } + if (skip_spare_bank && spare_mem.size() != 0) { + if (spare_mem.size() == 1) { + mra["spare_bank_memory_unit"] = spare_mem[0]; + } else { + json::vector &spare = mra["spare_bank_memory_unit"]; + for (auto u : spare_mem) spare.push_back(u); + } + } + return json::mkuniq(std::move(mra)); +} + +json::map *Table::base_tbl_cfg(json::vector &out, const char *type, int size) const { + auto tbl = p4_table->base_tbl_cfg(out, size, this); + if (context_json) add_json_node_to_table(*tbl, "user_annotations"); + return tbl; +} + +json::map *Table::add_stage_tbl_cfg(json::map &tbl, const char *type, int size) const { + json::vector &stage_tables = tbl["stage_tables"]; + json::map stage_tbl; + stage_tbl["stage_number"] = stage->stageno; + stage_tbl["size"] = size; + stage_tbl["stage_table_type"] = type; + stage_tbl["logical_table_id"] = logical_id; + if (physical_ids) { + // this is only used by the driver to set miss entry imem/iad/next, so it should + // not matter which physical table it is set on if there are multiple + stage_tbl["physical_table_id"] = *physical_ids.begin(); + 
} + + if (this->to()) { + stage_tbl["has_attached_gateway"] = false; + if (get_gateway()) stage_tbl["has_attached_gateway"] = true; + } + if (!strcmp(type, "selection") && get_stateful()) + tbl["bound_to_stateful_table_handle"] = get_stateful()->handle(); + if (Target::SUPPORT_ALWAYS_RUN() && (this->to() || this->to())) + stage_tbl["always_run"] = is_always_run(); + + stage_tables.push_back(std::move(stage_tbl)); + return &(stage_tables.back()->to()); +} + +/** + * One can no longer use whether the table is directly or indirectly addressed on whether + * a table is referenced that way. This is due to the corner case on hash action tables + * For a hash action table, an attached table that was previously directly addressed is now + * addressed by hash. However, for the driver, the driver must know which tables used to be + * directly addressed vs. an attached table that is addressed by a hash based index. + * + * Thus, for those corner cases, a how_referenced in the p4 tag of the attached table is + * currently provided. Really for an attached table in hardware, it has no sense of how the + * table is addressed, as it only receives an address, so if somehow two tables, where one was + * direct while another was indirect (which is theoretically supportable if a hash action direct + * counter is shared), would break this parameter. + * + * However, for the moment, there are no realistic attached table not either directly or indirectly + * referenced + * + * If we need to change this, this was the delineation for how this was determined in match tables: + * + * In the call for both of these examples, the address field is a hash_dist object, as this is + * necessary for the set up of the address. This call, unlike every other type table, cannot + * be the place where the address is determined. + * + * Instead, the attached calls in the action is how the assembler can delineate whether the + * reference table is direct or indirect. 
If the address argument is $DIRECT, then the direct + * table has been converted to a hash, however if the argument is $hash_dist, then the original + * call was from a hash-based index, and is indirect + */ +void Table::add_reference_table(json::vector &table_refs, const Table::Call &c) const { + if (c) { + auto t_name = c->name(); + if (c->p4_table) { + t_name = c->p4_table->p4_name(); + if (!t_name) { + error(-1, "No p4 table name found for table : %s", c->name()); + return; + } + } + // Dont add ref table if already present in table_refs vector + for (auto &tref : table_refs) { + auto tref_name = tref->to()["name"]; + if (!strcmp(tref_name->as_string()->c_str(), t_name)) return; + } + json::map table_ref; + std::string hr = c->to()->how_referenced(); + if (hr.empty()) hr = c->to()->is_direct() ? "direct" : "indirect"; + table_ref["how_referenced"] = hr; + table_ref["handle"] = c->handle(); + table_ref["name"] = t_name; + auto mtr = c->to(); + if (mtr && mtr->uses_colormaprams()) { + BUG_CHECK(mtr->color_mapram_addr != MeterTable::NO_COLOR_MAP, + "inconsistent color mapram address bus for %s", mtr->name()); + table_ref["color_mapram_addr_type"] = + mtr->color_mapram_addr == MeterTable::IDLE_MAP_ADDR ? 
"idle" : "stats"; + } + + table_refs.push_back(std::move(table_ref)); + } +} + +bool Table::is_directly_referenced(const Table::Call &c) const { + if (c) { + std::string hr = c->to()->how_referenced(); + if (hr.empty()) { + if (c->to()->is_direct()) return true; + } + } + return false; +} + +json::map &Table::add_pack_format(json::map &stage_tbl, int memword, int words, int entries) const { + json::map pack_fmt; + pack_fmt["table_word_width"] = memword * words; + pack_fmt["memory_word_width"] = memword; + if (entries >= 0) pack_fmt["entries_per_table_word"] = entries; + pack_fmt["number_memory_units_per_table_word"] = words; + json::vector &pack_format = stage_tbl["pack_format"]; + pack_format.push_back(std::move(pack_fmt)); + return pack_format.back()->to(); +} + +void Table::canon_field_list(json::vector &field_list) const { + for (auto &field_ : field_list) { + auto &field = field_->to(); + auto &name = field["field_name"]->to(); + if (int lo = remove_name_tail_range(name)) field["start_bit"]->to().val += lo; + } +} + +std::vector Table::get_calls() const { + std::vector rv; + if (action) rv.emplace_back(action); + if (instruction) rv.emplace_back(instruction); + return rv; +} + +/** + * Determines both the start bit and the source name in the context JSON node for a particular + * field. 
Do not like string matching, and this should potentially be determined by looking + * through a list of fields, but this will work in the short term + */ +void Table::get_cjson_source(const std::string &field_name, std::string &source, + int &start_bit) const { + source = "spec"; + if (field_name == "hash_group") { + source = "proxy_hash"; + } else if (field_name == "version") { + source = "version"; + } else if (field_name == "immediate") { + source = "immediate"; + } else if (field_name == "action") { + source = "instr"; + } else if (field_name == "next") { + source = "next_table"; + } else if (field_name == "action_addr") { + source = "adt_ptr"; + if (auto adt = action->to()) start_bit = std::min(5U, adt->get_log2size() - 2); + } else if (field_name == "counter_addr") { + source = "stats_ptr"; + auto a = get_attached(); + if (a && a->stats.size() > 0) { + auto s = a->stats[0]; + start_bit = s->address_shift(); + } + } else if (field_name == "counter_pfe") { + source = "stats_ptr"; + start_bit = STATISTICS_PER_FLOW_ENABLE_START_BIT; + } else if (field_name == "meter_addr") { + if (auto m = get_meter()) { + source = "meter_ptr"; + start_bit = m->address_shift(); + } else if (auto s = get_selector()) { + source = "sel_ptr"; + start_bit = s->address_shift(); + } else if (auto s = get_stateful()) { + source = "stful_ptr"; + start_bit = s->address_shift(); + } else { + error(lineno, "Table %s has a meter_addr but no attached meter", name()); + } + } else if (field_name == "meter_pfe") { + if (get_meter()) { + source = "meter_ptr"; + } else if (get_selector()) { + source = "sel_ptr"; + } else if (get_stateful()) { + source = "stful_ptr"; + } else { + error(lineno, "Table %s has a meter_pfe but no attached meter", name()); + } + start_bit = METER_PER_FLOW_ENABLE_START_BIT; + } else if (field_name == "meter_type") { + if (get_meter()) + source = "meter_ptr"; + else if (get_selector()) + source = "sel_ptr"; + else if (get_stateful()) + source = "stful_ptr"; + else + 
error(lineno, "Table %s has a meter_type but no attached meter", name()); + start_bit = METER_TYPE_START_BIT; + } else if (field_name == "sel_len_mod") { + source = "selection_length"; + } else if (field_name == "sel_len_shift") { + source = "selection_length_shift"; + } else if (field_name == "valid") { + source = "valid"; + } +} + +/** + * Adds a field into the format of either a match or action table. Honestly, this is used + * for both action data tables and match tables, and this should be split up into two + * separate functions, as the corner casing for these different cases can be quite different + * and lead to some significant confusion + */ +void Table::add_field_to_pack_format(json::vector &field_list, unsigned basebit, std::string name, + const Table::Format::Field &field, + const Table::Actions::Action *act) const { + decltype(act->reverse_alias()) aliases; + if (act) aliases = act->reverse_alias(); + auto alias = get(aliases, name); + + // we need to add only those aliases that are parameters, and there can be multiple + // such fields that contain slices of one or more other aliases + // FIXME: why aren't we de-aliasing in setup? + for (auto a : alias) { + json::string param_name = a->first; + int lo = remove_name_tail_range(param_name); + if (act->has_param(param_name) || a->second.is_constant) { + auto newField = field; + if (a->second.hi != -1) { + unsigned fieldSize = a->second.hi - a->second.lo + 1; + if (field.bits.size() > 1) warning(0, "multiple bit ranges for %s", name.c_str()); + newField = + Table::Format::Field(field.fmt, fieldSize, a->second.lo + field.bits[0].lo, + static_cast(field.flags)); + } + act->check_conditional(newField); + + if (a->second.is_constant) + output_field_to_pack_format(field_list, basebit, a->first, "constant", 0, newField, + a->second.value); + else + output_field_to_pack_format(field_list, basebit, a->first, "spec", 0, newField); + } + } + + // Determine the source of the field. 
If called recursively for an alias, + // act will be a nullptr + std::string source = ""; + int start_bit = 0; + if (!act) get_cjson_source(name, source, start_bit); + + if (field.flags == Format::Field::ZERO) source = "zero"; + + if (source != "") + output_field_to_pack_format(field_list, basebit, name, source, start_bit, field); + + // Convert fields with slices embedded in the name, eg. "foo.bar[4:0]", to + // slice-free field names with the start_bit incremented by the low bit of + // the slice. + canon_field_list(field_list); +} + +void Table::output_field_to_pack_format(json::vector &field_list, unsigned basebit, + std::string name, std::string source, unsigned start_bit, + const Table::Format::Field &field, unsigned value) const { + unsigned add_width = 0; + bool pfe_enable = false; + unsigned indirect_addr_start_bit = 0; + int lobit = 0; + for (auto &bits : field.bits) { + json::map field_entry; + field_entry["start_bit"] = lobit + start_bit; + field_entry["field_width"] = bits.size() + add_width; + field_entry["lsb_mem_word_idx"] = bits.lo / MEM_WORD_WIDTH; + field_entry["msb_mem_word_idx"] = bits.hi / MEM_WORD_WIDTH; + field_entry["source"] = json::string(source); + field_entry["enable_pfe"] = false; + if (source == "constant") { + field_entry["const_tuples"] = + json::vector{json::map{{"dest_start", json::number(0)}, + {"value", json::number(value)}, + {"dest_width", json::number(bits.size())}}}; + } + field_entry["lsb_mem_word_offset"] = basebit + (bits.lo % MEM_WORD_WIDTH); + field_entry["field_name"] = json::string(name); + field_entry["global_name"] = json::string(""); + + if (field.conditional_value) { + field_entry["is_mod_field_conditionally_value"] = true; + field_entry["mod_field_conditionally_mask_field_name"] = json::string(field.condition); + } + // field_entry["immediate_name"] = json::string(immediate_name); + // if (this->to()) + if (this->to()) { + // FIXME-JSON : match_mode only matters for ATCAM's not clear if + // 'unused' or 'exact' 
is used by driver + std::string match_mode = "unused"; + // For version bits field match mode is set to "s1q0" (to match + // glass) + if (name == "version") match_mode = "s1q0"; + field_entry["match_mode"] = match_mode; + } + field_list.push_back(std::move(field_entry)); + lobit += bits.size(); + } +} + +void Table::add_zero_padding_fields(Table::Format *format, Table::Actions::Action *act, + unsigned format_width) const { + if (!format) return; + // For an action with no format pad zeros for action table size + unsigned pad_count = 0; + if (format->log2size == 0) { + if (auto at = this->to()) { + format->size = at->get_size(); + BUG_CHECK(format->size); + format->log2size = at->get_log2size(); + // For wide action formats, entries per word is 1, so plug in a + // single pad field of 256 bits + unsigned action_entries_per_word = std::max(1U, 128U / format->size); + // Add a flag type to specify padding? + Format::Field f(format, format->size, 0, Format::Field::ZERO); + for (unsigned i = 0; i < action_entries_per_word; i++) + format->add_field(f, "--padding--"); + } else { + error(lineno, + "Adding zero padding to a non action table " + "which has no action entries in format"); + } + return; + } + decltype(act->reverse_alias()) alias; + if (act) alias = act->reverse_alias(); + + // Determine the zero padding necessary by creating a bitvector that has all + // bits cleared, and then iterate through parameters and immediates and set the + // bits that are used. Create padding for the remaining bit ranges. 
+ bitvec padbits; + padbits.clrrange(0, format_width - 1); + for (int entry = 0; entry < format->groups(); ++entry) { + for (auto &field : format->group(entry)) { + auto aliases = get(alias, field.first); + for (auto a : aliases) { + auto newField = field.second; + json::string param_name = a->first; + int lo = remove_name_tail_range(param_name); + if (act->has_param(param_name) || a->second.is_constant) { + auto newField = Table::Format::Field( + field.second.fmt, a->second.size(), a->second.lo + field.second.bits[0].lo, + static_cast(field.second.flags)); + newField.set_field_bits(padbits); + } + } + if (aliases.size() == 0) field.second.set_field_bits(padbits); + } + } + + int idx_lo = 0; + for (auto p : padbits) { + if (p > idx_lo) { + Format::Field f(format, p - idx_lo, idx_lo, Format::Field::ZERO); + std::string pad_name = + "--padding_" + std::to_string(idx_lo) + "_" + std::to_string(p - 1) + "--"; + format->add_field(f, pad_name); + } + idx_lo = p + 1; + } + if (idx_lo < int(format_width)) { + Format::Field f(format, format_width - idx_lo, idx_lo, Format::Field::ZERO); + std::string pad_name = + "--padding_" + std::to_string(idx_lo) + "_" + std::to_string(format_width - 1) + "--"; + format->add_field(f, pad_name); + } +} + +json::map &Table::add_pack_format(json::map &stage_tbl, Table::Format *format, bool pad_zeros, + bool print_fields, Table::Actions::Action *act) const { + // Add zero padding fields to format + // FIXME: Can this be moved to a format pass? + if (pad_zeros) + add_zero_padding_fields(format, act, format ? format->get_padding_format_width() : -1); + json::map pack_fmt; + auto mem_word_width = ram_word_width(); + pack_fmt["memory_word_width"] = mem_word_width; + auto table_word_width = format ? format->get_table_word_width() : ram_word_width(); + pack_fmt["table_word_width"] = table_word_width; + pack_fmt["entries_per_table_word"] = format ? 
format->get_entries_per_table_word() : 1; + pack_fmt["number_memory_units_per_table_word"] = + format ? format->get_mem_units_per_table_word() : 1; + + /** + * Entry number has to be unique for all tables. However, for ATCAM tables specifically, + * the entry with the highest priority starts at entry number 0. The priority decreases + * as the entry number increases. + * + * This is actually reversed in the hardware. The compiler format entries are in priority + * order in the hardware, and have been validated in validate_format. Thus, the context + * JSON is reversed. + */ + if (print_fields) { + BUG_CHECK(format); + int basebit = std::max(0, mem_word_width - (1 << format->log2size)); + json::vector &entry_list = pack_fmt["entries"]; + if (format->is_wide_format()) { + for (int i = format->groups() - 1; i >= 0; --i) { + int entry_number = i; + if (table_type() == ATCAM) entry_number = format->groups() - 1 - i; + json::vector field_list; + for (auto it = format->begin(i); it != format->end(i); ++it) + add_field_to_pack_format(field_list, basebit, it->first, it->second, act); + entry_list.push_back(json::map{{"entry_number", json::number(entry_number)}, + {"fields", std::move(field_list)}}); + } + } else { + for (int i = format->get_entries_per_table_word() - 1; i >= 0; --i) { + int entry_number = i; + if (table_type() == ATCAM) + entry_number = format->get_entries_per_table_word() - 1 - i; + json::vector field_list; + for (auto &field : *format) + add_field_to_pack_format(field_list, basebit, field.first, field.second, act); + entry_list.push_back(json::map{{"entry_number", json::number(entry_number)}, + {"fields", std::move(field_list)}}); + basebit -= 1 << format->log2size; + } + } + } + if (act) pack_fmt["action_handle"] = act->handle; + json::vector &pack_format = stage_tbl["pack_format"]; + pack_format.push_back(std::move(pack_fmt)); + return pack_format.back()->to(); +} + +// Check if node exists in context_json entry in bfa, add entry to the input +// json 
node and remove the entry from context_json. +// +// Set parameter "append" to true in order to append to existing entries in +// specified section of context_json. Set to false to overwrite. Applies +// only to json::vector containers. +bool Table::add_json_node_to_table(json::map &tbl, const char *name, bool append) const { + if (context_json) { + if (context_json->count(name)) { + std::unique_ptr new_obj = context_json->remove(name); + json::vector *add_vect = nullptr; + if (append && (add_vect = dynamic_cast(new_obj.get()))) { + json::vector &new_vect = tbl[name]; + std::move(add_vect->begin(), add_vect->end(), std::back_inserter(new_vect)); + } else + tbl[name] = std::move(new_obj); + return true; + } + } + return false; +} + +void Table::add_match_key_cfg(json::map &tbl) const { + json::vector ¶ms = tbl["match_key_fields"]; + if ((!p4_params_list.empty()) && this->to()) { + // If a table is splitted to different stages in backend, the + // match_key_fields section will be populated every time the splitted + // tables are emitted. Therefore, we clear the vector before populating + // it again to avoid duplicated keys. + params.clear(); + for (auto &p : p4_params_list) { + json::map param; + std::string name = p.name; + std::string global_name = ""; + if (p.key_name.empty()) { + param["name"] = name; + } else { + // Presence of key name indicates the field has a name + // annotation. If the name annotation is on a field slice, then + // the slice is treated as a field with the key_name as its + // "name". The field output will have the same bit_width and + // bit_width_full indicating its not treated as a slice. We + // also provide the original p4 name as the "global_name" to + // allow driver to use it as a lookup up against the snapshot + // fields published in context.json. These fields will all have + // original p4 field names. 
+ param["name"] = p.key_name; + param["global_name"] = p.name; + } + param["start_bit"] = p.start_bit; + param["bit_width"] = p.bit_width; + param["bit_width_full"] = p.bit_width_full; + if (!p.mask.empty()) { + std::stringstream ss; + ss << "0x" << p.mask; + param["mask"] = ss.str(); + } + param["position"] = p.position; + param["match_type"] = p.type; + param["is_valid"] = p.is_valid; + std::string fieldname, instname; + gen_instfield_name(name, instname, fieldname); + param["instance_name"] = instname; + param["field_name"] = fieldname; + if (!p.alias.empty()) param["alias"] = p.alias; + if (p.context_json) param.merge(*p.context_json.get()); + params.push_back(std::move(param)); + if (p.type == "range") tbl["uses_range"] = true; + } + } +} + +template +void Table::init_json_node(json::map &tbl, const char *name) const { + if (tbl.count(name)) return; + tbl[name] = T(); +} + +void Table::common_tbl_cfg(json::map &tbl) const { + tbl["default_action_handle"] = get_default_action_handle(); + tbl["action_profile"] = action_profile(); + // FIXME -- setting next_table_mask unconditionally only works because we process the + // stage table in stage order (so we'll end up with the value from the last stage table, + // which is what we want.) Should we check in case the ordering ever changes? + tbl["default_next_table_mask"] = next_table_adr_mask; + // FIXME -- the driver currently always assumes this is 0, so we arrange for it to be + // when choosing the action encoding. 
But we should be able to choose something else + tbl["default_next_table_default"] = 0; + // FIXME-JSON: PD related, check glass examples for false (ALPM) + tbl["is_resource_controllable"] = true; + tbl["uses_range"] = false; + if (p4_table && p4_table->disable_atomic_modify) tbl["disable_atomic_modify"] = true; + add_match_key_cfg(tbl); + init_json_node(tbl, "ap_bind_indirect_res_to_match"); + init_json_node(tbl, "static_entries"); + if (context_json) { + add_json_node_to_table(tbl, "ap_bind_indirect_res_to_match"); + } +} + +void Table::add_result_physical_buses(json::map &stage_tbl) const { + json::vector &result_physical_buses = stage_tbl["result_physical_buses"] = json::vector(); + for (auto l : layout) { + if (l.bus.count(Layout::RESULT_BUS)) + result_physical_buses.push_back(l.row * 2 + l.bus.at(Layout::RESULT_BUS)); + } +} + +void Table::merge_context_json(json::map &tbl, json::map &stage_tbl) const { + if (context_json) { + add_json_node_to_table(tbl, "static_entries", true); + stage_tbl.merge(*context_json); + } +} diff --git a/backends/tofino/bf-asm/tables.h b/backends/tofino/bf-asm/tables.h new file mode 100644 index 00000000000..ad66ae2fcea --- /dev/null +++ b/backends/tofino/bf-asm/tables.h @@ -0,0 +1,2246 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TABLES_H_ +#define BACKENDS_TOFINO_BF_ASM_TABLES_H_ + +#include +#include +#include +#include +#include + +#include "backends/tofino/bf-asm/alloc.h" +#include "backends/tofino/bf-asm/asm-types.h" +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/json.h" +#include "backends/tofino/bf-asm/map.h" +#include "backends/tofino/bf-asm/p4_table.h" +#include "backends/tofino/bf-asm/phv.h" +#include "backends/tofino/bf-asm/slist.h" +#include "backends/tofino/bf-asm/target.h" +#include "constants.h" +#include "hash_dist.h" +#include "input_xbar.h" +#include "lib/algorithm.h" +#include "lib/bitops.h" +#include "lib/bitvec.h" +#include "lib/ordered_map.h" + +class ActionBus; +struct ActionBusSource; +class AttachedTable; +struct AttachedTables; +class GatewayTable; +class IdletimeTable; +class ActionTable; +struct Instruction; +class InputXbar; +class MatchTable; +class SelectionTable; +class StatefulTable; +class MeterTable; +class Synth2Port; +class Stage; +struct HashCol; + +struct RandomNumberGen { + int unit; + explicit RandomNumberGen(int u) : unit(u) {} + bool operator==(const RandomNumberGen &a) const { return unit == a.unit; } +}; + +enum class TableOutputModifier { NONE, Color, Address }; +std::ostream &operator<<(std::ostream &, TableOutputModifier); + +/* a memory storage 'unit' somewhere on the chip */ +struct MemUnit { + int stage = INT_MIN; // current stage (only) for tofino1/2 + // can have negative stage numbers for tcams in egress + int row = -1; + int col; // (lamb) unit when row == -1 + MemUnit() = delete; + MemUnit(const MemUnit &) = default; + MemUnit(MemUnit &&) = default; + MemUnit &operator=(const MemUnit &) = default; + MemUnit &operator=(MemUnit &&) = default; + virtual ~MemUnit() {} + explicit MemUnit(int unit) : col(unit) {} + MemUnit(int r, int c) : row(r), col(c) {} + MemUnit(int s, int r, int c) : stage(s), row(r), col(c) {} + bool 
operator==(const MemUnit &a) const { + return std::tie(stage, row, col) == std::tie(a.stage, a.row, a.col); + } + bool operator!=(const MemUnit &a) const { + return std::tie(stage, row, col) != std::tie(a.stage, a.row, a.col); + } + bool operator<(const MemUnit &a) const { + return std::tie(stage, row, col) < std::tie(a.stage, a.row, a.col); + } + virtual const char *desc() const; // Short lived temp for messages + friend std::ostream &operator<<(std::ostream &out, const MemUnit &m) { return out << m.desc(); } +}; + +class Table { + public: + struct Layout { + /* Holds the layout of which rams/tcams/busses are used by the table + * These refer to rows/columns in different spaces: + * ternary match refers to tcams (12x2) + * exact match and ternary indirect refer to physical srams (8x12) + * action (and others?) refer to logical srams (16x6) + * vpns contains the (base)vpn index of each ram in the row + * maprams contain the map ram indexes for synthetic 2-port memories + * vpns/maprams (if not empty) must match up to memunits (same size) */ + int lineno = -1; + int row = -1; + enum bus_type_t { SEARCH_BUS, RESULT_BUS, TIND_BUS, IDLE_BUS, L2R_BUS, R2L_BUS }; + std::map bus; + + int word = -1; // which word for wide tables + bool home_row = false; // is this a home row + std::vector memunits; + std::vector vpns, maprams; + Layout() = default; + Layout(int l, int r) : lineno(l), row(r) {} + friend std::ostream &operator<<(std::ostream &, const Layout &); + + bool word_initialized() const { return word >= 0; } + bool operator==(const Layout &) const; + bool operator!=(const Layout &a) const { return !(*this == a); } + }; + + protected: + Table(int line, std::string &&n, gress_t gr, Stage *s, + int lid = -1); // NOLINT(whitespace/operators) + virtual ~Table(); + Table(const Table &) = delete; + Table(Table &&) = delete; + virtual void setup(VECTOR(pair_t) & data) = 0; + virtual void common_init_setup(const VECTOR(pair_t) &, bool, P4Table::type); + virtual bool 
common_setup(pair_t &, const VECTOR(pair_t) &, P4Table::type); + void setup_context_json(value_t &); + void setup_layout(std::vector &, const VECTOR(pair_t) & data, const char *subname = ""); + int setup_layout_bus_attrib(std::vector &, const value_t &data, const char *what, + Layout::bus_type_t type); + int setup_layout_attrib(std::vector &, const value_t &data, const char *what, + int Layout::*attr); + void setup_logical_id(); + void setup_actions(value_t &); + void setup_maprams(value_t &); + void setup_vpns(std::vector &, VECTOR(value_t) *, bool allow_holes = false); + virtual void vpn_params(int &width, int &depth, int &period, const char *&period_name) const { + BUG(); + } + virtual int get_start_vpn() { return 0; } + void alloc_rams(bool logical, BFN::Alloc2Dbase
&use, + BFN::Alloc2Dbase
*bus_use = 0, + Layout::bus_type_t bus_type = Layout::SEARCH_BUS); + void alloc_global_bus(Layout &, Layout::bus_type_t, int, int, int, int); + virtual void alloc_global_busses(); + void alloc_global_srams(); + void alloc_global_tcams(); + void alloc_busses(BFN::Alloc2Dbase
&bus_use, Layout::bus_type_t bus_type); + void alloc_id(const char *idname, int &id, int &next_id, int max_id, bool order, + BFN::Alloc1Dbase
&use); + void alloc_maprams(); + virtual void alloc_vpns(); + virtual Layout::bus_type_t default_bus_type() const { return Layout::SEARCH_BUS; } + void need_bus(int lineno, BFN::Alloc1Dbase
&use, int idx, const char *name); + static bool allow_ram_sharing(const Table *t1, const Table *t2); + + public: + class Type { + static std::map *all; + std::map::iterator self; + + protected: + explicit Type(std::string &&); // NOLINT(whitespace/operators) + explicit Type(const char *name) : Type(std::string(name)) {} + virtual ~Type(); + + public: + static Type *get(const char *name) { return ::get(all, name); } + static Type *get(const std::string &name) { return ::get(all, name); } + virtual Table *create(int lineno, const char *name, gress_t gress, Stage *stage, int lid, + VECTOR(pair_t) & data) = 0; + }; + + struct Ref { + int lineno; + std::string name; + Ref() : lineno(-1) {} + Ref(const Ref &) = default; + Ref(Ref &&) = default; + Ref &operator=(const Ref &a) & { + name = a.name; + if (lineno < 0) lineno = a.lineno; + return *this; + } + Ref &operator=(Ref &&a) & { + name = a.name; + if (lineno < 0) lineno = a.lineno; + return *this; + } + Ref &operator=(const value_t &a) & { + BUG_CHECK(a.type == tSTR); + name = a.s; + lineno = a.lineno; + return *this; + } + Ref(const std::string &n) : lineno(-1), name(n) {} // NOLINT(runtime/explicit) + Ref(const char *n) : lineno(-1), name(n) {} // NOLINT(runtime/explicit) + Ref(const value_t &a) : lineno(a.lineno) { // NOLINT(runtime/explicit) + if (CHECKTYPE(a, tSTR)) name = a.s; + } + Ref &operator=(const std::string &n) { + name = n; + return *this; + } + operator bool() const { return all && all->count(name) > 0; } + operator Table *() const { return ::get(all, name); } + Table *operator->() const { return ::get(all, name); } + bool set() const { return lineno >= 0; } + bool operator==(const Table *t) const { return name == t->name_; } + bool operator==(const char *t) const { return name == t; } + bool operator==(const std::string &t) const { return name == t; } + bool operator==(const Ref &a) const { return name == a.name; } + bool operator<(const Ref &a) const { return name < a.name; } + bool check() const { + 
if (set() && !*this) error(lineno, "No table named %s", name.c_str()); + return *this; + } + }; + + class NextTables { + std::set next; + unsigned lb_tags = 0; // long branch tags to use (bitmask) + const Table *next_table_ = nullptr; // table to use as next table (if any) + bool resolved = false; + bool can_use_lb(int stage, const NextTables &); + + public: + int lineno = -1; + NextTables() = default; + NextTables(const NextTables &) = default; + NextTables(NextTables &&) = default; + NextTables &operator=(const NextTables &a) = default; + NextTables &operator=(NextTables &&) = default; + NextTables(value_t &v); // NOLINT(runtime/explicit) + + std::set::iterator begin() const { return next.begin(); } + std::set::iterator end() const { return next.end(); } + int size() const { return next.size(); } + bool operator==(const NextTables &a) const { return next == a.next; } + bool subset_of(const NextTables &a) const { + for (auto &n : next) + if (!a.next.count(n)) return false; + return true; + } + void resolve_long_branch(const Table *tbl, const std::map &lbrch); + bool set() const { return lineno >= 0; } + int next_table_id() const { + BUG_CHECK(resolved); + return next_table_ ? 
next_table_->table_id() : Target::END_OF_PIPE(); + } + std::string next_table_name() const { + BUG_CHECK(resolved); + if (next_table_) { + if (auto nxt_p4_name = next_table_->p4_name()) return nxt_p4_name; + } + return "END"; + } + const Table *next_table() const { return next_table_; } + unsigned long_branch_tags() const { return lb_tags; } + unsigned next_in_stage(int stage) const; + bool need_next_map_lut() const; + void force_single_next_table(); + }; + + class Format { + public: + struct bitrange_t { + unsigned lo, hi; + bitrange_t(unsigned l, unsigned h) : lo(l), hi(h) {} + bool operator==(const bitrange_t &a) const { return lo == a.lo && hi == a.hi; } + bool disjoint(const bitrange_t &a) const { return lo > a.hi || a.lo > hi; } + bitrange_t overlap(const bitrange_t &a) const { + // only valid if !disjoint + return bitrange_t(std::max(lo, a.lo), std::min(hi, a.hi)); + } + int size() const { return hi - lo + 1; } + }; + struct Field { + unsigned size = 0, group = 0, flags = 0; + std::vector bits; + Field **by_group = 0; + Format *fmt; // containing format + bool operator==(const Field &a) const { return size == a.size; } + /* return the bit in the format that contains bit i of this field */ + unsigned bit(unsigned i) { + unsigned last = 0; + for (auto &chunk : bits) { + if (i < (unsigned)chunk.size()) return chunk.lo + i; + i -= chunk.size(); + last = chunk.hi + 1; + } + if (i == 0) return last; + BUG(); + return 0; // quiet -Wreturn-type warning + } + /* bit(i), adjusted for the immediate shift of the match group of the field + * returns the bit in the post-extract immediate containing bit i */ + unsigned immed_bit(unsigned i) { + auto rv = bit(i); + if (fmt && fmt->immed) rv -= fmt->immed->by_group[group]->bit(0); + return rv; + } + unsigned hi(unsigned bit) { + for (auto &chunk : bits) + if (bit >= chunk.lo && bit <= chunk.hi) return chunk.hi; + BUG(); + return 0; // quiet -Wreturn-type warning + } + enum flags_t { NONE = 0, USED_IMMED = 1, ZERO = 3 }; + 
bool conditional_value = false; + std::string condition; + explicit Field(Format *f) : fmt(f) {} + Field(Format *f, unsigned size, unsigned lo = 0, enum flags_t fl = NONE) + : size(size), flags(fl), fmt(f) { + if (size) bits.push_back({lo, lo + size - 1}); + } + Field(const Field &f, Format *fmt) + : size(f.size), flags(f.flags), bits(f.bits), fmt(fmt) {} + + /// mark all bits from the field in @param bitset + void set_field_bits(bitvec &bitset) const { + for (auto &b : bits) bitset.setrange(b.lo, b.size()); + } + }; + friend std::ostream &operator<<(std::ostream &, const Field &); + explicit Format(Table *t) : tbl(t) { fmt.resize(1); } + Format(Table *, const VECTOR(pair_t) & data, bool may_overlap = false); + ~Format(); + void pass1(Table *tbl); + void pass2(Table *tbl); + + private: + std::vector> fmt; + std::map::iterator> byindex; + static bool equiv(const ordered_map &, + const ordered_map &); + + public: + int lineno = -1; + Table *tbl; + unsigned size = 0, immed_size = 0; + Field *immed = 0; + unsigned log2size = 0; /* ceil(log2(size)) */ + unsigned overhead_start = 0, overhead_size = 0; // extent of non-match + int overhead_word = -1; + + unsigned groups() const { return fmt.size(); } + const ordered_map &group(int g) const { return fmt.at(g); } + Field *field(const std::string &n, int group = 0) { + BUG_CHECK(group >= 0 && (size_t)group < fmt.size()); + auto it = fmt[group].find(n); + if (it != fmt[group].end()) return &it->second; + return 0; + } + void apply_to_field(const std::string &n, std::function fn) { + for (auto &m : fmt) { + auto it = m.find(n); + if (it != m.end()) fn(&it->second); + } + } + std::string find_field(Field *field) { + for (auto &m : fmt) + for (auto &f : m) + if (field == &f.second) return f.first; + return ""; + } + int find_field_lineno(Field *field) { + for (auto &m : fmt) + for (auto &f : m) + if (field == &f.second) return lineno; + return -1; + } + void add_field(Field &f, std::string name = "dummy", int grp = 0) { + 
fmt[grp].emplace(name, Field(f, this)); + } + decltype(fmt[0].begin()) begin(int grp = 0) { return fmt[grp].begin(); } + decltype(fmt[0].end()) end(int grp = 0) { return fmt[grp].end(); } + decltype(fmt[0].cbegin()) begin(int grp = 0) const { return fmt[grp].begin(); } + decltype(fmt[0].cend()) end(int grp = 0) const { return fmt[grp].end(); } + bool is_wide_format() const { return (log2size >= 7 || groups() > 1) ? true : false; } + int get_entries_per_table_word() const { + // A phase0 table can only have 1 entry + if (tbl->table_type() == PHASE0) return 1; + if (is_wide_format()) return groups(); + return log2size ? (1U << (ceil_log2(tbl->ram_word_width()) - log2size)) : 0; + } + int get_mem_units_per_table_word() const { + return is_wide_format() ? ((size - 1) / tbl->ram_word_width()) + 1 : 1; + } + int get_table_word_width() const { + return is_wide_format() ? tbl->ram_word_width() * get_mem_units_per_table_word() + : tbl->ram_word_width(); + } + int get_padding_format_width() const { + return is_wide_format() ? 
get_mem_units_per_table_word() * tbl->ram_word_width() + : (1U << log2size); + } + }; + + struct Call : Ref { /* a Ref with arguments */ + struct Arg { + enum { Field, HashDist, Counter, Const, Name } type; + + private: + union { + Format::Field *fld; + HashDistribution *hd; + intptr_t val; + char *str; + }; + + void set(const Arg &a) { + type = a.type; + switch (type) { + case Field: + fld = a.fld; + return; + case HashDist: + hd = a.hd; + return; + case Counter: + case Const: + val = a.val; + return; + case Name: + str = a.str; + return; + } + } + + public: + Arg() = delete; + Arg(const Arg &a) { + set(a); + if (type == Name) str = strdup(str); + } + Arg(Arg &&a) { + set(a); + a.type = Const; + } + Arg &operator=(const Arg &a) { + if (&a == this) return *this; + if (a == *this) return *this; + if (type == Name) free(str); + set(a); + if (type == Name) str = strdup(a.str); + return *this; + } + Arg &operator=(Arg &&a) { + std::swap(type, a.type); + std::swap(val, a.val); + return *this; + } + Arg(Format::Field *f) : type(Field) { fld = f; } // NOLINT(runtime/explicit) + Arg(HashDistribution *hdist) : type(HashDist) { // NOLINT(runtime/explicit) + hd = hdist; + } + Arg(int v) : type(Const) { val = v; } // NOLINT(runtime/explicit) + Arg(const char *n) : type(Name) { str = strdup(n); } // NOLINT(runtime/explicit) + Arg(decltype(Counter) ctr, int mode) : type(Counter) { + val = mode; + BUG_CHECK(ctr == Counter); + } + ~Arg() { + if (type == Name) free(str); + } + bool operator==(const Arg &a) const { + if (type != a.type) return false; + switch (type) { + case Field: + return fld == a.fld; + case HashDist: + return hd == a.hd; + case Counter: + case Const: + return val == a.val; + case Name: + return !strcmp(str, a.str); + default: + BUG(); + } + return false; + } + bool operator!=(const Arg &a) const { return !operator==(a); } + Format::Field *field() const { return type == Field ? fld : nullptr; } + HashDistribution *hash_dist() const { return type == HashDist ? 
hd : nullptr; } + const char *name() const { return type == Name ? str : nullptr; } + int count_mode() const { return type == Counter ? val : 0; } + int value() const { return type == Const ? val : 0; } + operator bool() const { return fld != nullptr; } + unsigned size() const; + }; + std::vector args; + void setup(const value_t &v, Table *tbl); + Call() {} + Call(const value_t &v, Table *tbl) { setup(v, tbl); } + bool operator==(const Call &a) const { return Ref::operator==(a) && args == a.args; } + bool operator!=(const Call &a) const { return !(*this == a); } + bool is_direct_call() const { + if (args.size() == 0) return false; + for (auto &a : args) + if (a == "$DIRECT") return true; + return false; + } + }; + + struct p4_param { + std::string name; + std::string alias; + std::string key_name; + unsigned start_bit = 0; + unsigned position = 0; + unsigned bit_width = 0; + unsigned bit_width_full = 0; + bitvec mask; + std::string default_value; // value stored as hex string to accommodate large nos + bool defaulted = false; + bool is_valid = false; + std::string type; + std::unique_ptr context_json; + explicit p4_param(std::string n = "", unsigned p = 0, unsigned bw = 0) + : name(n), position(p), bit_width(bw) {} + }; + friend std::ostream &operator<<(std::ostream &, const p4_param &); + typedef std::vector p4_params; + + class Actions { + public: + struct Action { + struct alias_t { + std::string name; + int lineno = -1, lo = -1, hi = -1; + bool is_constant = false; + unsigned value = 0; + explicit alias_t(value_t &); + unsigned size() const { + if (hi != -1 && lo != -1) + return hi - lo + 1; + else + return 0; + } + std::string to_string() const { + if (hi >= 0 && lo >= 0) + return name + '(' + std::to_string(lo) + ".." 
+ std::to_string(hi) + ')'; + return name; + } + }; + std::string name; + std::string rng_param_name = ""; + int lineno = -1, addr = -1, code = -1; + std::multimap alias; + std::vector> instr; + bitvec slot_use; + unsigned handle = 0; + p4_params p4_params_list; + bool hit_allowed = true; + bool default_allowed = false; + bool default_only = false; + bool is_constant = false; + std::string hit_disallowed_reason = ""; + std::string default_disallowed_reason = ""; + std::vector attached; + int next_table_encode = -1; + NextTables next_table_ref; + NextTables next_table_miss_ref; + std::map> mod_cond_values; + // The hit map points to next tables for actions as ordered in the + // assembly, we use 'position_in_assembly' to map the correct next + // table, as actions can be ordered in the map different from the + // assembly order. + int position_in_assembly = -1; + bool minmax_use = false; // jbay sful min/max + // Predication operand coming into the output ALUs in stateful actions. This attribute + // is used to make sure that all combined predicate outputs from a given stateful action + // have the same form, because the predication operand is always the same in every + // output ALU. 
+ int pred_comb_sel = -1; + std::unique_ptr context_json; + Action(Table *, Actions *, pair_t &, int); + enum mod_cond_loc_t { MC_ADT, MC_IMMED }; + void setup_mod_cond_values(value_t &map); + Action(const char *n, int l); + Action(const Action &) = delete; + Action(Action &&) = delete; + ~Action(); + bool equiv(Action *a); + bool equivVLIW(Action *a); + typedef const decltype(alias)::value_type alias_value_t; + std::map> reverse_alias() const; + std::string alias_lookup(int lineno, std::string name, int &lo, int &hi) const; + bool has_rng() { return !rng_param_name.empty(); } + const p4_param *has_param(std::string param) const { + for (auto &e : p4_params_list) + if (e.name == param) return &e; + return nullptr; + } + void pass1(Table *tbl); + void check_next(Table *tbl); + void check_next_ref(Table *tbl, const Table::Ref &ref) const; + void add_direct_resources(json::vector &direct_resources, const Call &att) const; + void add_indirect_resources(json::vector &indirect_resources, const Call &att) const; + void check_and_add_resource(json::vector &resources, json::map &resource) const; + bool is_color_aware() const; + void gen_simple_tbl_cfg(json::vector &) const; + void add_p4_params(json::vector &, bool include_default = true) const; + void check_conditional(Table::Format::Field &field) const; + bool immediate_conditional(int lo, int sz, std::string &condition) const; + friend std::ostream &operator<<(std::ostream &, const alias_t &); + friend std::ostream &operator<<(std::ostream &, const Action &); + }; + + private: + typedef ordered_map map_t; + map_t actions; + bitvec code_use; + std::map by_code; + bitvec slot_use; + Table *table; + + public: + int max_code = -1; + Actions(Table *tbl, VECTOR(pair_t) &); + typedef map_t::value_type value_type; + typedef IterValues::iterator iterator; + typedef IterValues::iterator const_iterator; + iterator begin() { return iterator(actions.begin()); } + const_iterator begin() const { return const_iterator(actions.begin()); 
} + iterator end() { return iterator(actions.end()); } + const_iterator end() const { return const_iterator(actions.end()); } + int count() { return actions.size(); } + int hit_actions_count() const; + int default_actions_count() const; + Action *action(const std::string &n) { + auto it = actions.find(n); + return it == actions.end() ? nullptr : &it->second; + } + bool exists(const std::string &n) { return actions.count(n) > 0; } + void pass1(Table *); + void pass2(Table *); + void stateful_pass2(Table *); + template + void write_regs(REGS &, Table *); + void add_p4_params(const Action &, json::vector &) const; + void gen_tbl_cfg(json::vector &) const; + void add_immediate_mapping(json::map &); + void add_action_format(const Table *, json::map &) const; + bool has_hash_dist() { return (table->table_type() == HASH_ACTION); } + size_t size() { return actions.size(); } + }; + + public: + const char *name() const { return name_.c_str(); } + const char *p4_name() const { + if (p4_table) { + return p4_table->p4_name(); + } + return nullptr; + } + unsigned p4_size() const { + if (p4_table) { + return p4_table->p4_size(); + } + return 0; + } + unsigned handle() const { + if (p4_table) { + return p4_table->get_handle(); + } + return -1; + } + std::string action_profile() const { + if (p4_table) { + return p4_table->action_profile; + } + return ""; + } + std::string how_referenced() const { + if (p4_table) { + return p4_table->how_referenced; + } + return ""; + } + int table_id() const; + virtual bool is_always_run() const { return false; } + virtual void pass0() {} // only match tables need pass0 + virtual void pass1(); + virtual void pass2() = 0; + virtual void pass3() = 0; + /* C++ does not allow virtual template methods, so we work around it by explicitly + * instantiating overloads for all the virtual template methods we want. 
*/ + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, virtual void write_action_regs, + (mau_regs &, const Actions::Action *), {}) + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, virtual void write_merge_regs, + (mau_regs &, int type, int bus), { assert(0); }) + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, virtual void write_merge_regs, + (mau_regs &, MatchTable *match, int type, int bus, + const std::vector &args), + { assert(0); }) + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, virtual void write_regs, (mau_regs &), = 0) + + virtual void gen_tbl_cfg(json::vector &out) const = 0; + virtual json::map *base_tbl_cfg(json::vector &out, const char *type, int size) const; + virtual json::map *add_stage_tbl_cfg(json::map &tbl, const char *type, int size) const; + virtual std::unique_ptr gen_memory_resource_allocation_tbl_cfg( + const char *type, const std::vector &layout, bool skip_spare_bank = false) const; + virtual std::vector determine_spare_bank_memory_units() const { return {}; } + virtual void common_tbl_cfg(json::map &tbl) const; + void add_match_key_cfg(json::map &tbl) const; + bool add_json_node_to_table(json::map &tbl, const char *name, bool append = false) const; + void allocate_physical_ids(unsigned usable = ~0U); + template + void init_json_node(json::map &tbl, const char *name) const; + enum table_type_t { + OTHER = 0, + TERNARY_INDIRECT, + GATEWAY, + ACTION, + SELECTION, + COUNTER, + METER, + IDLETIME, + STATEFUL, + HASH_ACTION, + EXACT, + TERNARY, + PHASE0, + ATCAM, + PROXY_HASH + }; + virtual table_type_t table_type() const { return OTHER; } + virtual int instruction_set() { return 0; /* VLIW_ALU */ } + virtual table_type_t set_match_table(MatchTable *m, bool indirect) { + assert(0); + return OTHER; + } + virtual const MatchTable *get_match_table() const { + assert(0); + return nullptr; + } + virtual MatchTable *get_match_table() { + assert(0); + return nullptr; + } + virtual std::set get_match_tables() { return std::set(); } + virtual const AttachedTables *get_attached() const { 
return 0; } + virtual AttachedTables *get_attached() { return 0; } + virtual const GatewayTable *get_gateway() const { return 0; } + virtual SelectionTable *get_selector() const { return 0; } + virtual MeterTable *get_meter() const { return 0; } + virtual void set_stateful(StatefulTable *s) { BUG(); } + virtual StatefulTable *get_stateful() const { return 0; } + virtual void set_address_used() { + // FIXME -- could use better error message(s) -- lineno is not accurate/useful + error(lineno, + "Tofino does not support extracting the address used on " + "a non-stateful table %s", + name()); + } + virtual void set_color_used() { + error(lineno, "Cannot extract color on a non-meter table %s", name()); + } + virtual void set_output_used() { + error(lineno, "Cannot extract output on a non-stateful table %s", name()); + } + virtual const Call &get_action() const { return action; } + virtual std::vector get_calls() const; + virtual bool is_attached(const Table *) const { + BUG(); + return false; + } + virtual Format::Field *find_address_field(const AttachedTable *) const { + BUG(); + return 0; + } + virtual Format::Field *get_per_flow_enable_param(MatchTable *) const { + BUG(); + return 0; + } + virtual Format::Field *get_meter_address_param(MatchTable *) const { + BUG(); + return 0; + } + virtual Format::Field *get_meter_type_param(MatchTable *) const { + BUG(); + return 0; + } + virtual int direct_shiftcount() const { + BUG(); + return -1; + } + virtual int indirect_shiftcount() const { + BUG(); + return -1; + } + virtual int address_shift() const { + BUG(); + return -1; + } + virtual int home_row() const { + BUG(); + return -1; + } + /* mem unitno mapping -- unit numbers used in context json */ + virtual int json_memunit(const MemUnit &u) const; + virtual int ram_word_width() const { return MEM_WORD_WIDTH; } + virtual int unitram_type() { + BUG(); + return -1; + } + virtual bool uses_colormaprams() const { return false; } + virtual int color_shiftcount(Table::Call 
&call, int group, int tcam_shift) const { + BUG(); + return -1; + } + virtual bool adr_mux_select_stats() { return false; } + virtual bool run_at_eop() { return false; } + virtual Format *get_format() const { return format.get(); } + virtual unsigned determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) const { + assert(0); + return -1; + } + template + void write_mapram_regs(REGS ®s, int row, int col, int vpn, int type); + template + T *to() { + return dynamic_cast(this); + } + template + const T *to() const { + return dynamic_cast(this); + } + virtual void determine_word_and_result_bus() { BUG(); } + virtual int stm_vbus_column() const { BUG(); } + + std::string name_; + int uid; + P4Table *p4_table = 0; + Stage *stage = 0; + gress_t gress; + int lineno = -1; + int logical_id = -1; + bitvec physical_ids; + std::vector dynamic_config; + std::vector> input_xbar; + std::vector layout; + bool no_vpns = false; // for odd actions with null vpns + // generated by compiler + std::unique_ptr format; + int action_enable = -1; + bool enable_action_data_enable = false; + bool enable_action_instruction_enable = false; + Call action; + Call instruction; + std::unique_ptr actions; + std::unique_ptr action_bus; + std::string default_action; + unsigned default_action_handle = 0; + int default_action_lineno = -1; + typedef std::map default_action_params; + default_action_params default_action_parameters; + bool default_only_action = false; + std::vector hit_next; + std::vector extra_next_lut; // extra entries not in the hit_next from gateway + // currently the assembler will add extra elements to the 8 entry next table lut if they + // are needed for a gateway and not present in the lut already. We add these in a separate + // vector from hit_next so that context.json only reports the original hit_next from the source + // and we don't try to get a next table hit index from the action. 
+ NextTables miss_next; + std::map long_branch; + int long_branch_input = -1; + std::map
> pred; // predecessor tables w the actions in + // that table that call this table + std::vector hash_dist; + p4_params p4_params_list; + std::unique_ptr context_json; + // saved here in to extract into the context json + unsigned next_table_adr_mask = 0U; + bitvec reachable_tables_; + + static std::map *all; + static std::vector
*by_uid; + + unsigned layout_size() const { + unsigned rv = 0; + for (auto &row : layout) rv += row.memunits.size(); + return rv; + } + unsigned layout_get_vpn(const MemUnit &m) const { + for (auto &row : layout) { + if (row.row != m.row) continue; + auto u = find(row.memunits.begin(), row.memunits.end(), m); + if (u == row.memunits.end()) continue; + return row.vpns.at(u - row.memunits.begin()); + } + BUG(); + return 0; + } + void layout_vpn_bounds(int &min, int &max, bool spare = false) const { + min = 1000000; + max = -1; + for (const Layout &row : layout) + for (const auto v : row.vpns) { + if (v < min) min = v; + if (v > max) max = v; + } + if (spare && max > min) --max; + } + virtual Format::Field *lookup_field(const std::string &n, const std::string &act = "") const { + return format ? format->field(n) : 0; + } + virtual std::string find_field(Format::Field *field) { + return format ? format->find_field(field) : ""; + } + virtual int find_field_lineno(Format::Field *field) { + return format ? 
format->find_field_lineno(field) : -1; + } + virtual void apply_to_field(const std::string &n, std::function fn) { + if (format) format->apply_to_field(n, fn); + } + int find_on_ixbar(Phv::Slice sl, InputXbar::Group group, InputXbar::Group *found = nullptr); + int find_on_ixbar(Phv::Slice sl, int group) { + return find_on_ixbar(sl, InputXbar::Group(InputXbar::Group::EXACT, group)); + } + virtual HashDistribution *find_hash_dist(int unit); + virtual int find_on_actionbus(const ActionBusSource &src, int lo, int hi, int size, + int pos = -1); + virtual void need_on_actionbus(const ActionBusSource &src, int lo, int hi, int size); + virtual int find_on_actionbus(const char *n, TableOutputModifier mod, int lo, int hi, int size, + int *len = 0); + int find_on_actionbus(const char *n, int lo, int hi, int size, int *len = 0) { + return find_on_actionbus(n, TableOutputModifier::NONE, lo, hi, size, len); + } + int find_on_actionbus(const std::string &n, TableOutputModifier mod, int lo, int hi, int size, + int *len = 0) { + return find_on_actionbus(n.c_str(), mod, lo, hi, size, len); + } + int find_on_actionbus(const std::string &n, int lo, int hi, int size, int *len = 0) { + return find_on_actionbus(n.c_str(), TableOutputModifier::NONE, lo, hi, size, len); + } + virtual void need_on_actionbus(Table *att, TableOutputModifier mod, int lo, int hi, int size); + static bool allow_bus_sharing(Table *t1, Table *t2); + virtual Call &action_call() { return action; } + virtual Call &instruction_call() { return instruction; } + virtual Actions *get_actions() const { return actions.get(); } + virtual const std::vector &get_hit_next() const { return hit_next; } + virtual const NextTables &get_miss_next() const { return miss_next; } + virtual bool is_directly_referenced(const Table::Call &c) const; + virtual void add_reference_table(json::vector &table_refs, const Table::Call &c) const; + json::map &add_pack_format(json::map &stage_tbl, int memword, int words, + int entries = -1) const; + 
json::map &add_pack_format(json::map &stage_tbl, Table::Format *format, bool pad_zeros = true, + bool print_fields = true, + Table::Actions::Action *act = nullptr) const; + virtual void add_field_to_pack_format(json::vector &field_list, unsigned basebit, + std::string name, const Table::Format::Field &field, + const Table::Actions::Action *act) const; + virtual bool validate_call(Table::Call &call, MatchTable *self, size_t required_args, + int hash_dist_type, Table::Call &first_call) { + BUG(); + return false; + } + bool validate_instruction(Table::Call &call) const; + // const std::vector &); + // Generate the context json for a field into field list. + // Use the bits specified in field, offset by the base bit. + // If the field is a constant, output a const_tuple map, including the specified value. + void output_field_to_pack_format(json::vector &field_list, unsigned basebit, std::string name, + std::string source, unsigned start_bit, + const Table::Format::Field &field, unsigned value = 0) const; + void add_zero_padding_fields(Table::Format *format, Table::Actions::Action *act = nullptr, + unsigned format_width = 64) const; + void get_cjson_source(const std::string &field_name, std::string &source, int &start_bit) const; + // Result physical buses should be setup for + // Exact/Hash/MatchwithNoKey/ATCAM/Ternary tables + virtual void add_result_physical_buses(json::map &stage_tbl) const; + virtual void merge_context_json(json::map &tbl, json::map &stage_tbl) const; + void canon_field_list(json::vector &field_list) const; + void for_all_next(std::function fn); + void check_next(const Ref &next); + void check_next(NextTables &next); + void check_next(); + virtual void set_pred(); + /* find the predecessors in the given stage that must run iff this table runs. + * includes `this` if it is in the stage. The values are the set of actions that + * (lead to) triggering this table, or empty if any action might */ + std::map
> find_pred_in_stage( + int stageno, const std::set &acts = std::set()); + + bool choose_logical_id(const slist
*work = nullptr); + virtual int hit_next_size() const { return hit_next.size(); } + virtual int get_tcam_id() const { BUG("%s not a TCAM table", name()); } + + const std::vector find_p4_params(std::string s, std::string t = "", + unsigned start_bit = -1, + int width = -1) const { + remove_name_tail_range(s); + std::vector params; + if (start_bit <= -1) return params; + if (width <= -1) return params; + int end_bit = start_bit + width; + for (auto &p : p4_params_list) { + if ((p.name == s) || (p.alias == s)) { + int p_end_bit = p.start_bit + p.bit_width; + if (!t.empty() && (p.type != t)) continue; + if (p.start_bit > start_bit) continue; + if (p_end_bit < end_bit) continue; + params.push_back(&p); + } + } + return params; + } + + const p4_param *find_p4_param(std::string s, std::string t = "", unsigned start_bit = -1, + int width = -1) const { + remove_name_tail_range(s); + std::vector params; + for (auto &p : p4_params_list) { + if ((p.name == s) || (p.alias == s)) { + if (!t.empty() && (p.type != t)) continue; + if ((start_bit > -1) && (start_bit < p.start_bit)) continue; + if ((width > -1) && (p.start_bit + p.bit_width < start_bit + width)) continue; + return &p; + } + } + return nullptr; + } + + const p4_param *find_p4_param_type(std::string &s) const { + for (auto &p : p4_params_list) + if (p.type == s) return &p; + return nullptr; + } + virtual std::string get_default_action() { + return (!default_action.empty()) ? default_action : action ? action->default_action : ""; + } + virtual default_action_params *get_default_action_parameters() { + return (!default_action_parameters.empty()) ? &default_action_parameters + : action ? &action->default_action_parameters + : nullptr; + } + virtual unsigned get_default_action_handle() const { + return default_action_handle > 0 ? default_action_handle + : action ? 
action->default_action_handle + : 0; + } + int get_format_field_size(std::string s) const { + if (auto field = lookup_field(s)) return field->size; + return 0; + } + virtual bool needs_handle() const { return false; } + virtual bool needs_next() const { return false; } + virtual bitvec compute_reachable_tables(); + bitvec reachable_tables() { + if (!reachable_tables_) reachable_tables_ = compute_reachable_tables(); + return reachable_tables_; + } + std::string loc() const; +}; + +std::ostream &operator<<(std::ostream &, const Table::Layout &); +std::ostream &operator<<(std::ostream &, const Table::Layout::bus_type_t); + +class FakeTable : public Table { + public: + explicit FakeTable(const char *name) : Table(-1, name, INGRESS, 0, -1) {} + void setup(VECTOR(pair_t) & data) override { assert(0); } + void pass1() override { assert(0); } + void pass2() override { assert(0); } + void pass3() override { assert(0); } + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_regs, (mau_regs &), override { assert(0); }) + void gen_tbl_cfg(json::vector &out) const override { assert(0); } +}; + +class AlwaysRunTable : public Table { + /* a 'table' to hold the always run action in a stage */ + public: + AlwaysRunTable(gress_t gress, Stage *stage, pair_t &init); + void setup(VECTOR(pair_t) & data) override { assert(0); } + void pass1() override { actions->pass1(this); } + void pass2() override { actions->pass2(this); } + void pass3() override {} + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_regs, (mau_regs & regs), override) + void gen_tbl_cfg(json::vector &out) const override {} +}; + +struct AttachedTables { + Table::Call selector; + Table::Call selector_length; + std::vector stats, meters, statefuls; + Table::Call meter_color; + SelectionTable *get_selector() const; + MeterTable *get_meter(std::string name = "") const; + StatefulTable *get_stateful(std::string name = "") const; + Table::Format::Field *find_address_field(const AttachedTable *tbl) const; + const Table::Call 
*get_call(const Table *) const; + bool is_attached(const Table *tbl) const { return get_call(tbl) != nullptr; } + void pass0(MatchTable *self); + void pass1(MatchTable *self); + template + void write_merge_regs(REGS ®s, MatchTable *self, int type, int bus); + template + void write_tcam_merge_regs(REGS ®s, MatchTable *self, int bus, int tcam_shift); + bool run_at_eop(); + bitvec compute_reachable_tables() const; +}; + +#define DECLARE_ABSTRACT_TABLE_TYPE(TYPE, PARENT, ...) \ + class TYPE : public PARENT { \ + protected: \ + TYPE(int l, const char *n, gress_t g, Stage *s, int lid) : PARENT(l, n, g, s, lid) {} \ + __VA_ARGS__ \ + }; + +DECLARE_ABSTRACT_TABLE_TYPE( + MatchTable, Table, GatewayTable *gateway = 0; IdletimeTable *idletime = 0; + AttachedTables attached; bool always_run = false; friend struct AttachedTables; + enum {NONE = 0, TABLE_MISS = 1, TABLE_HIT = 2, DISABLED = 3, GATEWAY_MISS = 4, GATEWAY_HIT = 5, + GATEWAY_INHIBIT = 6} table_counter = NONE; + + using Table::pass1; using Table::write_regs; + template void write_common_regs(typename TARGET::mau_regs &, int, Table *); + template void write_regs(REGS &, int type, Table *result); + template void write_next_table_regs(REGS &, Table *); + void common_init_setup(const VECTOR(pair_t) &, bool, P4Table::type) override; + bool common_setup(pair_t &, const VECTOR(pair_t) &, P4Table::type) override; + int get_address_mau_actiondata_adr_default(unsigned log2size, bool per_flow_enable); public + : bool is_always_run() const override { return always_run; } void pass0() override; + void pass1() override; void pass3() override; bool is_alpm() const { + if (p4_table) { + return p4_table->is_alpm(); + } + return false; + } bool is_attached(const Table *tbl) const override; + const Table::Call *get_call(const Table *tbl) const { + return get_attached()->get_call(tbl); + } const AttachedTables *get_attached() const override { return &attached; } std::vector + get_calls() const override; + AttachedTables * get_attached() 
override { return &attached; } Format * + get_format() const override; + const GatewayTable *get_gateway() + const override { return gateway; } const MatchTable *get_match_table() const override { + return this; + } MatchTable *get_match_table() override { return this; } std::set + get_match_tables() override { + std::set rv; + rv.insert(this); + return rv; + } Format::Field *find_address_field(const AttachedTable *tbl) const override { + return attached.find_address_field(tbl); + } Format::Field *lookup_field(const std::string &n, const std::string &act = "") + const override; + bool run_at_eop() override { return attached.run_at_eop(); } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + virtual bool is_ternary() { return false; } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + void gen_idletime_tbl_cfg(json::map &stage_tbl) const; + int direct_shiftcount() const override { + return 64; + } void gen_hash_bits(const std::map &hash_table, InputXbar::HashTable ht_id, + json::vector &hash_bits, unsigned hash_group_no, bitvec hash_bits_used) + const; + virtual void add_hash_functions(json::map &stage_tbl) const; + void add_all_reference_tables(json::map &tbl, Table *math_table = nullptr) const; + METER_ACCESS_TYPE default_meter_access_type(bool for_stateful); + bool needs_handle() const override { return true; } bool needs_next() + const override { return true; } bitvec compute_reachable_tables() override;) + +#define DECLARE_TABLE_TYPE(TYPE, PARENT, NAME, ...) 
\ + class TYPE : public PARENT { /* NOLINT */ \ + static struct Type : public Table::Type { \ + Type() : Table::Type(NAME) {} \ + TYPE *create(int lineno, const char *name, gress_t gress, Stage *stage, int lid, \ + VECTOR(pair_t) & data); \ + } table_type_singleton; \ + friend struct Type; \ + \ + protected: \ + TYPE(int l, const char *n, gress_t g, Stage *s, int lid) : PARENT(l, n, g, s, lid) {} \ + void setup(VECTOR(pair_t) & data) override; \ + \ + public: \ + void pass1() override; \ + void pass2() override; \ + void pass3() override; \ + /* gcc gets confused by overloading this template with the virtual \ + * functions if we try to specialize the templates, so we mangle \ + * the name with a _vt extension to help it out. */ \ + template \ + void write_regs_vt(REGS ®s); \ + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_regs, (mau_regs & regs), override) \ + void gen_tbl_cfg(json::vector &out) const override; \ + \ + private: \ + __VA_ARGS__ \ + }; + +#define DEFINE_TABLE_TYPE(TYPE) \ + TYPE::Type TYPE::table_type_singleton; \ + TYPE *TYPE::Type::create(int lineno, const char *name, gress_t gress, Stage *stage, int lid, \ + VECTOR(pair_t) & data) { \ + TYPE *rv = new TYPE(lineno, name, gress, stage, lid); \ + rv->setup(data); \ + return rv; \ + } \ + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void TYPE::write_regs, (mau_regs & regs), \ + { write_regs_vt(regs); }) + +/* Used to create a subclass for a table type */ +#define DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(TYPE, KIND) \ + TYPE::Type TYPE::table_type_singleton; \ + TYPE *TYPE::Type::create(int lineno, const char *name, gress_t gress, Stage *stage, int lid, \ + VECTOR(pair_t) & data) { \ + SWITCH_FOREACH_##KIND(options.target, \ + auto *rv = new TARGET::TYPE(lineno, name, gress, stage, lid); \ + rv->setup(data); return rv;) \ + } \ + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void TYPE::write_regs, (mau_regs & regs), \ + { write_regs_vt(regs); }) + +DECLARE_ABSTRACT_TABLE_TYPE(SRamMatchTable, MatchTable, // exact, 
atcam, or proxy_hash + + // NOLINTNEXTLINE (whitespace/indent) + public: + struct Ram : public MemUnit { + using MemUnit::MemUnit; + Ram(const MemUnit &m) : MemUnit(m) {} + Ram(MemUnit &&m) : MemUnit(std::move(m)) {} + bool isLamb() const { return stage == INT_MIN && row == -1; } + const char *desc() const; // Short lived temp for messages + }; + struct Way { + int lineno; + int group_xme; // hash group or xme + int index; // first bit of index + int index_hi = -1; // top bit (if set) for sanity checking + int subword_bits; + bitvec select; + std::vector rams; + bool isLamb() const { + BUG_CHECK(!rams.empty(), "no rams in way"); + return rams.at(0).isLamb(); } + bitvec select_bits() const { + bitvec rv = select; + rv.setrange(index, (isLamb() ? LAMB_DEPTH_BITS : SRAM_DEPTH_BITS) + subword_bits); + return rv; + } + }; + + // NOLINTNEXTLINE (whitespace/indent) + protected: + std::vector ways; + struct WayRam { int way, index, word, bank; }; + std::map way_map; + std::vector match; + std::map match_by_bit; + std::vector> match_in_word; + std::vector word_ixbar_group; + struct GroupInfo { + /* info about which word(s) are used per format group with wide matches */ + int overhead_word; /* which word of wide match contains overhead */ + int overhead_bit; /* lowest bit that contains overhead in that word */ + // The word that is going to contain the result bus. 
Same as the overhead word, if + // the entry actually has overhead + int result_bus_word; + std::map match_group; /* which match group for each word with match */ + std::vector tofino_mask; /* 14-bit tofino byte/nibble mask for each word */ + int vpn_offset; /* which vpn to use for this group */ + GroupInfo() : overhead_word(-1), overhead_bit(-1), result_bus_word(-1), vpn_offset(-1) {} + // important function in order to determine shiftcount for exact match entries + int result_bus_word_group() const { return match_group.at(result_bus_word); } + }; // NOLINT + std::vector group_info; + std::vector> word_info; // which format group corresponds to each + // match group in each word + int mgm_lineno = -1; // match_group_map lineno + friend class GatewayTable; // Gateway needs to examine word group details for compat + friend class Target::Tofino::GatewayTable; + bitvec version_nibble_mask; + // Which hash groups are assigned to the hash_function_number in the hash_function json node + // This is to coordinate with the hash_function_id in the ways + std::map hash_fn_ids; + + // helper function only used/instantiated on tofino1/2 + template + void write_attached_merge_regs(REGS ®s, int bus, int word, int word_group); + + bool parse_ram(const value_t &, std::vector &); + bool parse_way(const value_t &); + void common_sram_setup(pair_t &, const VECTOR(pair_t) &); + void common_sram_checks(); + void alloc_global_busses() override; + void alloc_vpns() override; + int find_problematic_vpn_offset() const; + virtual void setup_ways(); + void setup_hash_function_ids(); + void pass1() override; + template void write_regs_vt(REGS ®s); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + void write_regs, (mau_regs ®s), override ) + virtual std::string get_match_mode(const Phv::Ref &pref, int offset) const; + json::map* add_common_sram_tbl_cfgs(json::map &tbl, + std::string match_type, std::string stage_table_type) const; + void add_action_cfgs(json::map &tbl, json::map &stage_tbl) const; + 
virtual unsigned entry_ram_depth() const { return 1024; } + unsigned get_number_entries() const; + unsigned get_format_width() const; + virtual int determine_pre_byteswizzle_loc(MatchSource *ms, int lo, int hi, int word); + void add_field_to_pack_format(json::vector &field_list, unsigned basebit, std::string name, + const Table::Format::Field &field, + const Table::Actions::Action *act) const override; + std::unique_ptr gen_memory_resource_allocation_tbl_cfg(const Way &) const; + Actions *get_actions() const override { + return actions ? actions.get() : (action ? action->actions.get() : nullptr); + } + void add_hash_functions(json::map &stage_tbl) const override; + virtual void gen_ghost_bits(int hash_function_number, json::vector &ghost_bits_to_hash_bits, + json::vector &ghost_bits_info) const { } + virtual void no_overhead_determine_result_bus_usage(); + + // NOLINTNEXTLINE (whitespace/indent) + public: + Format::Field *lookup_field(const std::string &n, const std::string &act = "") const override; + OVERLOAD_FUNC_FOREACH(TARGET_CLASS, virtual void, setup_word_ixbar_group, (), ()) + OVERLOAD_FUNC_FOREACH(TARGET_CLASS, virtual void, verify_format, (), ()) + OVERLOAD_FUNC_FOREACH(TARGET_CLASS, virtual void, verify_format_pass2, (), ()) + virtual bool verify_match_key(); + void verify_match(unsigned fmt_width); + void vpn_params(int &width, int &depth, int &period, const char *&period_name) const override { + width = (format->size-1)/128 + 1; + period = format->groups(); + depth = period * layout_size() / width; + period_name = "match group size"; } + template void write_merge_regs_vt(REGS ®s, int type, int bus) { + attached.write_merge_regs(regs, this, type, bus); } + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + void write_merge_regs, (mau_regs ®s, int type, int bus), override { + write_merge_regs_vt(regs, type, bus); }) + bool is_match_bit(const std::string name, const int bit) const { + for (auto *m : match) { + std::string m_name = m->name(); + int m_lo = 
remove_name_tail_range(m_name) + m->fieldlobit(); + int m_hi = m_lo + m->size() -1; + if (m_name == name) { + if (m_lo <= bit + && m_hi >= bit) + return true; + } + } + return false; + } + void determine_word_and_result_bus() override; + SelectionTable *get_selector() const override { return attached.get_selector(); } + StatefulTable *get_stateful() const override { return attached.get_stateful(); } + MeterTable* get_meter() const override { return attached.get_meter(); } + const Way *way_for_ram(Ram r) const { + return way_map.count(r) ? &ways[way_map.at(r).way] : nullptr; } + const Way *way_for_xme(int xme) const { + for (auto &way : ways) if (way.group_xme == xme) return &way; + return nullptr; } +) + +DECLARE_TABLE_TYPE( + ExactMatchTable, SRamMatchTable, "exact_match", bool dynamic_key_masks = false; + + // The position of the ghost bits in a single hash function + // The key is name of the field and the field bit, the value is one-hot for all + // bits that this ghost bit has an impact on + using GhostBitPositions = std::map, bitvec>; + std::map ghost_bit_positions; std::unique_ptr stash_format; + std::vector stash_rows; std::vector stash_cols; std::vector stash_units; + std::vector stash_overhead_rows; + + // NOLINTNEXTLINE (whitespace/indent) + public + : int unitram_type() override { return UnitRam::MATCH; } table_type_t table_type() + const override { return EXACT; } bool has_group(int grp) { + for (auto &way : ways) + if (way.group_xme == grp) return true; + return false; + } void determine_ghost_bits(); + void gen_ghost_bits(int hash_function_number, json::vector &ghost_bits_to_hash_bits, + json::vector &ghost_bits_info) const override; + void generate_stash_overhead_rows();) + +DECLARE_TABLE_TYPE( + AlgTcamMatchTable, SRamMatchTable, "atcam_match", + // key is column priority, value is way index + std::map col_priority_way; + int number_partitions = 0; int max_subtrees_per_partition = 0; int bins_per_partition = 0; + int atcam_subset_width = 0; int 
shift_granularity = 0; std::string partition_field_name = ""; + std::vector ixbar_subgroup, ixbar_mask; struct match_element { + Phv::Ref *field; + unsigned offset, width; + }; + bitvec s0q1_nibbles, s1q0_nibbles; std::vector s0q1_prefs, s1q0_prefs; + std::map s0q1, s1q0; table_type_t table_type() + const override { return ATCAM; } void verify_format(Target::Tofino) override; + void verify_entry_priority(); void setup_column_priority(); void find_tcam_match(); + void gen_unit_cfg(json::vector &units, int size) const; + std::unique_ptr gen_memory_resource_allocation_tbl_cfg() const; + void setup_nibble_mask(Table::Format::Field *match, int group, + std::map &elems, bitvec &mask); + std::string get_match_mode(const Phv::Ref &pref, int offset) const override; + void base_alpm_atcam_tbl_cfg(json::map &atcam_tbl, const char *type, int size) const { + if (p4_table) p4_table->base_alpm_tbl_cfg(atcam_tbl, size, this, P4Table::Atcam); + } + // For ATCAM tables, no hash functions are generated for the table, as the current + // interpretation of the table is that the partition index is an identity hash function. + // Potentially this could change at some point + void add_hash_functions(json::map &stage_tbl) + const override {} bool has_directly_attached_synth2port() const; + std::string get_lpm_field_name() const { + std::string lpm = "lpm"; + if (auto *p = find_p4_param_type(lpm)) + return p->key_name.empty() ? 
p->name : p->key_name; + else + error(lineno, "'lpm' type field not found in alpm atcam '%s-%s' p4 param order", name(), + p4_name()); + return ""; + } std::set + get_partition_action_handle() const { + if (p4_table) return p4_table->get_partition_action_handle(); + return {}; + } void no_overhead_determine_result_bus_usage() override; + std::string get_partition_field_name() const { + if (!p4_table) return ""; + auto name = p4_table->get_partition_field_name(); + if (auto *p = find_p4_param(name)) + if (!p->key_name.empty()) return p->key_name; + return name; + } unsigned entry_ram_depth() const override { + return std::min(number_partitions, 1024); + } void gen_alpm_cfg(json::map &) const;) + +DECLARE_TABLE_TYPE( + ProxyHashMatchTable, SRamMatchTable, "proxy_hash", bool dynamic_key_masks = false; + void setup_ways() override; int proxy_hash_group = -1; std::string proxy_hash_alg = ""; + bool verify_match_key() override; table_type_t table_type() + const override { return PROXY_HASH; } void setup_word_ixbar_group() override; + int determine_pre_byteswizzle_loc(MatchSource *ms, int lo, int hi, int word) override; + void add_proxy_hash_function(json::map &stage_tbl) const;) + +DECLARE_TABLE_TYPE(TernaryMatchTable, MatchTable, "ternary_match", + + // NOLINTNEXTLINE (whitespace/indent) + protected: + void vpn_params(int &width, int &depth, int &period, const char *&period_name) const override; + struct Match { + int lineno = -1, word_group = -1, byte_group = -1, byte_config = 0, dirtcam = 0; + Match() {} + explicit Match(const value_t &); + }; + enum range_match_t { TCAM_NORMAL = 0, DIRTCAM_2B = 1, DIRTCAM_4B_LO = 2, + DIRTCAM_4B_HI = 3, NONE = 4 }; + enum byte_config_t { MIDBYTE_NIBBLE_LO = 0, MIDBYTE_NIBBLE_HI = 1 }; + std::vector match; + int match_word(int word_group) const { + for (unsigned i = 0; i < match.size(); i++) + if (match[i].word_group == word_group) + return i; + return -1; } + unsigned chain_rows[TCAM_UNITS_PER_ROW]; /* bitvector per column */ + enum 
{ ALWAYS_ENABLE_ROW = (1<<2) | (1<<5) | (1<<9) }; + friend class TernaryIndirectTable; + + virtual void check_tcam_match_bus(const std::vector &) = 0; + + // NOLINTNEXTLINE (whitespace/indent) + public: + void pass0() override; + int tcam_id = -1; + Table::Ref indirect; + int indirect_bus = -1; /* indirect bus to use if there's no indirect table */ + void alloc_vpns() override; + range_match_t get_dirtcam_mode(int group, int byte) const { + BUG_CHECK(group >= 0); + BUG_CHECK(byte >= 0); + range_match_t dirtcam_mode = NONE; + for (auto &m : match) { + if (m.word_group == group) { + dirtcam_mode = (range_match_t) ((m.dirtcam >> 2*byte) & 0x3); } } + return dirtcam_mode; } + Format::Field *lookup_field(const std::string &name, const std::string &action) const override; + HashDistribution *find_hash_dist(int unit) override { + return indirect ? indirect->find_hash_dist(unit) : Table::find_hash_dist(unit); } + int find_on_actionbus(const ActionBusSource &src, int lo, int hi, int size, + int pos = -1) override { + return indirect ? indirect->find_on_actionbus(src, lo, hi, size, pos) + : Table::find_on_actionbus(src, lo, hi, size, pos); } + void need_on_actionbus(const ActionBusSource &src, int lo, int hi, int size) override { + indirect ? indirect->need_on_actionbus(src, lo, hi, size) + : Table::need_on_actionbus(src, lo, hi, size); } + int find_on_actionbus(const char *n, TableOutputModifier mod, int lo, int hi, + int size, int *len = 0) override { + return indirect ? indirect->find_on_actionbus(n, mod, lo, hi, size, len) + : Table::find_on_actionbus(n, mod, lo, hi, size, len); } + void need_on_actionbus(Table *att, TableOutputModifier mod, int lo, int hi, int size) override { + indirect ? indirect->need_on_actionbus(att, mod, lo, hi, size) + : Table::need_on_actionbus(att, mod, lo, hi, size); } + const Call &get_action() const override { return indirect ? indirect->get_action() : action; } + Actions *get_actions() const override { return actions ? 
actions.get() : + (action ? action->actions.get() : indirect ? indirect->actions ? indirect->actions.get() : + indirect->action ? indirect->action->actions.get() : 0 : 0); } + const AttachedTables *get_attached() const override { + return indirect ? indirect->get_attached() : &attached; } + AttachedTables *get_attached() override { + return indirect ? indirect->get_attached() : &attached; } + SelectionTable *get_selector() const override { + return indirect ? indirect->get_selector() : 0; } + StatefulTable *get_stateful() const override { + return indirect ? indirect->get_stateful() : 0; } + MeterTable* get_meter() const override { + return indirect ? indirect->get_meter() : 0; } + bool is_attached(const Table *tbl) const override { + return indirect ? indirect->is_attached(tbl) : MatchTable::is_attached(tbl); } + Format::Field *find_address_field(const AttachedTable *tbl) const override { + return indirect ? indirect->find_address_field(tbl) : attached.find_address_field(tbl); } + std::unique_ptr gen_memory_resource_allocation_tbl_cfg( + const char *type, const std::vector &layout, + bool skip_spare_bank = false) const override; + json::map &get_tbl_top(json::vector &out) const; + Call &action_call() override { return indirect ? indirect->action : action; } + Call &instruction_call() override { return indirect ? 
indirect->instruction: instruction; } + int json_memunit(const MemUnit &u) const override { + return u.row + u.col*12; } + bool is_ternary() override { return true; } + bool has_indirect() { return indirect; } + int hit_next_size() const override { + if (indirect && indirect->hit_next.size() > 0) + return indirect->hit_next.size(); + return hit_next.size(); } + table_type_t table_type() const override { return TERNARY; } + void gen_entry_cfg(json::vector &out, std::string name, + unsigned lsb_offset, unsigned lsb_idx, unsigned msb_idx, + std::string source, unsigned start_bit, unsigned field_width, + unsigned index, bitvec &tcam_bits, unsigned byte_offset) const; + void gen_entry_cfg2(json::vector &out, std::string field_name, std::string global_name, + unsigned lsb_offset, unsigned lsb_idx, unsigned msb_idx, std::string source, + unsigned start_bit, unsigned field_width, bitvec &tcam_bits) const; + void gen_entry_range_cfg(json::map &entry, bool duplicate, unsigned nibble_offset) const; + void set_partition_action_handle(unsigned handle) { + if (p4_table) p4_table->set_partition_action_handle(handle); } + void set_partition_field_name(std::string name) { + if (p4_table) p4_table->set_partition_field_name(name); } + void base_alpm_pre_classifier_tbl_cfg(json::map &pre_classifier_tbl, + const char *type, int size) const { + if (p4_table) + p4_table->base_alpm_tbl_cfg(pre_classifier_tbl, size, this, P4Table::PreClassifier); + } + virtual void gen_match_fields_pvp(json::vector &match_field_list, unsigned word, + bool uses_versioning, unsigned version_word_group, bitvec &tcam_bits) const; + virtual void gen_match_fields(json::vector &match_field_list, + std::vector &tcam_bits) const; + unsigned get_default_action_handle() const override { + unsigned def_act_handle = Table::get_default_action_handle(); + return def_act_handle > 0 ? def_act_handle : + indirect ? indirect->get_default_action_handle() ? + indirect->get_default_action_handle() : action ? 
+ action->default_action_handle : 0 : 0; + } + std::string get_default_action() override { + std::string def_act = Table::get_default_action(); + return !def_act.empty() ? def_act : indirect ? indirect->default_action : ""; } + Format* get_format() const override { + return indirect ? indirect->get_format() : MatchTable::get_format(); } + template void write_merge_regs_vt(REGS ®s, int type, int bus) { + attached.write_merge_regs(regs, this, type, bus); } + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + void write_merge_regs, (mau_regs ®s, int type, int bus), override { + write_merge_regs_vt(regs, type, bus); }) + void add_result_physical_buses(json::map &stage_tbl) const override; + default_action_params* get_default_action_parameters() override { + if (!default_action_parameters.empty()) return &default_action_parameters; + auto def_action_params = indirect ? indirect->get_default_action_parameters() : nullptr; + return def_action_params; } + bitvec compute_reachable_tables() override; + int get_tcam_id() const override { return tcam_id; } + virtual void setup_indirect(const value_t &v) { + if (CHECKTYPE(v, tSTR)) + indirect = v; } + + // NOLINTNEXTLINE (whitespace/indent) + private: + template void tcam_table_map(REGS ®s, int row, int col); +) + +DECLARE_TABLE_TYPE( + Phase0MatchTable, MatchTable, "phase0_match", int size = MAX_PORTS; int width = 1; + int constant_value = 0; table_type_t table_type() const override { return PHASE0; } + // Phase0 Tables are not actual tables. 
They cannot have action data + // or attached tables and do not need a logical id assignment, hence + // we skip pass0 + void pass0() override {} void set_pred() override { return; } bool needs_next() const override { + return false; + } int ram_word_width() const override { return Target::PHASE0_FORMAT_WIDTH(); }) +DECLARE_TABLE_TYPE( + HashActionTable, MatchTable, "hash_action", public + : + // int row = -1, bus = -1; + table_type_t table_type() const override { return HASH_ACTION; } template + void write_merge_regs_vt(REGS ®s, int type, int bus); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_merge_regs, + (mau_regs & regs, int type, int bus), override) Format::Field * + lookup_field(const std::string &n, const std::string &act = "") const override; + void add_hash_functions(json::map &stage_tbl) const override; + void determine_word_and_result_bus() override; + Layout::bus_type_t default_bus_type() const override { return Layout::RESULT_BUS; }) + +DECLARE_TABLE_TYPE(TernaryIndirectTable, Table, "ternary_indirect", + + // NOLINTNEXTLINE (whitespace/indent) + protected: + TernaryMatchTable *match_table = nullptr; + AttachedTables attached; + table_type_t table_type() const override { return TERNARY_INDIRECT; } + table_type_t set_match_table(MatchTable *m, bool indirect) override; + void vpn_params(int &width, int &depth, int &period, const char *&period_name) const override { + width = (format->size-1)/128 + 1; + depth = layout_size() / width; + period = 1; + period_name = 0; } + Actions *get_actions() const override { + return actions ? actions.get() : (match_table ? 
match_table->actions.get() : nullptr); + } + const AttachedTables *get_attached() const override { return &attached; } + AttachedTables *get_attached() override { return &attached; } + const GatewayTable *get_gateway() const override { return match_table->get_gateway(); } + const MatchTable *get_match_table() const override { return match_table; } + std::set get_match_tables() override { + std::set rv; + if (match_table) rv.insert(match_table); + return rv; } + SelectionTable *get_selector() const override { return attached.get_selector(); } + StatefulTable *get_stateful() const override { return attached.get_stateful(); } + MeterTable* get_meter() const override { return attached.get_meter(); } + bool is_attached(const Table *tbl) const override { return attached.is_attached(tbl); } + Format::Field *find_address_field(const AttachedTable *tbl) const override { + return attached.find_address_field(tbl); } + template void write_merge_regs_vt(REGS ®s, int type, int bus) { + attached.write_merge_regs(regs, match_table, type, bus); } + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + void write_merge_regs, (mau_regs ®s, int type, int bus), override { + write_merge_regs_vt(regs, type, bus); }) + int unitram_type() override { return UnitRam::TERNARY_INDIRECTION; } + + // NOLINTNEXTLINE (whitespace/indent) + public: + Format::Field *lookup_field(const std::string &n, + const std::string &act = "") const override; + MatchTable *get_match_table() override { return match_table; } + const std::vector &get_hit_next() const override { + if (hit_next.empty() && match_table) + return match_table->get_hit_next(); + return Table::get_hit_next(); } + const NextTables &get_miss_next() const override { + if (!miss_next.set() && match_table) + return match_table->get_miss_next(); + return Table::get_miss_next(); } + int address_shift() const override { return std::min(5U, format->log2size - 2); } + unsigned get_default_action_handle() const override { + unsigned def_act_handle = 
Table::get_default_action_handle(); + return def_act_handle ? def_act_handle : action ? action->default_action_handle : 0; } + bool needs_handle() const override { return true; } + bool needs_next() const override { return true; } + void determine_word_and_result_bus() override; + bitvec compute_reachable_tables() override; + int get_tcam_id() const override { return match_table->tcam_id; } + Layout::bus_type_t default_bus_type() const override { return Layout::TIND_BUS; } +) + +DECLARE_ABSTRACT_TABLE_TYPE( + AttachedTable, Table, + /* table that can be attached to multiple match tables to do something */ + std::set match_tables; + bool direct = false, indirect = false; bool per_flow_enable = false; + std::string per_flow_enable_param = ""; + virtual unsigned per_flow_enable_bit(MatchTable *m = nullptr) const; + table_type_t set_match_table(MatchTable * m, bool indirect) override { + if ((indirect && direct) || (!indirect && this->indirect)) + error(lineno, "Table %s is accessed with direct and indirect indices", name()); + this->indirect = indirect; + direct = !indirect; + match_tables.insert(m); + if ((unsigned)m->logical_id < (unsigned)logical_id) logical_id = m->logical_id; + return table_type(); + } const GatewayTable *get_gateway() const override { + return match_tables.size() == 1 ? (*match_tables.begin())->get_gateway() : 0; + } SelectionTable *get_selector() const override; + StatefulTable * get_stateful() const override; MeterTable * get_meter() const override; + Call & + action_call() override { + return match_tables.size() == 1 ? (*match_tables.begin())->action_call() : action; + } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + int json_memunit(const MemUnit &u) const override; + void pass1() override; + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. 
+ virtual unsigned get_alu_index() const { + if (layout.size() > 0) return layout[0].row / 4U; + error(lineno, "Cannot determine ALU Index for table %s", name()); + return 0; + } unsigned determine_meter_shiftcount(Table::Call &call, int group, int word, int tcam_shift) + const; + void determine_meter_merge_regs(MatchTable *match, int type, int bus, + const std::vector &arg, + METER_ACCESS_TYPE default_type, unsigned &adr_mask, + unsigned &per_entry_mux_ctl, unsigned &adr_default, + unsigned &meter_type_position); + + // NOLINTNEXTLINE (whitespace/indent) + protected + : + // Accessed by Meter/Selection/Stateful Tables as "meter_alu_index" + // Accessed by Statistics (Counter) Tables as "stats_alu_index" + void add_alu_index(json::map &stage_tbl, std::string alu_index) const; + + // NOLINTNEXTLINE (whitespace/indent) + public + : + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + const MatchTable *get_match_table() + const override { return match_tables.size() == 1 ? *match_tables.begin() : 0; } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + MatchTable *get_match_table() + override { return match_tables.size() == 1 ? *match_tables.begin() : 0; } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + std::set + get_match_tables() override { return match_tables; } bool has_per_flow_enable() + const { return per_flow_enable; } std::string get_per_flow_enable_param() { + return per_flow_enable_param; + } Format::Field *get_per_flow_enable_param(MatchTable *m) const override { + return per_flow_enable ? 
m->lookup_field(per_flow_enable_param) : nullptr; + } Format::Field *get_meter_address_param(MatchTable *m) const override { + std::string pfe_name = + per_flow_enable_param.substr(0, per_flow_enable_param.find("_pfe")); + return per_flow_enable ? m->lookup_field(pfe_name + "_addr") : nullptr; + } Format::Field *get_meter_type_param(MatchTable *m) const override { + std::string pfe_name = + per_flow_enable_param.substr(0, per_flow_enable_param.find("_pfe")); + return per_flow_enable ? m->lookup_field(pfe_name + "_type") : nullptr; + } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + bool get_per_flow_enable() { return per_flow_enable; } bool is_direct() const { return direct; } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + virtual int default_pfe_adjust() const { return 0; } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + std::string get_default_action() override { + if (!default_action.empty()) return default_action; + for (auto m : match_tables) { + std::string def_action = m->get_default_action(); + if (!def_action.empty()) return def_action; + } + return ""; + } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + default_action_params *get_default_action_parameters() override { + if (!default_action_parameters.empty()) return &default_action_parameters; + for (auto m : match_tables) { + if (auto def_action_params = m->get_default_action_parameters()) + if (!def_action_params->empty()) return def_action_params; + } + return nullptr; + } bool validate_call(Table::Call &call, MatchTable *self, size_t required_args, + int hash_dist_type, Table::Call &first_call) override; + // used by Selection and Stateful tables. 
+ FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, int meter_alu_fifo_enable_from_mask, + (mau_regs &, unsigned bytemask))) + +DECLARE_TABLE_TYPE( + ActionTable, AttachedTable, "action", protected + : int action_id = -1; + std::map home_rows_per_word; int home_lineno = -1; + std::map> action_formats; + std::map pack_actions; + static const std::map> action_data_address_huffman_encoding; + void vpn_params(int &width, int &depth, int &period, const char *&period_name) const override; + int get_start_vpn() override; std::string find_field(Format::Field * field) override; + int find_field_lineno(Format::Field *field) override; + Format::Field * lookup_field(const std::string &name, const std::string &action) const override; + void apply_to_field(const std::string &n, std::function fn) override; + int find_on_actionbus(const char *n, TableOutputModifier mod, int lo, int hi, int size, + int *len) override; + int find_on_actionbus(const ActionBusSource &src, int lo, int hi, int size, int pos = -1) + override; + void need_on_actionbus(const ActionBusSource &src, int lo, int hi, int size) override; + void need_on_actionbus(Table *att, TableOutputModifier mod, int lo, int hi, int size) override; + table_type_t table_type() const override { return ACTION; } int unitram_type() + override { return UnitRam::ACTION; } void pad_format_fields(); + unsigned get_do_care_count(std::string bstring); + unsigned get_lower_huffman_encoding_bits(unsigned width); public + : const std::map> &get_action_formats() + const { return action_formats; } unsigned get_size() const { + unsigned size = 0; + if (format) size = format->size; + for (auto &f : get_action_formats()) { + unsigned fsize = f.second->size; + if (fsize > size) size = fsize; + } + return size; + } unsigned get_log2size() const { + unsigned size = get_size(); + return ceil_log2(size); + } unsigned determine_shiftcount(Table::Call &call, int group, unsigned word, int tcam_shift) + const override; + unsigned determine_default(Table::Call 
&call) const; + unsigned determine_mask(Table::Call &call) const; + unsigned determine_vpn_shiftcount(Table::Call &call) const; bool needs_handle() + const override { return true; } bool needs_next() const override { return true; }) + +DECLARE_TABLE_TYPE(GatewayTable, Table, "gateway", + + // NOLINTNEXTLINE (whitespace/indent) + protected: + MatchTable *match_table = 0; + uint64_t payload = -1; + int have_payload = -1; + std::vector payload_map; + int match_address = -1; + int gw_unit = -1; + int payload_unit = -1; + enum range_match_t { NONE, DC_2BIT, DC_4BIT } + range_match = NONE; + std::string gateway_name; + std::string gateway_cond; + bool always_run = false; // only for standalone + + // NOLINTNEXTLINE (whitespace/indent) + public: + struct MatchKey { + int offset; + Phv::Ref val; + bool valid; /* implicit valid bit for tofino1 only */ + MatchKey(gress_t gr, int stg, value_t &v) : + offset(-1), val(gr, stg, v), valid(false) {} + MatchKey(int off, gress_t gr, int stg, value_t &v) : + offset(off), val(gr, stg, v), valid(false) {} + // tofino1 only: phv has an implicit valid bit that can be matched in + // gateway or ternary table. + MatchKey(int off, gress_t gr, int stg, value_t &v, bool vld) : + offset(off), val(gr, stg, v), valid(vld) {} + bool operator<(const MatchKey &a) const { return offset < a.offset; } + }; + + // NOLINTNEXTLINE (whitespace/indent) + protected: + std::vector match, xor_match; + struct Match { + int lineno = 0; + uint16_t range[6] = { 0, 0, 0, 0, 0, 0 }; + wmatch_t val; + bool run_table = false; + NextTables next; + std::string action; // FIXME -- need arguments? 
+ int next_map_lut = -1; + Match() {} + Match(value_t *v, value_t &data, range_match_t range_match); + } miss, cond_true, cond_false; + std::vector table; + bool need_next_map_lut = false; + template void payload_write_regs(REGS &, int row, int type, int bus); + template void standalone_write_regs(REGS ®s); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + virtual void write_next_table_regs, (mau_regs &), { BUG(); }) + bool gateway_needs_ixbar_group() { + for (auto& m : match) + if (m.offset < 32) + return true; + return !xor_match.empty(); } + + // NOLINTNEXTLINE (whitespace/indent) + public: + table_type_t table_type() const override { return GATEWAY; } + virtual int find_next_lut_entry(Table *tbl, const Match &match); + const MatchTable *get_match_table() const override { return match_table; } + MatchTable *get_match_table() override { return match_table; } + std::set get_match_tables() override { + std::set rv; + if (match_table) rv.insert(match_table); + return rv; } + table_type_t set_match_table(MatchTable *m, bool indirect) override { + match_table = m; + if ((unsigned)m->logical_id < (unsigned)logical_id) logical_id = m->logical_id; + return GATEWAY; } + virtual void setup_map_indexing(Table *tbl) { return; } + static GatewayTable *create(int lineno, const std::string &name, gress_t gress, + Stage *stage, int lid, VECTOR(pair_t) &data) + { return table_type_singleton.create(lineno, name.c_str(), gress, stage, lid, data); } + const GatewayTable *get_gateway() const override { return this; } + AttachedTables *get_attached() const override { + return match_table ? match_table->get_attached() : 0; } + SelectionTable *get_selector() const override { + return match_table ? match_table->get_selector() : 0; } + StatefulTable *get_stateful() const override { + return match_table ? match_table->get_stateful() : 0; } + MeterTable *get_meter() const override { + return match_table ? 
match_table->get_meter() : 0; } + bool empty_match() const { return match.empty() && xor_match.empty(); } + unsigned input_use() const; + bool needs_handle() const override { return true; } + bool needs_next() const override { return true; } + bool is_branch() const; // Tofino2 needs is_a_brnch set to use next_table + void verify_format(); + bool is_always_run() const override { return always_run; } + virtual bool check_match_key(MatchKey &, const std::vector &, bool); + virtual int gw_memory_unit() const = 0; +) + +DECLARE_TABLE_TYPE( + SelectionTable, AttachedTable, "selection", + bool non_linear_hash = false, /* == enable_sps_scrambling */ + resilient_hash = false; /* false is fair hash */ + int mode_lineno = -1, param = -1; std::vector pool_sizes; + int min_words = -1, max_words = -1; int selection_hash = -1; public + : StatefulTable *bound_stateful = nullptr; + table_type_t table_type() + const override { return SELECTION; } void vpn_params(int &width, int &depth, int &period, + const char *&period_name) + const override { + width = period = 1; + depth = layout_size(); + period_name = 0; + } + + template + void write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args); + template void setup_logical_alu_map(REGS ®s, int logical_id, int alu); + template void setup_physical_alu_map(REGS ®s, int type, int bus, int alu); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + override) int address_shift() + const override { return 7; } std::vector + determine_spare_bank_memory_units() const override; + unsigned meter_group() const { return layout.at(0).row / 4U; } int home_row() const override { + return layout.at(0).row | 3; + } int unitram_type() override { return UnitRam::SELECTOR; } StatefulTable *get_stateful() + const override { + return bound_stateful; + } unsigned determine_shiftcount(Table::Call &call, int group, unsigned 
word, int tcam_shift) + const override; + void set_stateful(StatefulTable *s) override { + bound_stateful = s; + } unsigned per_flow_enable_bit(MatchTable *m = nullptr) const override; + int indirect_shiftcount() const override; + unsigned determine_length_shiftcount(const Table::Call &call, int group, int word) const; + unsigned determine_length_mask(const Table::Call &call) const; + unsigned determine_length_default(const Table::Call &call) const; + bool validate_length_call(const Table::Call &call);) + +class IdletimeTable : public Table { + MatchTable *match_table = 0; + int sweep_interval = 7, precision = 3; + bool disable_notification = false; + bool two_way_notification = false; + bool per_flow_enable = false; + + IdletimeTable(int lineno, const char *name, gress_t gress, Stage *stage, int lid) + : Table(lineno, name, gress, stage, lid) {} + void setup(VECTOR(pair_t) & data) override; + + public: + table_type_t table_type() const override { return IDLETIME; } + table_type_t set_match_table(MatchTable *m, bool indirect) override { + match_table = m; + if ((unsigned)m->logical_id < (unsigned)logical_id) logical_id = m->logical_id; + return IDLETIME; + } + void vpn_params(int &width, int &depth, int &period, const char *&period_name) const override { + width = period = 1; + depth = layout_size(); + period_name = 0; + } + int json_memunit(const MemUnit &u) const override; + int precision_shift() const; + int direct_shiftcount() const override; + void pass1() override; + void pass2() override; + void pass3() override; + template + void write_merge_regs_vt(REGS ®s, int type, int bus); + template + void write_regs_vt(REGS ®s); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_regs, (mau_regs & regs), override) + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_merge_regs, + (mau_regs & regs, int type, int bus), override) + void gen_tbl_cfg(json::vector &out) const override { /* nothing at top level */ } + void gen_stage_tbl_cfg(json::map &out) const; + static 
IdletimeTable *create(int lineno, const std::string &name, gress_t gress, Stage *stage, + int lid, VECTOR(pair_t) & data) { + IdletimeTable *rv = new IdletimeTable(lineno, name.c_str(), gress, stage, lid); + rv->setup(data); + return rv; + } + bool needs_handle() const override { return true; } + bool needs_next() const override { return true; } + Layout::bus_type_t default_bus_type() const override { return Layout::IDLE_BUS; } +}; + +DECLARE_ABSTRACT_TABLE_TYPE( + Synth2Port, AttachedTable, + void vpn_params(int &width, int &depth, int &period, const char *&period_name) const override { + width = period = 1; + depth = layout_size(); + period_name = 0; + } bool global_binding = false; + bool output_used = false; int home_lineno = -1; std::set> home_rows; + json::map * add_stage_tbl_cfg(json::map & tbl, const char *type, int size) const override; + public + : int get_home_row_for_row(int row) const; + void add_alu_indexes(json::map &stage_tbl, std::string alu_indexes) const; + OVERLOAD_FUNC_FOREACH(TARGET_CLASS, std::vector, determine_spare_bank_memory_units, + () const, (), override) + OVERLOAD_FUNC_FOREACH(TARGET_CLASS, void, alloc_vpns, (), ()) template + void write_regs_vt(REGS ®s); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_regs, (mau_regs & regs), override) + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + override = 0) + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. 
+ void common_init_setup(const VECTOR(pair_t) &, bool, P4Table::type) override; + bool common_setup(pair_t &, const VECTOR(pair_t) &, P4Table::type) override; + void pass1() override; void pass2() override; void pass3() override;) + +DECLARE_TABLE_TYPE( + CounterTable, Synth2Port, "counter", + enum {NONE = 0, PACKETS = 1, BYTES = 2, BOTH = 3} type = NONE; + int teop = -1; bool teop_initialized = false; int bytecount_adjust = 0; + table_type_t table_type() const override { return COUNTER; } + // FIXME: This comment is necessary to stop cpplint from complaining. The format is off because + // this code is within a macro. + template + void write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + override) + + template + void setup_teop_regs(REGS ®s, int stats_group_index); + template void write_alu_vpn_range(REGS ®s); + template void setup_teop_regs_2(REGS ®s, int stats_group_index); + template void write_alu_vpn_range_2(REGS ®s); + + struct lrt_params { // largest recent with threshold paramters + int lineno; + int64_t threshold; + int interval; + lrt_params(int l, int64_t t, int i) : lineno(l), threshold(t), interval(i) {} + explicit lrt_params(const value_t &); + }; + std::vector lrt; public + : int home_row() const override { return layout.at(0).row; } int direct_shiftcount() + const override; + int indirect_shiftcount() const override; + unsigned determine_shiftcount(Table::Call &call, int group, unsigned word, int tcam_shift) + const override; + int address_shift() const override; + bool run_at_eop() override { return (type & BYTES) != 0; } bool adr_mux_select_stats() + override { return true; } int unitram_type() override { return UnitRam::STATISTICS; }) + +DECLARE_TABLE_TYPE( + MeterTable, Synth2Port, "meter", int red_nodrop_value = -1; int red_drop_value = -1; + int 
green_value = 0; int yellow_value = 1; int red_value = 3; int profile = 0; int teop = -1; + bool teop_initialized = false; int bytecount_adjust = 0; + enum {NONE = 0, STANDARD = 1, LPF = 2, RED = 3} type = NONE; + enum {NONE_ = 0, PACKETS = 1, BYTES = 2} count = NONE_; std::vector color_maprams; + table_type_t table_type() const override { return METER; } template + void write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args); + template void meter_color_logical_to_phys(REGS ®s, int logical_id, int alu); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, void write_merge_regs, + (mau_regs & regs, MatchTable *match, int type, int bus, + const std::vector &args), + override) + + template + void setup_teop_regs(REGS ®s, int meter_group_index); + template void write_alu_vpn_range(REGS ®s); + template void write_regs_home_row(REGS ®s, unsigned row); + template void write_mapram_color_regs(REGS ®s, bool &push_on_overflow); + + template void setup_teop_regs_2(REGS ®s, int stats_group_index); + template void write_alu_vpn_range_2(REGS ®s); + + int sweep_interval = 2; public + : enum {NO_COLOR_MAP, IDLE_MAP_ADDR, STATS_MAP_ADDR} color_mapram_addr = NO_COLOR_MAP; + int direct_shiftcount() const override; int indirect_shiftcount() const override; + int address_shift() const override; bool color_aware = false; + bool color_aware_per_flow_enable = false; bool color_used = false; + int pre_color_hash_dist_unit = -1; int pre_color_bit_lo = -1; + bool run_at_eop() override { return type == STANDARD; } int unitram_type() override { + return UnitRam::METER; + } int home_row() const override { return layout.at(0).row | 3; } unsigned meter_group() + const { return layout.at(0).row / 4U; } bool uses_colormaprams() const override { + return !color_maprams.empty(); + } unsigned determine_shiftcount(Table::Call &call, int group, unsigned word, int tcam_shift) + const override; + void add_cfg_reg(json::vector &cfg_cache, std::string full_name, std::string name, 
unsigned val, + unsigned width); + Layout::bus_type_t default_bus_type() const override; int default_pfe_adjust() const override { + return color_aware ? -METER_TYPE_BITS : 0; + } void set_color_used() override { color_used = true; } void set_output_used() override { + output_used = true; + } int color_shiftcount(Table::Call &call, int group, int tcam_shift) const override; + template + void setup_exact_shift(REGS &merge, int bus, int group, int word, int word_group, + Call &meter_call, Call &color_call); + template + void setup_tcam_shift(REGS &merge, int bus, int tcam_shift, Call &meter_call, Call &color_call); + template void write_color_regs(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args);) + +namespace StatefulAlu { +struct TMatchOP; +struct TMatchInfo { + const Table::Actions::Action *act; + const TMatchOP *op; +}; + +Instruction *genNoop(StatefulTable *tbl, Table::Actions::Action *act); +} // namespace StatefulAlu + +DECLARE_TABLE_TYPE(StatefulTable, Synth2Port, "stateful", + table_type_t table_type() const override { return STATEFUL; } + bool setup_jbay(const pair_t &kv); + template void write_action_regs_vt(REGS ®s, const Actions::Action *); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + void write_action_regs, (mau_regs ®s, const Actions::Action *act), override) + template void write_merge_regs_vt(REGS ®s, MatchTable *match, int type, int bus, + const std::vector &args); + template void write_logging_regs(REGS ®s); + FOR_ALL_REGISTER_SETS(TARGET_OVERLOAD, + void write_merge_regs, (mau_regs ®s, MatchTable *match, int type, + int bus, const std::vector &args), override) + template void write_tofino2_common_regs(REGS ®s); + struct const_info_t { + int lineno; + int64_t value; + bool is_param; + std::string param_name; + unsigned param_handle; + static unsigned unique_register_param_handle; + const_info_t() = default; + const_info_t(int lineno, + int64_t value, + bool is_param = false, + std::string param_name = "", + unsigned 
param_handle = 0) + : lineno(lineno), value(value), is_param(is_param), + param_name(param_name), param_handle(param_handle) { + if (is_param) this->param_handle = unique_register_param_handle++; + } + }; + std::vector const_vals; + struct MathTable { + int lineno = -1; + std::vector data; + bool invert = false; + int shift = 0, scale = 0; + explicit operator bool() { return lineno >= 0; } + void check(); + } math_table; + bool dual_mode = false; + bool offset_vpn = false; + bool address_used = false; + int meter_adr_shift = 0; + int stateful_counter_mode = 0; + int watermark_level = 0; + int watermark_pop_not_push = 0; + uint64_t initial_value_lo = 0; + uint64_t initial_value_hi = 0; + unsigned data_bytemask = 0; + unsigned hash_bytemask = 0; + int logvpn_lineno = -1; + int logvpn_min = -1, logvpn_max = -1; + int pred_shift = 0, pred_comb_shift = 0; + int stage_alu_id = -1; + Ref underflow_action, overflow_action; + + // NOLINTNEXTLINE (whitespace/indent) + public: + Ref bound_selector; + unsigned phv_byte_mask = 0; + std::vector sbus_learn, sbus_match; + enum { SBUS_OR = 0, SBUS_AND = 1 } sbus_comb = SBUS_OR; + int phv_hash_shift = 0; + bitvec phv_hash_mask = bitvec(0, 128); + Instruction *output_lmatch = nullptr; // output instruction using lmatch + bitvec clear_value; + uint32_t busy_value = 0; + bool divmod_used = false; + int instruction_set() override { return 1; /* STATEFUL_ALU */ } + int direct_shiftcount() const override; + int indirect_shiftcount() const override; + int address_shift() const override; + int unitram_type() override { return UnitRam::STATEFUL; } + int get_const(int lineno, int64_t v); + bool is_dual_mode() const { return dual_mode; } + int alu_size() const { return 1 << std::min(5U, format->log2size - is_dual_mode()); } + int home_row() const override { return layout.at(0).row | 3; } + unsigned meter_group() const { return layout.at(0).row/4U; } + unsigned determine_shiftcount(Table::Call &call, int group, unsigned word, + int tcam_shift) 
const override; + unsigned per_flow_enable_bit(MatchTable *m = nullptr) const override; + void set_address_used() override { address_used = true; } + void set_output_used() override { output_used = true; } + void parse_register_params(int idx, const value_t &val); + int64_t get_const_val(int index) const { return const_vals.at(index).value; } + Actions::Action *action_for_table_action(const MatchTable *tbl, const Actions::Action *) const; + OVERLOAD_FUNC_FOREACH(REGISTER_SET, static int, parse_counter_mode, (const value_t &v), (v)) + OVERLOAD_FUNC_FOREACH(REGISTER_SET, void, set_counter_mode, (int mode), (mode)) + OVERLOAD_FUNC_FOREACH(REGISTER_SET, + void, gen_tbl_cfg, (json::map &tbl, json::map &stage_tbl) const, (tbl, stage_tbl)) + BFN::Alloc1D tmatch_use; + + bool p4c_5192_workaround(const Actions::Action *) const; +) + +#endif /* BACKENDS_TOFINO_BF_ASM_TABLES_H_ */ diff --git a/backends/tofino/bf-asm/target.cpp b/backends/tofino/bf-asm/target.cpp new file mode 100644 index 00000000000..41b3b1f7c50 --- /dev/null +++ b/backends/tofino/bf-asm/target.cpp @@ -0,0 +1,321 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/target.h" + +#include "asm-types.h" +#include "backends/tofino/bf-asm/config.h" +#include "backends/tofino/bf-asm/tables.h" +#include "bson.h" +#include "parser.h" +#include "ubits.h" + +void declare_registers(const Target::Tofino::top_level_regs *regs) { + declare_registers(®s->mem_top, sizeof(regs->mem_top), + [=](std::ostream &out, const char *addr, const void *end) { + out << "memories.top"; + regs->mem_top.emit_fieldname(out, addr, end); + }); + declare_registers(®s->mem_pipe, sizeof(regs->mem_pipe), + [=](std::ostream &out, const char *addr, const void *end) { + out << "memories.pipe"; + regs->mem_pipe.emit_fieldname(out, addr, end); + }); + declare_registers(®s->reg_top, sizeof(regs->reg_top), + [=](std::ostream &out, const char *addr, const void *end) { + out << "registers.top"; + regs->reg_top.emit_fieldname(out, addr, end); + }); + declare_registers(®s->reg_pipe, sizeof(regs->reg_pipe), + [=](std::ostream &out, const char *addr, const void *end) { + out << "registers.pipe"; + regs->reg_pipe.emit_fieldname(out, addr, end); + }); +} +void undeclare_registers(const Target::Tofino::top_level_regs *regs) { + undeclare_registers(®s->mem_top); + undeclare_registers(®s->mem_pipe); + undeclare_registers(®s->reg_top); + undeclare_registers(®s->reg_pipe); +} + +void declare_registers(const Target::Tofino::parser_regs *regs) { + declare_registers(®s->memory[INGRESS], sizeof regs->memory[INGRESS], + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.mem[INGRESS]"; + regs->memory[INGRESS].emit_fieldname(out, addr, end); + }); + declare_registers(®s->memory[EGRESS], sizeof regs->memory[EGRESS], + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.mem[EGRESS]"; + regs->memory[EGRESS].emit_fieldname(out, addr, end); + }); + declare_registers(®s->ingress, sizeof regs->ingress, + [=](std::ostream &out, const char *addr, const void 
*end) { + out << "parser.ibp_reg"; + regs->ingress.emit_fieldname(out, addr, end); + }); + declare_registers(®s->egress, sizeof regs->egress, + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.ebp_reg"; + regs->egress.emit_fieldname(out, addr, end); + }); + declare_registers(®s->merge, sizeof regs->merge, + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.merge"; + regs->merge.emit_fieldname(out, addr, end); + }); +} +void undeclare_registers(const Target::Tofino::parser_regs *regs) { + undeclare_registers(®s->memory[INGRESS]); + undeclare_registers(®s->memory[EGRESS]); + undeclare_registers(®s->ingress); + undeclare_registers(®s->egress); + undeclare_registers(®s->merge); +} +void declare_registers(const Target::Tofino::mau_regs *regs, bool, int stage) { + declare_registers(regs, sizeof *regs, + [=](std::ostream &out, const char *addr, const void *end) { + out << "mau[" << stage << "]"; + regs->emit_fieldname(out, addr, end); + }); +} +void declare_registers(const Target::Tofino::deparser_regs *regs) { + declare_registers(®s->input, sizeof(regs->input), + [=](std::ostream &out, const char *addr, const void *end) { + out << "deparser.input_phase"; + regs->input.emit_fieldname(out, addr, end); + }); + declare_registers(®s->header, sizeof(regs->header), + [=](std::ostream &out, const char *addr, const void *end) { + out << "deparser.header_phase"; + regs->header.emit_fieldname(out, addr, end); + }); +} +void undeclare_registers(const Target::Tofino::deparser_regs *regs) { + undeclare_registers(®s->input); + undeclare_registers(®s->header); +} + +void emit_parser_registers(const Target::Tofino::top_level_regs *regs, std::ostream &out) { + std::set emitted_parsers; + // The driver can reprogram parser blocks at runtime. We output parser + // blocks in the binary with the same base address. 
The driver uses the + // parser handle at the start of each block to associate the parser block + // with its respective parser node in context.json. + // In a p4 program, the user can associate multiple parsers to a + // multi-parser configuration but only map a few ports. The unmapped + // parser(s) will be output in context.json node and binary but not have an + // associated port map in context.json. The driver will not initialize any + // parsers with these unmapped parser(s) but use them to reconfigure at + // runtime if required. + uint64_t pipe_mem_base_addr = 0x200000000000; + uint64_t prsr_mem_base_addr = (pipe_mem_base_addr + 0x1C800000000) >> 4; + uint64_t pipe_regs_base_addr = 0x2000000; + uint64_t prsr_regs_base_addr = pipe_regs_base_addr + 0x700000; + for (auto ig : regs->parser_ingress) { + out << binout::tag('P') << binout::byte4(ig.first); + ig.second->emit_binary(out, prsr_regs_base_addr); + } + for (auto ig : regs->parser_memory[INGRESS]) { + out << binout::tag('P') << binout::byte4(ig.first); + ig.second->emit_binary(out, prsr_mem_base_addr); + } + prsr_regs_base_addr = pipe_regs_base_addr + 0x740000; + for (auto eg : regs->parser_egress) { + out << binout::tag('P') << binout::byte4(eg.first); + eg.second->emit_binary(out, prsr_regs_base_addr); + } + prsr_mem_base_addr = (pipe_mem_base_addr + 0x1C800400000) >> 4; + for (auto eg : regs->parser_memory[EGRESS]) { + out << binout::tag('P') << binout::byte4(eg.first); + eg.second->emit_binary(out, prsr_mem_base_addr); + } +} + +void declare_registers(const Target::JBay::top_level_regs *regs) { + declare_registers(®s->mem_top, sizeof(regs->mem_top), + [=](std::ostream &out, const char *addr, const void *end) { + out << "memories.top"; + regs->mem_top.emit_fieldname(out, addr, end); + }); + declare_registers(®s->mem_pipe, sizeof(regs->mem_pipe), + [=](std::ostream &out, const char *addr, const void *end) { + out << "memories.pipe"; + regs->mem_pipe.emit_fieldname(out, addr, end); + }); + 
declare_registers(®s->reg_top, sizeof(regs->reg_top), + [=](std::ostream &out, const char *addr, const void *end) { + out << "registers.top"; + regs->reg_top.emit_fieldname(out, addr, end); + }); + declare_registers(®s->reg_pipe, sizeof(regs->reg_pipe), + [=](std::ostream &out, const char *addr, const void *end) { + out << "registers.pipe"; + regs->reg_pipe.emit_fieldname(out, addr, end); + }); +} +void undeclare_registers(const Target::JBay::top_level_regs *regs) { + undeclare_registers(®s->mem_top); + undeclare_registers(®s->mem_pipe); + undeclare_registers(®s->reg_top); + undeclare_registers(®s->reg_pipe); +} +void declare_registers(const Target::JBay::parser_regs *regs) { + declare_registers(®s->memory[INGRESS], sizeof regs->memory[INGRESS], + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.mem[INGRESS]"; + regs->memory[INGRESS].emit_fieldname(out, addr, end); + }); + declare_registers(®s->memory[EGRESS], sizeof regs->memory[EGRESS], + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.mem[EGRESS]"; + regs->memory[EGRESS].emit_fieldname(out, addr, end); + }); + declare_registers(®s->ingress, sizeof regs->ingress, + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.ipb_reg"; + regs->ingress.emit_fieldname(out, addr, end); + }); + declare_registers(®s->egress, sizeof regs->egress, + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.epb_reg"; + regs->egress.emit_fieldname(out, addr, end); + }); + declare_registers(®s->main[INGRESS], sizeof regs->main[INGRESS], + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.ingress.main"; + regs->main[INGRESS].emit_fieldname(out, addr, end); + }); + declare_registers(®s->main[EGRESS], sizeof regs->main[EGRESS], + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.egress.main"; + regs->main[EGRESS].emit_fieldname(out, addr, end); + }); + declare_registers(®s->merge, 
sizeof regs->merge, + [=](std::ostream &out, const char *addr, const void *end) { + out << "parser.merge"; + regs->merge.emit_fieldname(out, addr, end); + }); +} +void undeclare_registers(const Target::JBay::parser_regs *regs) { + undeclare_registers(®s->memory[INGRESS]); + undeclare_registers(®s->memory[EGRESS]); + undeclare_registers(®s->ingress); + undeclare_registers(®s->egress); + undeclare_registers(®s->main[INGRESS]); + undeclare_registers(®s->main[EGRESS]); + undeclare_registers(®s->merge); +} +void declare_registers(const Target::JBay::mau_regs *regs, bool, int stage) { + declare_registers(regs, sizeof *regs, + [=](std::ostream &out, const char *addr, const void *end) { + out << "mau[" << stage << "]"; + regs->emit_fieldname(out, addr, end); + }); +} +void declare_registers(const Target::JBay::deparser_regs *regs) { + declare_registers(regs, sizeof *regs, + [=](std::ostream &out, const char *addr, const void *end) { + out << "deparser.regs"; + regs->emit_fieldname(out, addr, end); + }); +} + +void emit_parser_registers(const Target::JBay::top_level_regs *regs, std::ostream &out) { + std::set emitted_parsers; + for (auto ig : regs->parser_ingress) { + json::map header; + header["handle"] = ig.first; + out << binout::tag('P') << json::binary(header); + ig.second->emit_binary(out, 0); + } + for (auto eg : regs->parser_egress) { + json::map header; + header["handle"] = eg.first; + out << binout::tag('P') << json::binary(header); + eg.second->emit_binary(out, 0); + } + for (auto ig : regs->parser_main[INGRESS]) { + json::map header; + header["handle"] = ig.first; + out << binout::tag('P') << json::binary(header); + ig.second->emit_binary(out, 0); + } + for (auto eg : regs->parser_main[EGRESS]) { + json::map header; + header["handle"] = eg.first; + out << binout::tag('P') << json::binary(header); + eg.second->emit_binary(out, 0); + } + for (auto ig : regs->parser_memory[INGRESS]) { + json::map header; + header["handle"] = ig.first; + out << binout::tag('P') << 
json::binary(header); + ig.second->emit_binary(out, 0); + } + for (auto eg : regs->parser_memory[EGRESS]) { + json::map header; + header["handle"] = eg.first; + out << binout::tag('P') << json::binary(header); + eg.second->emit_binary(out, 0); + } +} + +int Target::numMauStagesOverride = 0; + +int Target::encodeConst(int src) { + SWITCH_FOREACH_TARGET(options.target, return TARGET::encodeConst(src);); + BUG(); + return 0; +} + +void Target::OVERRIDE_NUM_MAU_STAGES(int num) { + int allowed = NUM_MAU_STAGES_PRIVATE(); + BUG_CHECK(num > 0 && num <= allowed, + "Invalid override for NUM_MAU_STAGES. Allowed range is <1, %d>, got %d.", allowed, + num); + + numMauStagesOverride = num; + return; +} + +int Target::NUM_BUS_OF_TYPE_v(int bus_type) const { + // default values for Tofino1/2 + switch (static_cast(bus_type)) { + case Table::Layout::SEARCH_BUS: + case Table::Layout::RESULT_BUS: + case Table::Layout::TIND_BUS: + return 2; + case Table::Layout::IDLE_BUS: + return 20; + default: + return 0; + } +} + +int Target::NUM_BUS_OF_TYPE(int bus_type) { + SWITCH_FOREACH_TARGET(options.target, return TARGET().NUM_BUS_OF_TYPE_v(bus_type);) +} + +// should these be inline in the header file? +#define DEFINE_PER_TARGET_CONSTANT(TYPE, NAME) \ + TYPE Target::NAME() { \ + SWITCH_FOREACH_TARGET(options.target, return TARGET::NAME;) \ + return std::conditional_t, std::nullptr_t, TYPE>(); \ + } +PER_TARGET_CONSTANTS(DEFINE_PER_TARGET_CONSTANT) diff --git a/backends/tofino/bf-asm/target.h b/backends/tofino/bf-asm/target.h new file mode 100644 index 00000000000..6c6b6103ffa --- /dev/null +++ b/backends/tofino/bf-asm/target.h @@ -0,0 +1,710 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
 You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 *
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#ifndef TARGET_H_
#define TARGET_H_

#include "asm-types.h"
#include "backends/tofino/bf-asm/config.h"
#include "bfas.h"
#include "map.h"

struct MemUnit;

/** FOR_ALL_TARGETS -- metamacro that expands a macro for each defined target
 *  FOR_ALL_REGISTER_SETS -- metamacro that expands for each distinct register set;
 *          basically a subset of targets with one per distinct register set
 *  FOR_ALL_TARGET_CLASSES -- metamacro that expands for each distinct target class
 *          a subset of the register sets
 */
#define FOR_ALL_TARGETS(M, ...) \
    M(Tofino, ##__VA_ARGS__)    \
    M(JBay, ##__VA_ARGS__)      \
    M(Tofino2H, ##__VA_ARGS__)  \
    M(Tofino2M, ##__VA_ARGS__)  \
    M(Tofino2U, ##__VA_ARGS__)  \
    M(Tofino2A0, ##__VA_ARGS__)
#define FOR_ALL_REGISTER_SETS(M, ...) \
    M(Tofino, ##__VA_ARGS__)          \
    M(JBay, ##__VA_ARGS__)
#define FOR_ALL_TARGET_CLASSES(M, ...) M(Tofino, ##__VA_ARGS__)

// alias FOR_ALL -> FOR_EACH so the group name doesn't need to be plural
#define FOR_EACH_TARGET FOR_ALL_TARGETS
#define FOR_EACH_REGISTER_SET FOR_ALL_REGISTER_SETS
#define FOR_EACH_TARGET_CLASS FOR_ALL_TARGET_CLASSES

// Membership tables: which targets belong to the Tofino class, and which
// targets share each register set (all Tofino2 variants reuse the JBay regs).
#define TARGETS_IN_CLASS_Tofino(M, ...) \
    M(Tofino, ##__VA_ARGS__)            \
    M(JBay, ##__VA_ARGS__)              \
    M(Tofino2H, ##__VA_ARGS__)          \
    M(Tofino2M, ##__VA_ARGS__)          \
    M(Tofino2U, ##__VA_ARGS__)          \
    M(Tofino2A0, ##__VA_ARGS__)
#define REGSETS_IN_CLASS_Tofino(M, ...) \
    M(Tofino, ##__VA_ARGS__)            \
    M(JBay, ##__VA_ARGS__)
#define TARGETS_USING_REGS_JBay(M, ...) \
    M(JBay, ##__VA_ARGS__)              \
    M(Tofino2H, ##__VA_ARGS__)          \
    M(Tofino2M, ##__VA_ARGS__)          \
    M(Tofino2U, ##__VA_ARGS__)          \
    M(Tofino2A0, ##__VA_ARGS__)
#define TARGETS_USING_REGS_Tofino(M, ...) M(Tofino, ##__VA_ARGS__)

// Indirection so CL can be a macro argument expanded before pasting.
#define TARGETS_IN_CLASS(CL, ...) TARGETS_IN_CLASS_##CL(__VA_ARGS__)
#define TARGETS_USING_REGS(CL, ...) TARGETS_USING_REGS_##CL(__VA_ARGS__)
#define REGSETS_IN_CLASS(CL, ...) REGSETS_IN_CLASS_##CL(__VA_ARGS__)

#define EXPAND(...) __VA_ARGS__
#define EXPAND_COMMA(...) , ##__VA_ARGS__
// NOTE: deliberately supplies the closing ')' -- used by DECL_OVERLOAD_FUNC below.
#define EXPAND_COMMA_CLOSE(...) ,##__VA_ARGS__ )
#define INSTANTIATE_TARGET_TEMPLATE(TARGET, FUNC, ...) template FUNC(Target::TARGET::__VA_ARGS__);
#define DECLARE_TARGET_CLASS(TARGET, ...) class TARGET __VA_ARGS__;
#define FRIEND_TARGET_CLASS(TARGET, ...) friend class Target::TARGET __VA_ARGS__;
#define TARGET_OVERLOAD(TARGET, FN, ARGS, ...) FN(Target::TARGET::EXPAND ARGS) __VA_ARGS__;

// One entry per per-target constant; expanded both to declare the static
// accessors on Target (below) and to define their dispatchers in target.cpp.
#define PER_TARGET_CONSTANTS(M)                        \
    M(const char *, name)                              \
    M(target_t, register_set)                          \
    M(int, ARAM_UNITS_PER_STAGE)                       \
    M(int, DEPARSER_CHECKSUM_UNITS)                    \
    M(int, DEPARSER_CONSTANTS)                         \
    M(int, DEPARSER_MAX_FD_ENTRIES)                    \
    M(int, DEPARSER_MAX_POV_BYTES)                     \
    M(int, DEPARSER_MAX_POV_PER_USE)                   \
    M(int, DP_UNITS_PER_STAGE)                         \
    M(int, DYNAMIC_CONFIG)                             \
    M(int, DYNAMIC_CONFIG_INPUT_BITS)                  \
    M(bool, EGRESS_SEPARATE)                           \
    M(int, END_OF_PIPE)                                \
    M(int, EXACT_HASH_GROUPS)                          \
    M(int, EXACT_HASH_TABLES)                          \
    M(int, EXTEND_ALU_8_SLOTS)                         \
    M(int, EXTEND_ALU_16_SLOTS)                        \
    M(int, EXTEND_ALU_32_SLOTS)                        \
    M(bool, GATEWAY_INHIBIT_INDEX)                     \
    M(int, GATEWAY_MATCH_BITS)                         \
    M(bool, GATEWAY_NEEDS_SEARCH_BUS)                  \
    M(int, GATEWAY_PAYLOAD_GROUPS)                     \
    M(int, GATEWAY_ROWS)                               \
    M(bool, GATEWAY_SINGLE_XBAR_GROUP)                 \
    M(bool, HAS_MPR)                                   \
    M(int, INSTR_SRC2_BITS)                            \
    M(int, IMEM_COLORS)                                \
    M(int, IXBAR_HASH_GROUPS)                          \
    M(int, IXBAR_HASH_INDEX_MAX)                       \
    M(int, IXBAR_HASH_INDEX_STRIDE)                    \
    M(int, LOCAL_TIND_UNITS)                           \
    M(int, LONG_BRANCH_TAGS)                           \
    M(int, MAX_IMMED_ACTION_DATA)                      \
    M(int, MAX_OVERHEAD_OFFSET)                        \
    M(int, MAX_OVERHEAD_OFFSET_NEXT)                   \
    M(int, MATCH_BYTE_16BIT_PAIRS)                     \
    M(int, MATCH_REQUIRES_PHYSID)                      \
    M(int, MAU_BASE_DELAY)                             \
    M(int, MAU_BASE_PREDICATION_DELAY)                 \
    M(int, MAU_ERROR_DELAY_ADJUST)                     \
    M(int, METER_ALU_GROUP_DATA_DELAY)                 \
    M(int, MINIMUM_INSTR_CONSTANT)                     \
    M(bool, NEXT_TABLE_EXEC_COMBINED)                  \
    M(int, NEXT_TABLE_SUCCESSOR_TABLE_DEPTH)           \
    M(int, NUM_MAU_STAGES_PRIVATE)                     \
    M(int, NUM_EGRESS_STAGES_PRIVATE)                  \
    M(int, NUM_PARSERS)                                \
    M(int, NUM_PIPES)                                  \
    M(bool, OUTPUT_STAGE_EXTENSION_PRIVATE)            \
    M(int, PARSER_CHECKSUM_UNITS)                      \
    M(bool, PARSER_EXTRACT_BYTES)                      \
    M(int, PARSER_DEPTH_MAX_BYTES_INGRESS)             \
    M(int, PARSER_DEPTH_MAX_BYTES_EGRESS)              \
    M(int, PARSER_DEPTH_MAX_BYTES_MULTITHREADED_EGRESS) \
    M(int, PARSER_DEPTH_MIN_BYTES_INGRESS)             \
    M(int, PARSER_DEPTH_MIN_BYTES_EGRESS)              \
    M(int, PHASE0_FORMAT_WIDTH)                        \
    M(bool, REQUIRE_TCAM_ID)                           \
    M(int, SRAM_EGRESS_ROWS)                           \
    M(bool, SRAM_GLOBAL_ACCESS)                        \
    M(int, SRAM_HBUS_SECTIONS_PER_STAGE)               \
    M(int, SRAM_HBUSSES_PER_ROW)                       \
    M(int, SRAM_INGRESS_ROWS)                          \
    M(int, SRAM_LOGICAL_UNITS_PER_ROW)                 \
    M(int, SRAM_LAMBS_PER_STAGE)                       \
    M(int, SRAM_REMOVED_COLUMNS)                       \
    M(int, SRAM_STRIDE_COLUMN)                         \
    M(int, SRAM_STRIDE_ROW)                            \
    M(int, SRAM_STRIDE_STAGE)                          \
    M(int, SRAM_UNITS_PER_ROW)                         \
    M(int, STATEFUL_ALU_ADDR_WIDTH)                    \
    M(int, STATEFUL_ALU_CONST_MASK)                    \
    M(int, STATEFUL_ALU_CONST_MAX)                     \
    M(int, STATEFUL_ALU_CONST_MIN)                     \
    M(int, STATEFUL_ALU_CONST_WIDTH)                   \
    M(int, STATEFUL_CMP_ADDR_WIDTH)                    \
    M(int, STATEFUL_CMP_CONST_MASK)                    \
    M(int, STATEFUL_CMP_CONST_MAX)                     \
    M(int, STATEFUL_CMP_CONST_MIN)                     \
    M(int, STATEFUL_CMP_CONST_WIDTH)                   \
    M(int, STATEFUL_CMP_UNITS)                         \
    M(int, STATEFUL_OUTPUT_UNITS)                      \
    M(int, STATEFUL_PRED_MASK)                         \
    M(int, STATEFUL_REGFILE_CONST_WIDTH)               \
    M(int, STATEFUL_REGFILE_ROWS)                      \
    M(int, STATEFUL_TMATCH_UNITS)                      \
    M(bool, SUPPORT_ALWAYS_RUN)                        \
    M(bool, SUPPORT_CONCURRENT_STAGE_DEP)              \
    M(bool, SUPPORT_OVERFLOW_BUS)                      \
    M(bool, SUPPORT_SALU_FAST_CLEAR)                   \
    M(bool, SUPPORT_TRUE_EOP)                          \
    M(bool, SYNTH2PORT_NEED_MAPRAMS)                   \
    M(bool, TCAM_EXTRA_NIBBLE)                         \
    M(bool, TCAM_GLOBAL_ACCESS)                        \
    M(int, TCAM_MATCH_BUSSES)                          \
    M(int, TCAM_MEMORY_FULL_WIDTH)                     \
    M(int, TCAM_ROWS)                                  \
    M(int, TCAM_UNITS_PER_ROW)                         \
    M(int, TCAM_XBAR_GROUPS)                           \
    M(bool, TABLES_REQUIRE_ROW)

#define DECLARE_PER_TARGET_CONSTANT(TYPE, NAME) static TYPE NAME();

// Forward declarations nested into each target class by the macros below.
#define TARGET_CLASS_SPECIFIC_CLASSES \
    class ActionTable;                \
    class CounterTable;               \
    class ExactMatchTable;            \
    class GatewayTable;               \
    class MeterTable;                 \
    class StatefulTable;              \
    class TernaryIndirectTable;       \
    class TernaryMatchTable;
#define REGISTER_SET_SPECIFIC_CLASSES /* none */
#define TARGET_SPECIFIC_CLASSES       /* none */

// Base class for all targets: per-target constants are exposed as static
// accessors that dispatch on options.target; subclasses supply the values
// as enum constants.
class Target {
 public:
    class Phv;
    FOR_ALL_TARGETS(DECLARE_TARGET_CLASS)
    PER_TARGET_CONSTANTS(DECLARE_PER_TARGET_CONSTANT)

    static int encodeConst(int src);

    // Stage count, honoring an OVERRIDE_NUM_MAU_STAGES() call if one was made.
    static int NUM_MAU_STAGES() {
        return numMauStagesOverride ? numMauStagesOverride : NUM_MAU_STAGES_PRIVATE();
    }
    // Egress stage count is capped by the override as well.
    static int NUM_EGRESS_STAGES() {
        int egress_stages = NUM_EGRESS_STAGES_PRIVATE();
        return numMauStagesOverride && numMauStagesOverride < egress_stages ? numMauStagesOverride
                                                                           : egress_stages;
    }
    static int NUM_STAGES(gress_t gr) {
        return gr == EGRESS ? NUM_EGRESS_STAGES() : NUM_MAU_STAGES();
    }

    // An overridden (shortened) pipe always needs the output stage extension.
    static int OUTPUT_STAGE_EXTENSION() {
        return numMauStagesOverride ? 1 : OUTPUT_STAGE_EXTENSION_PRIVATE();
    }

    static void OVERRIDE_NUM_MAU_STAGES(int num);

    static int SRAM_ROWS(gress_t gr) {
        return gr == EGRESS ? SRAM_EGRESS_ROWS() : SRAM_INGRESS_ROWS();
    }

    // FIXME -- bus_type here is a Table::Layout::bus_type_t, but can't forward
    // declare a nested type.
    virtual int NUM_BUS_OF_TYPE_v(int bus_type) const;
    static int NUM_BUS_OF_TYPE(int bus_type);

 private:
    // Non-zero once OVERRIDE_NUM_MAU_STAGES() has been called (see target.cpp).
    static int numMauStagesOverride;
};

#include "backends/tofino/bf-asm/gen/tofino/memories.pipe_addrmap.h"
#include "backends/tofino/bf-asm/gen/tofino/memories.pipe_top_level.h"
#include "backends/tofino/bf-asm/gen/tofino/memories.prsr_mem_main_rspec.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.dprsr_hdr.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.dprsr_inp.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.ebp_rspec.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.ibp_rspec.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.mau_addrmap.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.pipe_addrmap.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.prsr_reg_merge_rspec.h"
#include "backends/tofino/bf-asm/gen/tofino/regs.tofino.h"

// Tofino1: register-set layout types plus all per-target constants.
// NOTE(review): template arguments on the std::map members below were lost in
// extraction ("std::map parser_memory[2];" etc.) -- confirm the key/value
// types (presumably handle -> register-struct pointer) against the original.
class Target::Tofino : public Target {
 public:
    static constexpr const char *const name = "tofino";
    static constexpr target_t tag = TOFINO;
    static constexpr target_t register_set = TOFINO;
    typedef Target::Tofino target_type;
    typedef Target::Tofino register_type;
    class Phv;
    struct top_level_regs {
        typedef ::Tofino::memories_top _mem_top;
        typedef ::Tofino::memories_pipe _mem_pipe;
        typedef ::Tofino::regs_top _regs_top;
        typedef ::Tofino::regs_pipe _regs_pipe;

        ::Tofino::memories_top mem_top;
        ::Tofino::memories_pipe mem_pipe;
        ::Tofino::regs_top reg_top;
        ::Tofino::regs_pipe reg_pipe;

        // map from handle to parser regs
        std::map parser_memory[2];
        std::map parser_ingress;
        std::map parser_egress;
        ::Tofino::regs_all_parse_merge parser_merge;
    };
    struct parser_regs : public ParserRegisterSet {
        typedef ::Tofino::memories_all_parser_ _memory;
        typedef ::Tofino::regs_all_parser_ingress _ingress;
        typedef ::Tofino::regs_all_parser_egress _egress;
        typedef ::Tofino::regs_all_parse_merge _merge;

        ::Tofino::memories_all_parser_ memory[2];
        ::Tofino::regs_all_parser_ingress ingress;
        ::Tofino::regs_all_parser_egress egress;
        ::Tofino::regs_all_parse_merge merge;
    };

    typedef ::Tofino::regs_match_action_stage_ mau_regs;
    struct deparser_regs {
        typedef ::Tofino::regs_all_deparser_input_phase _input;
        typedef ::Tofino::regs_all_deparser_header_phase _header;

        ::Tofino::regs_all_deparser_input_phase input;
        ::Tofino::regs_all_deparser_header_phase header;
    };
    // Per-target constants; read via the Target::NAME() static dispatchers.
    enum {
        ARAM_UNITS_PER_STAGE = 0,
        PARSER_CHECKSUM_UNITS = 2,
        PARSER_EXTRACT_BYTES = false,
        PARSER_DEPTH_MAX_BYTES_INGRESS = (((1 << 10) - 1) * 16),
        PARSER_DEPTH_MAX_BYTES_EGRESS = (((1 << 10) - 1) * 16),
        PARSER_DEPTH_MAX_BYTES_MULTITHREADED_EGRESS = 160,
        PARSER_DEPTH_MIN_BYTES_INGRESS = 0,
        PARSER_DEPTH_MIN_BYTES_EGRESS = 65,
        MATCH_BYTE_16BIT_PAIRS = true,
        MATCH_REQUIRES_PHYSID = false,
        MAX_IMMED_ACTION_DATA = 32,
        MAX_OVERHEAD_OFFSET = 64,
        MAX_OVERHEAD_OFFSET_NEXT = 40,
        NUM_MAU_STAGES_PRIVATE = 12,
        NUM_EGRESS_STAGES_PRIVATE = NUM_MAU_STAGES_PRIVATE,
        ACTION_INSTRUCTION_MAP_WIDTH = 7,
        DEPARSER_CHECKSUM_UNITS = 6,
        DEPARSER_CONSTANTS = 0,
        DEPARSER_MAX_POV_BYTES = 32,
        DEPARSER_MAX_POV_PER_USE = 1,
        DEPARSER_MAX_FD_ENTRIES = 192,
        DP_UNITS_PER_STAGE = 0,
        DYNAMIC_CONFIG = 0,
        DYNAMIC_CONFIG_INPUT_BITS = 0,
        EGRESS_SEPARATE = false,
        END_OF_PIPE = 0xff,
        EXACT_HASH_GROUPS = 8,
        EXACT_HASH_TABLES = 16,
        EXTEND_ALU_8_SLOTS = 0,
        EXTEND_ALU_16_SLOTS = 0,
        EXTEND_ALU_32_SLOTS = 0,
        GATEWAY_INHIBIT_INDEX = false,
        GATEWAY_MATCH_BITS = 56,  // includes extra expansion for range match
        GATEWAY_NEEDS_SEARCH_BUS = true,
        GATEWAY_PAYLOAD_GROUPS = 1,
        GATEWAY_ROWS = 8,
        GATEWAY_SINGLE_XBAR_GROUP = true,
        SUPPORT_TRUE_EOP = 0,
        INSTR_SRC2_BITS = 4,
        IMEM_COLORS = 2,
        IXBAR_HASH_GROUPS = 8,
        IXBAR_HASH_INDEX_MAX = 40,
        IXBAR_HASH_INDEX_STRIDE = 10,
        LOCAL_TIND_UNITS = 0,
        LONG_BRANCH_TAGS = 0,
        MAU_BASE_DELAY = 20,
        MAU_BASE_PREDICATION_DELAY = 11,
        MAU_ERROR_DELAY_ADJUST = 2,
        METER_ALU_GROUP_DATA_DELAY = 13,
        // To avoid under run scenarios, there is a minimum egress pipeline latency required
        MINIMUM_REQUIRED_EGRESS_PIPELINE_LATENCY = 160,
        NEXT_TABLE_EXEC_COMBINED = false,  // no next_exec on tofino1 at all
        NEXT_TABLE_SUCCESSOR_TABLE_DEPTH = 8,
        PHASE0_FORMAT_WIDTH = 64,
        REQUIRE_TCAM_ID = false,  // miss-only tables do not need a tcam id
        SRAM_EGRESS_ROWS = 8,
        SRAM_GLOBAL_ACCESS = false,
        SRAM_HBUS_SECTIONS_PER_STAGE = 0,
        SRAM_HBUSSES_PER_ROW = 0,
        SRAM_INGRESS_ROWS = 8,
        SRAM_LAMBS_PER_STAGE = 0,
        SRAM_LOGICAL_UNITS_PER_ROW = 6,
        SRAM_REMOVED_COLUMNS = 2,
        SRAM_STRIDE_COLUMN = 1,
        SRAM_STRIDE_ROW = 12,
        SRAM_STRIDE_STAGE = 0,
        SRAM_UNITS_PER_ROW = 12,
        STATEFUL_CMP_UNITS = 2,
        STATEFUL_CMP_ADDR_WIDTH = 2,
        STATEFUL_CMP_CONST_WIDTH = 4,
        STATEFUL_CMP_CONST_MASK = 0xf,
        STATEFUL_CMP_CONST_MIN = -8,
        STATEFUL_CMP_CONST_MAX = 7,
        STATEFUL_TMATCH_UNITS = 0,
        STATEFUL_OUTPUT_UNITS = 1,
        STATEFUL_PRED_MASK = (1U << (1 << STATEFUL_CMP_UNITS)) - 1,
        STATEFUL_REGFILE_ROWS = 4,
        STATEFUL_REGFILE_CONST_WIDTH = 32,
        SUPPORT_ALWAYS_RUN = 0,
        HAS_MPR = 0,
        SUPPORT_CONCURRENT_STAGE_DEP = 1,
        SUPPORT_OVERFLOW_BUS = 1,
        SUPPORT_SALU_FAST_CLEAR = 0,
        STATEFUL_ALU_ADDR_WIDTH = 2,
        STATEFUL_ALU_CONST_WIDTH = 4,
        STATEFUL_ALU_CONST_MASK = 0xf,
        STATEFUL_ALU_CONST_MIN = -8,  // TODO Is the same as the following one?
        STATEFUL_ALU_CONST_MAX = 7,
        MINIMUM_INSTR_CONSTANT = -8,  // TODO
        NUM_PARSERS = 18,
        NUM_PIPES = 4,
        OUTPUT_STAGE_EXTENSION_PRIVATE = 0,
        SYNTH2PORT_NEED_MAPRAMS = true,
        TCAM_EXTRA_NIBBLE = true,
        TCAM_GLOBAL_ACCESS = false,
        TCAM_MATCH_BUSSES = 2,
        TCAM_MEMORY_FULL_WIDTH = 47,
        TCAM_ROWS = 12,
        TCAM_UNITS_PER_ROW = 2,
        TCAM_XBAR_GROUPS = 12,
        TABLES_REQUIRE_ROW = 1,
    };
    // Tofino1 instruction-constant encoding: 10-bit payload with marker bits.
    static int encodeConst(int src) { return (src >> 10 << 15) | (0x8 << 10) | (src & 0x3ff); }
    TARGET_SPECIFIC_CLASSES
    REGISTER_SET_SPECIFIC_CLASSES
    TARGET_CLASS_SPECIFIC_CLASSES
};

void declare_registers(const Target::Tofino::top_level_regs *regs);
void undeclare_registers(const Target::Tofino::top_level_regs *regs);
void declare_registers(const Target::Tofino::parser_regs *regs);
void undeclare_registers(const Target::Tofino::parser_regs *regs);
void declare_registers(const Target::Tofino::mau_regs *regs, bool ignore, int stage);
void declare_registers(const Target::Tofino::deparser_regs *regs);
void undeclare_registers(const Target::Tofino::deparser_regs *regs);
void emit_parser_registers(const Target::Tofino::top_level_regs *regs, std::ostream &);

#include "backends/tofino/bf-asm/gen/jbay/memories.jbay_mem.h"
#include "backends/tofino/bf-asm/gen/jbay/memories.pipe_addrmap.h"
#include "backends/tofino/bf-asm/gen/jbay/memories.prsr_mem_main_rspec.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.dprsr_reg.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.epb_prsr4_reg.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.ipb_prsr4_reg.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.jbay_reg.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.mau_addrmap.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.pipe_addrmap.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.pmerge_reg.h"
#include "backends/tofino/bf-asm/gen/jbay/regs.prsr_reg_main_rspec.h"

// Tofino2 (JBay) base: the Tofino2 variants below derive from this and only
// override stage counts.
class Target::JBay : public Target {
 public:
    static constexpr const char *const name = "tofino2";
    static constexpr target_t tag = JBAY;
    static constexpr target_t register_set = JBAY;
    typedef Target::JBay target_type;
    typedef Target::JBay register_type;
    class Phv;
    struct top_level_regs {
        typedef ::JBay::memories_top _mem_top;
        typedef ::JBay::memories_pipe _mem_pipe;
        typedef ::JBay::regs_top _regs_top;
        typedef ::JBay::regs_pipe _regs_pipe;

        ::JBay::memories_top mem_top;
        ::JBay::memories_pipe mem_pipe;
        ::JBay::regs_top reg_top;
        ::JBay::regs_pipe reg_pipe;

        // map from handle to parser regs
        // NOTE(review): template arguments on the std::map members below were
        // lost in extraction -- confirm the key/value types against the original.
        std::map parser_memory[2];
        std::map parser_ingress;
        std::map parser_egress;
        std::map parser_main[2];
        ::JBay::regs_parse_merge parser_merge;
    };
    struct parser_regs : public ParserRegisterSet {
        typedef ::JBay::memories_parser_ _memory;
        typedef ::JBay::regs_parser_ingress _ingress;  // [9]
        typedef ::JBay::regs_parser_egress _egress;    // [9]
        typedef ::JBay::regs_parser_main_ _main;       // [9]
        typedef ::JBay::regs_parse_merge _merge;       // [1]

        ::JBay::memories_parser_ memory[2];
        ::JBay::regs_parser_ingress ingress;
        ::JBay::regs_parser_egress egress;
        ::JBay::regs_parser_main_ main[2];
        ::JBay::regs_parse_merge merge;
    };

    typedef ::JBay::regs_match_action_stage_ mau_regs;
    typedef ::JBay::regs_deparser deparser_regs;
    // Per-target constants; read via the Target::NAME() static dispatchers.
    enum : int {
        ARAM_UNITS_PER_STAGE = 0,
        PARSER_CHECKSUM_UNITS = 5,
        PARSER_EXTRACT_BYTES = true,
        PARSER_DEPTH_MAX_BYTES_INGRESS = (((1 << 10) - 1) * 16),
        PARSER_DEPTH_MAX_BYTES_EGRESS = (32 * 16),
        PARSER_DEPTH_MAX_BYTES_MULTITHREADED_EGRESS = (32 * 16),
        PARSER_DEPTH_MIN_BYTES_INGRESS = 0,
        PARSER_DEPTH_MIN_BYTES_EGRESS = 0,
        MATCH_BYTE_16BIT_PAIRS = false,
        MATCH_REQUIRES_PHYSID = false,
        MAX_IMMED_ACTION_DATA = 32,
        MAX_OVERHEAD_OFFSET = 64,
        MAX_OVERHEAD_OFFSET_NEXT = 40,
#ifdef EMU_OVERRIDE_STAGE_COUNT
        NUM_MAU_STAGES_PRIVATE = EMU_OVERRIDE_STAGE_COUNT,
        OUTPUT_STAGE_EXTENSION_PRIVATE = 1,
#else
        NUM_MAU_STAGES_PRIVATE = 20,
        OUTPUT_STAGE_EXTENSION_PRIVATE = 0,
#endif
        NUM_EGRESS_STAGES_PRIVATE = NUM_MAU_STAGES_PRIVATE,
        ACTION_INSTRUCTION_MAP_WIDTH = 8,
        DEPARSER_CHECKSUM_UNITS = 8,
        DEPARSER_CONSTANTS = 8,
        DEPARSER_MAX_POV_BYTES = 16,
        DEPARSER_MAX_POV_PER_USE = 1,
        DEPARSER_CHUNKS_PER_GROUP = 8,
        DEPARSER_CHUNK_SIZE = 8,
        DEPARSER_CHUNK_GROUPS = 16,
        DEPARSER_CLOTS_PER_GROUP = 4,
        DEPARSER_TOTAL_CHUNKS = DEPARSER_CHUNK_GROUPS * DEPARSER_CHUNKS_PER_GROUP,
        DEPARSER_MAX_FD_ENTRIES = DEPARSER_TOTAL_CHUNKS,
        DP_UNITS_PER_STAGE = 0,
        DYNAMIC_CONFIG = 0,
        DYNAMIC_CONFIG_INPUT_BITS = 0,
        EGRESS_SEPARATE = false,
        END_OF_PIPE = 0x1ff,
        EXACT_HASH_GROUPS = 8,
        EXACT_HASH_TABLES = 16,
        EXTEND_ALU_8_SLOTS = 0,
        EXTEND_ALU_16_SLOTS = 0,
        EXTEND_ALU_32_SLOTS = 0,
        GATEWAY_INHIBIT_INDEX = false,
        GATEWAY_MATCH_BITS = 56,  // includes extra expansion for range match
        GATEWAY_NEEDS_SEARCH_BUS = true,
        GATEWAY_PAYLOAD_GROUPS = 5,
        GATEWAY_ROWS = 8,
        GATEWAY_SINGLE_XBAR_GROUP = true,
        SUPPORT_TRUE_EOP = 1,
        INSTR_SRC2_BITS = 5,
        IMEM_COLORS = 2,
        IXBAR_HASH_GROUPS = 8,
        IXBAR_HASH_INDEX_MAX = 40,
        IXBAR_HASH_INDEX_STRIDE = 10,
        LOCAL_TIND_UNITS = 0,
        LONG_BRANCH_TAGS = 8,
        MAU_BASE_DELAY = 23,
        MAU_BASE_PREDICATION_DELAY = 13,
        MAU_ERROR_DELAY_ADJUST = 3,
        METER_ALU_GROUP_DATA_DELAY = 15,
        NEXT_TABLE_EXEC_COMBINED = true,
        NEXT_TABLE_SUCCESSOR_TABLE_DEPTH = 8,
        PHASE0_FORMAT_WIDTH = 128,
        REQUIRE_TCAM_ID = false,  // miss-only tables do not need a tcam id
        SRAM_EGRESS_ROWS = 8,
        SRAM_GLOBAL_ACCESS = false,
        SRAM_HBUS_SECTIONS_PER_STAGE = 0,
        SRAM_HBUSSES_PER_ROW = 0,
        SRAM_INGRESS_ROWS = 8,
        SRAM_LAMBS_PER_STAGE = 0,
        SRAM_LOGICAL_UNITS_PER_ROW = 6,
        SRAM_REMOVED_COLUMNS = 2,
        SRAM_STRIDE_COLUMN = 1,
        SRAM_STRIDE_ROW = 12,
        SRAM_STRIDE_STAGE = 0,
        SRAM_UNITS_PER_ROW = 12,
        STATEFUL_CMP_UNITS = 4,
        STATEFUL_CMP_ADDR_WIDTH = 2,
        STATEFUL_CMP_CONST_WIDTH = 6,
        STATEFUL_CMP_CONST_MASK = 0x3f,
        STATEFUL_CMP_CONST_MIN = -32,
        STATEFUL_CMP_CONST_MAX = 31,
        STATEFUL_TMATCH_UNITS = 2,
        STATEFUL_OUTPUT_UNITS = 4,
        STATEFUL_PRED_MASK = (1U << (1 << STATEFUL_CMP_UNITS)) - 1,
        STATEFUL_REGFILE_ROWS = 4,
        STATEFUL_REGFILE_CONST_WIDTH = 34,
        SUPPORT_ALWAYS_RUN = 1,
        HAS_MPR = 1,
        SUPPORT_CONCURRENT_STAGE_DEP = 0,
        SUPPORT_OVERFLOW_BUS = 0,
        SUPPORT_SALU_FAST_CLEAR = 1,
        STATEFUL_ALU_ADDR_WIDTH = 2,
        STATEFUL_ALU_CONST_WIDTH = 4,
        STATEFUL_ALU_CONST_MASK = 0xf,
        STATEFUL_ALU_CONST_MIN = -8,  // TODO Is the same as the following one?
        STATEFUL_ALU_CONST_MAX = 7,
        MINIMUM_INSTR_CONSTANT = -4,  // TODO
        NUM_PARSERS = 36,
        NUM_PIPES = 4,
        TABLES_REQUIRE_ROW = 1,
        SYNTH2PORT_NEED_MAPRAMS = true,
        TCAM_EXTRA_NIBBLE = true,
        TCAM_GLOBAL_ACCESS = false,
        TCAM_MATCH_BUSSES = 2,
        TCAM_MEMORY_FULL_WIDTH = 47,
        TCAM_ROWS = 12,
        TCAM_UNITS_PER_ROW = 2,
        TCAM_XBAR_GROUPS = 12,
    };
    // Tofino2 instruction-constant encoding: 11-bit payload with marker bits.
    static int encodeConst(int src) { return (src >> 11 << 16) | (0x8 << 11) | (src & 0x7ff); }
    TARGET_SPECIFIC_CLASSES
    REGISTER_SET_SPECIFIC_CLASSES
};
void declare_registers(const Target::JBay::top_level_regs *regs);
void undeclare_registers(const Target::JBay::top_level_regs *regs);
void declare_registers(const Target::JBay::parser_regs *regs);
void undeclare_registers(const Target::JBay::parser_regs *regs);
void declare_registers(const Target::JBay::mau_regs *regs, bool ignore, int stage);
void declare_registers(const Target::JBay::deparser_regs *regs);

// Tofino2 variants: identical to JBay except for stage counts (the enum
// entries here shadow the base-class values via the per-target dispatch).
class Target::Tofino2H : public Target::JBay {
 public:
    static constexpr const char *const name = "tofino2h";
    static constexpr target_t tag = TOFINO2H;
    typedef Target::Tofino2H target_type;
    class Phv;
    enum {
        NUM_MAU_STAGES_PRIVATE = 6,
        NUM_EGRESS_STAGES_PRIVATE = NUM_MAU_STAGES_PRIVATE,
        OUTPUT_STAGE_EXTENSION_PRIVATE = 1,
    };
    TARGET_SPECIFIC_CLASSES
};

class Target::Tofino2M : public Target::JBay {
 public:
    static constexpr const char *const name = "tofino2m";
    static constexpr target_t tag = TOFINO2M;
    typedef Target::Tofino2M target_type;
    class Phv;
    enum {
        NUM_MAU_STAGES_PRIVATE = 12,
        NUM_EGRESS_STAGES_PRIVATE = NUM_MAU_STAGES_PRIVATE,
        OUTPUT_STAGE_EXTENSION_PRIVATE = 1,
    };
    TARGET_SPECIFIC_CLASSES
};

class Target::Tofino2U : public Target::JBay {
 public:
    static constexpr const char *const name = "tofino2u";
    static constexpr target_t tag = TOFINO2U;
    typedef Target::Tofino2U target_type;
    class Phv;
    enum {
        NUM_MAU_STAGES_PRIVATE = 20,
        NUM_EGRESS_STAGES_PRIVATE = NUM_MAU_STAGES_PRIVATE,
    };
    TARGET_SPECIFIC_CLASSES
};

class Target::Tofino2A0 : public Target::JBay {
 public:
    static constexpr const char *const name = "tofino2a0";
    static constexpr target_t tag = TOFINO2A0;
    typedef Target::Tofino2A0 target_type;
    class Phv;
    enum {
        NUM_MAU_STAGES_PRIVATE = 20,
        NUM_EGRESS_STAGES_PRIVATE = NUM_MAU_STAGES_PRIVATE,
    };
    TARGET_SPECIFIC_CLASSES
};

void emit_parser_registers(const Target::JBay::top_level_regs *regs, std::ostream &);

/** Macro to buid a switch table switching on a target_t, expanding to the same
 * code for each target, with TARGET being a typedef for the target type */
#define SWITCH_FOREACH_TARGET(VAR, ...)                      \
    switch (VAR) {                                           \
        FOR_ALL_TARGETS(DO_SWITCH_FOREACH_TARGET, __VA_ARGS__) \
        default:                                             \
            BUG("invalid target");                           \
    }

#define DO_SWITCH_FOREACH_TARGET(TARGET_, ...) \
    case Target::TARGET_::tag: {               \
        typedef Target::TARGET_ TARGET;        \
        __VA_ARGS__                            \
        break;                                 \
    }

#define SWITCH_FOREACH_REGISTER_SET(VAR, ...)                             \
    switch (VAR) {                                                        \
        FOR_ALL_REGISTER_SETS(DO_SWITCH_FOREACH_REGISTER_SET, __VA_ARGS__) \
        default:                                                          \
            BUG("invalid target");                                        \
    }

#define DO_SWITCH_FOREACH_REGISTER_SET(REGS_, ...) \
    TARGETS_USING_REGS(REGS_, CASE_FOR_TARGET) {   \
        typedef Target::REGS_ TARGET;              \
        __VA_ARGS__                                \
        break;                                     \
    }

#define SWITCH_FOREACH_TARGET_CLASS(VAR, ...)                               \
    switch (VAR) {                                                          \
        FOR_ALL_TARGET_CLASSES(DO_SWITCH_FOREACH_TARGET_CLASS, __VA_ARGS__) \
        default:                                                            \
            BUG("invalid target");                                          \
    }

#define DO_SWITCH_FOREACH_TARGET_CLASS(CLASS_, ...) \
    TARGETS_IN_CLASS(CLASS_, CASE_FOR_TARGET) {     \
        typedef Target::CLASS_ TARGET;              \
        __VA_ARGS__                                 \
        break;                                      \
    }

#define CASE_FOR_TARGET(TARGET) case Target::TARGET::tag:

/* macro to define a function that overloads over a GROUP of types -- will declare all the
 * functions that overload on a Target::type argument and a 'generic' overload that calls
 * the right specific overload based on options.target
 * GROUP can be one of
 *   TARGET -- overload on all the different targets
 *   REGISTER_SET -- overload just on the register sets (targets that share a register
 *          set will only have one overload)
 *   TARGET_CLASS -- overload based on the CLASS
 * RTYPE NAME ARGDECL together make the declaration of the (generic) function, the overloads
 * will all have a Target::type argument prepended.  The final ARGS argument is the argument
 * list that that will be forwarded (basically ARGDECL without the types)
 */
#define DECL_OVERLOAD_FUNC(TARGET, RTYPE, NAME, ARGDECL, ARGS) \
    RTYPE NAME(Target::TARGET EXPAND_COMMA_CLOSE ARGDECL;
#define OVERLOAD_FUNC_FOREACH(GROUP, RTYPE, NAME, ARGDECL, ARGS, ...)                  \
    FOR_EACH_##GROUP(DECL_OVERLOAD_FUNC, RTYPE, NAME, ARGDECL, ARGS)                   \
    RTYPE NAME ARGDECL __VA_ARGS__ {                                                   \
        SWITCH_FOREACH_##GROUP(options.target, return NAME(TARGET() EXPAND_COMMA ARGS);) \
    }

#endif /* TARGET_H_ */
diff --git a/backends/tofino/bf-asm/ternary_match.cpp b/backends/tofino/bf-asm/ternary_match.cpp
new file mode 100644
index 00000000000..688c0e3a869
--- /dev/null
+++ b/backends/tofino/bf-asm/ternary_match.cpp
@@ -0,0 +1,1226 @@
/**
 * Copyright (C) 2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "tofino/ternary_match.h" + +#include "action_bus.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "input_xbar.h" +#include "instruction.h" +#include "lib/algorithm.h" +#include "lib/range.h" +#include "misc.h" + +Table::Format::Field *TernaryMatchTable::lookup_field(const std::string &n, + const std::string &act) const { + auto *rv = format ? format->field(n) : nullptr; + if (!rv && gateway) rv = gateway->lookup_field(n, act); + if (!rv && indirect) rv = indirect->lookup_field(n, act); + if (!rv && !act.empty()) { + if (auto call = get_action()) { + rv = call->lookup_field(n, act); + } + } + return rv; +} + +Table::Format::Field *TernaryIndirectTable::lookup_field(const std::string &n, + const std::string &act) const { + auto *rv = format ? format->field(n) : nullptr; + if (!rv && !act.empty()) { + if (auto call = get_action()) rv = call->lookup_field(n, act); + } + return rv; +} + +void TernaryMatchTable::vpn_params(int &width, int &depth, int &period, + const char *&period_name) const { + if ((width = match.size()) == 0) { + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + width = input_xbar[0]->tcam_width(); + } + depth = width ? 
layout_size() / width : 0; + period = 1; + period_name = 0; +} + +void TernaryMatchTable::alloc_vpns() { + if (no_vpns || layout.size() == 0 || layout[0].vpns.size() > 0) return; + int period, width, depth; + const char *period_name; + vpn_params(width, depth, period, period_name); + if (width == 0) return; + std::vector rows; + std::set> stage_cols; + for (auto &r : layout) { + for (auto &mem : r.memunits) stage_cols.emplace(mem.stage, mem.col); + rows.push_back(&r); + r.vpns.resize(r.memunits.size()); + } + std::sort(rows.begin(), rows.end(), + [](Layout *const &a, Layout *const &b) -> bool { return a->row < b->row; }); + int vpn = 0; + for (auto [stage, col] : stage_cols) { + for (auto *r : rows) { + unsigned idx = find(r->memunits, MemUnit(stage, r->row, col)) - r->memunits.begin(); + if (idx < r->vpns.size()) r->vpns[idx] = vpn++ / width; + } + if (vpn % width != 0) + error(layout[0].lineno, + "%d-wide ternary match must use a multiple of %d tcams " + "in each column", + width, width); + } +} + +TernaryMatchTable::Match::Match(const value_t &v) : lineno(v.lineno) { + if (v.type == tVEC) { + if (v.vec.size < 2 || v.vec.size > 3) { + error(v.lineno, "Syntax error"); + return; + } + if (!CHECKTYPE(v[0], tINT) || !CHECKTYPE(v[v.vec.size - 1], tINT)) return; + if ((word_group = v[0].i) < 0 || v[0].i >= Target::TCAM_XBAR_GROUPS()) + error(v[0].lineno, "Invalid input xbar group %" PRId64, v[0].i); + if (Target::TCAM_EXTRA_NIBBLE() && v.vec.size == 3 && CHECKTYPE(v[1], tINT)) { + if ((byte_group = v[1].i) < 0 || v[1].i >= Target::TCAM_XBAR_GROUPS() / 2) + error(v[1].lineno, "Invalid input xbar group %" PRId64, v[1].i); + } else { + byte_group = -1; + } + if ((byte_config = v[v.vec.size - 1].i) < 0 || byte_config >= 4) + error(v[v.vec.size - 1].lineno, "Invalid input xbar byte control %d", byte_config); + } else if (CHECKTYPE(v, tMAP)) { + for (auto &kv : MapIterChecked(v.map)) { + if (kv.key == "group") { + if (kv.value.type != tINT || kv.value.i < 0 || + kv.value.i 
>= Target::TCAM_XBAR_GROUPS()) + error(kv.value.lineno, "Invalid input xbar group %s", value_desc(kv.value)); + else + word_group = kv.value.i; + } else if (Target::TCAM_EXTRA_NIBBLE() && kv.key == "byte_group") { + if (kv.value.type != tINT || kv.value.i < 0 || + kv.value.i >= Target::TCAM_XBAR_GROUPS() / 2) + error(kv.value.lineno, "Invalid input xbar group %s", value_desc(kv.value)); + else + byte_group = kv.value.i; + } else if (Target::TCAM_EXTRA_NIBBLE() && kv.key == "byte_config") { + if (kv.value.type != tINT || kv.value.i < 0 || kv.value.i >= 4) + error(kv.value.lineno, "Invalid byte group config %s", value_desc(kv.value)); + else + byte_config = kv.value.i; + } else if (kv.key == "dirtcam") { + if (kv.value.type != tINT || kv.value.i < 0 || kv.value.i > 0xfff) + error(kv.value.lineno, "Invalid dirtcam mode %s", value_desc(kv.value)); + else + dirtcam = kv.value.i; + } else { + error(kv.key.lineno, "Unknown key '%s' in ternary match spec", value_desc(kv.key)); + } + } + } +} + +void TernaryMatchTable::setup(VECTOR(pair_t) & data) { + tcam_id = -1; + indirect_bus = -1; + common_init_setup(data, true, P4Table::MatchEntry); + if (input_xbar.empty()) input_xbar.emplace_back(InputXbar::create(this)); + if (auto *m = get(data, "match")) { + if (CHECKTYPE2(*m, tVEC, tMAP)) { + if (m->type == tVEC) + for (auto &v : m->vec) match.emplace_back(v); + else + match.emplace_back(*m); + } + } + for (auto &kv : MapIterChecked(data, {"meter", "stats", "stateful"})) { + if (common_setup(kv, data, P4Table::MatchEntry)) { + } else if (kv.key == "match") { + /* done above to be done before vpns */ + } else if (kv.key == "indirect") { + setup_indirect(kv.value); + } else if (kv.key == "indirect_bus") { + if (CHECKTYPE(kv.value, tINT)) { + if (kv.value.i < 0 || kv.value.i >= 16) { + error(kv.value.lineno, "Invalid ternary indirect bus number"); + } else { + indirect_bus = kv.value.i; + if (auto *old = + stage->tcam_indirect_bus_use[indirect_bus / 2][indirect_bus & 1]) + 
error(kv.value.lineno, "Indirect bus %d already in use by table %s", + indirect_bus, old->name()); + } + } + } else if (kv.key == "tcam_id") { + if (CHECKTYPE(kv.value, tINT)) { + if ((tcam_id = kv.value.i) < 0 || tcam_id >= TCAM_TABLES_PER_STAGE) + error(kv.key.lineno, "Invalid tcam_id %d", tcam_id); + else if (stage->tcam_id_use[tcam_id]) + error(kv.key.lineno, "Tcam id %d already in use by table %s", tcam_id, + stage->tcam_id_use[tcam_id]->name()); + else + stage->tcam_id_use[tcam_id] = this; + physical_ids[tcam_id] = 1; + } + } else { + warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key), + name()); + } + } + if (Target::TCAM_GLOBAL_ACCESS()) + alloc_global_tcams(); + else + alloc_rams(false, stage->tcam_use, &stage->tcam_match_bus_use); + check_tcam_match_bus(layout); + if (indirect_bus >= 0) { + stage->tcam_indirect_bus_use[indirect_bus / 2][indirect_bus & 1] = this; + } + if (indirect.set()) { + if (indirect_bus >= 0) + error(lineno, "Table %s has both ternary indirect table and explicit indirect bus", + name()); + if (!attached.stats.empty() || !attached.meters.empty() || !attached.statefuls.empty()) + error(lineno, + "Table %s has ternary indirect table and directly attached stats/meters" + " -- move them to indirect table", + name()); + } else if (!action.set() && !actions) { + error(lineno, "Table %s has no indirect, action table or immediate actions", name()); + } + if (action && !action_bus) action_bus = ActionBus::create(); +} + +bitvec TernaryMatchTable::compute_reachable_tables() { + MatchTable::compute_reachable_tables(); + if (indirect) reachable_tables_ |= indirect->reachable_tables(); + return reachable_tables_; +} + +void TernaryMatchTable::pass0() { + MatchTable::pass0(); + if (indirect.check() && indirect->set_match_table(this, false) != TERNARY_INDIRECT) + error(indirect.lineno, "%s is not a ternary indirect table", indirect->name()); +} + +void TernaryMatchTable::pass1() { + LOG1("### Ternary match table " << 
name() << " pass1 " << loc());
    // pass1: validate match word specs against the layout, assign search buses,
    // record byte-group usage, and compute the row-chaining masks for wide matches.
    if (action_bus) action_bus->pass1(this);
    MatchTable::pass1();
    stage->table_use[timing_thread(gress)] |= Stage::USE_TCAM;
    if (layout_size() == 0) layout.clear();
    BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name());
    // No explicit 'match' spec: synthesize one from the input xbar tcam groups.
    if (match.empty() && input_xbar[0]->tcam_width() && layout.size() != 0) {
        match.resize(input_xbar[0]->tcam_width());
        for (unsigned i = 0; i < match.size(); i++) {
            match[i].word_group = input_xbar[0]->tcam_word_group(i);
            match[i].byte_group = input_xbar[0]->tcam_byte_group(i / 2);
            match[i].byte_config = i & 1;
        }
        match.back().byte_config = 3;  // last word carries the version nibble
    }
    if (match.size() == 0) {
        if (layout.size() != 0)
            error(layout[0].lineno, "No match or input_xbar in non-empty ternary table %s", name());
    } else if (layout.size() % match.size() != 0) {
        error(layout[0].lineno, "Rows not a multiple of the match width in tables %s", name());
    } else if (layout.size() == 0) {
        error(lineno, "Empty ternary table with non-empty match");
    } else {
        // Walk rows in layout order, cycling through the match words; check that
        // each row-pair's shared byte group is consistent across tables.
        auto mg = match.begin();
        for (auto &row : layout) {
            if (!row.bus.count(Layout::SEARCH_BUS))
                row.bus[Layout::SEARCH_BUS] = row.memunits.at(0).col;
            auto bus = row.bus.at(Layout::SEARCH_BUS);
            if (mg->byte_group >= 0) {
                auto &bg_use = stage->tcam_byte_group_use[row.row / 2][bus];
                if (bg_use.first) {
                    if (bg_use.second != mg->byte_group) {
                        error(mg->lineno,
                              "Conflicting tcam byte group between rows %d and %d "
                              "in col %d for table %s",
                              row.row, row.row ^ 1, bus, name());
                        if (bg_use.first != this)
                            error(bg_use.first->lineno, "...also used in table %s",
                                  bg_use.first->name());
                    }
                } else {
                    bg_use.first = this;
                    bg_use.second = mg->byte_group;
                }
            }
            if (++mg == match.end()) mg = match.begin();
        }
    }
    if (error_count > 0) return;
    // Compute per-column chain masks: rows of a wide match chain their hit signal
    // toward row 6, where the match address is produced.
    for (auto &chain_rows_col : chain_rows) chain_rows_col = 0;
    unsigned row_use = 0;
    for (auto &row : layout) row_use |= 1U << row.row;
    unsigned word = 0, wide_row_use = 0;
    int prev_row = -1;
    // NOTE(review): '<MemUnit>' reconstructed -- template args were lost in extraction.
    std::vector<MemUnit> *memunits = nullptr;
    for (auto &row : layout) {
        if (row.memunits.empty()) {
            error(row.lineno, "Empty row in ternary table %s", name());
            continue;
        }
        if (memunits) {
            // all rows of a wide match must use the same columns (in the same stage)
            if (row.memunits.size() != memunits->size())
                error(row.lineno, "Column mismatch across rows in wide tcam match");
            for (size_t i = 0; i < row.memunits.size(); ++i)
                if (row.memunits[i].stage != memunits->at(i).stage ||
                    row.memunits[i].col != memunits->at(i).col)
                    error(row.lineno, "Column mismatch across rows in wide tcam match");
        } else {
            memunits = &row.memunits;
        }
        wide_row_use |= 1U << row.row;
        if (++word == match.size()) {
            // finished one wide-match group -- its rows must be contiguous
            int top_row = floor_log2(wide_row_use);
            int bottom_row = top_row + 1 - match.size();
            if (wide_row_use + (1U << bottom_row) != 1U << (top_row + 1)) {
                error(row.lineno,
                      "Ternary match rows must be contiguous "
                      "within each group of rows in a wide match");
            } else {
                // rows chain towards row 6
                if (top_row < 6)
                    wide_row_use -= 1U << top_row;
                else if (bottom_row > 6)
                    wide_row_use -= 1U << bottom_row;
                else
                    wide_row_use -= 1U << 6;
                for (auto &memunit : *memunits) {
                    int col = memunit.col;
                    if (col < 0 || col >= TCAM_UNITS_PER_ROW)
                        error(row.lineno, "Invalid column %d in table %s", col, name());
                    else
                        chain_rows[col] |= wide_row_use;
                }
            }
            word = 0;
            memunits = nullptr;
            wide_row_use = 0;
        }
    }
    if (indirect) {
        if (hit_next.size() > 0 && indirect->hit_next.size() > 0)
            error(lineno, "Ternary Match table with both direct and indirect next tables");
        if (!indirect->p4_table) indirect->p4_table = p4_table;
        // multiple next tables require a selector field at bit 0 of the indirect format
        if (hit_next.size() > 1 || indirect->hit_next.size() > 1) {
            if (auto *next = indirect->format->field("next")) {
                if (next->bit(0) != 0)
                    error(indirect->format->lineno,
                          "ternary indirect 'next' field must be"
                          " at bit 0");
            } else if (auto *action = indirect->format->field("action")) {
                if (action->bit(0) != 0)
                    error(indirect->format->lineno,
                          "ternary indirect 'action' field must be"
                          " at bit 0 to be used as next table selector");
            } else {
                error(indirect->format->lineno, "No 'next' or 'action' field in format");
            }
        }
        if (format)
            error(format->lineno,
                  "Format unexpected in Ternary Match table %s with separate "
                  "Indirect table %s",
                  name(), indirect->name());
    } else if (format) {
        format->pass1(this);
    }
    attached.pass1(this);
    if (hit_next.size() > 2 && !indirect)
        error(lineno, "Ternary Match tables cannot directly specify more than 2 hit next tables");
}

// pass2: allocate anything not explicitly specified (logical id, indirect bus)
// and run pass2 on all sub-objects.
void TernaryMatchTable::pass2() {
    LOG1("### Ternary match table " << name() << " pass2 " << loc());
    if (logical_id < 0) choose_logical_id();
    for (auto &ixb : input_xbar) ixb->pass2();
    if (!indirect && indirect_bus < 0) {
        // no ternary indirect table -- grab any free indirect bus for the result
        for (int i = 0; i < 16; i++)
            if (!stage->tcam_indirect_bus_use[i / 2][i & 1]) {
                indirect_bus = i;
                stage->tcam_indirect_bus_use[i / 2][i & 1] = this;
                break;
            }
        if (indirect_bus < 0)
            error(lineno, "No ternary indirect bus available for table %s", name());
    }
    if (actions) actions->pass2(this);
    if (action_bus) action_bus->pass2(this);
    if (gateway) gateway->pass2();
    if (idletime) idletime->pass2();
    if (is_alpm()) {
        if (auto *acts = get_actions()) {
            for (auto act = acts->begin(); act != acts->end(); act++) {
                set_partition_action_handle(act->handle);
                if (act->p4_params_list.size() > 0) {
                    // assume first parameter is partition_field_name
                    set_partition_field_name(act->p4_params_list[0].name);
                }
            }
        }
    }
    for (auto &hd : hash_dist) hd.pass2(this);
}

void TernaryMatchTable::pass3() {
    LOG1("### Ternary match table " << name() << " pass3 " << loc());
    MatchTable::pass3();
    if (action_bus) action_bus->pass3(this);
}

extern int get_address_mau_actiondata_adr_default(unsigned log2size, bool per_flow_enable);

// Enable the ghost-thread bit for a tcam unit; Tofino1 has no ghost thread,
// so it gets an empty specialization below.
// NOTE(review): '<class REGS>' reconstructed -- template args were lost in extraction.
template <class REGS>
inline static void tcam_ghost_enable(REGS &regs, int row, int col) {
    regs.tcams.col[col].tcam_ghost_thread_en[row] = 1;
}
template <>
void
tcam_ghost_enable(Target::Tofino::mau_regs &regs, int row, int col) {}

// Record in the tcam_table_map which rows belong to this tcam table; rows that
// chain their result to another row (wide match) are excluded.
// NOTE(review): '<class REGS>' reconstructed -- template args were lost in extraction.
template <class REGS>
void TernaryMatchTable::tcam_table_map(REGS &regs, int row, int col) {
    if (tcam_id >= 0) {
        if (!((chain_rows[col] >> row) & 1))
            regs.tcams.col[col].tcam_table_map[tcam_id] |= 1U << row;
    }
}

static void set_tcam_mode_logical_table(ubits<4> &reg, int tcam_id, int logical_id) {
    reg = logical_id;
}
// TODO: Unused?
// static void set_tcam_mode_logical_table(ubits<8> &reg, int tcam_id, int logical_id) {
//     reg |= 1U << tcam_id;
// }

/**
 * Program all MAU registers for this ternary match table: per-tcam-unit mode
 * and input-xbar muxing, hit-to-logical-table mapping, and (when an indirect
 * bus is used directly) the action-address shift/mask/default registers.
 */
// NOTE(review): '<class REGS>' reconstructed -- template args were lost in extraction.
template <class REGS>
void TernaryMatchTable::write_regs_vt(REGS &regs) {
    LOG1("### Ternary match table " << name() << " write_regs " << loc());
    MatchTable::write_regs(regs, 1, indirect);
    unsigned word = 0;
    auto &merge = regs.rams.match.merge;
    for (Layout &row : layout) {
        auto vpn = row.vpns.begin();
        for (const auto &tcam : row.memunits) {
            BUG_CHECK(tcam.stage == INT_MIN && tcam.row == row.row, "bogus tcam %s in row %d",
                      tcam.desc(), row.row);
            auto &tcam_mode = regs.tcams.col[tcam.col].tcam_mode[row.row];
            // tcam_mode.tcam_data1_select = row.bus; -- no longer used
            if (options.match_compiler) tcam_mode.tcam_data1_select = tcam.col;
            tcam_mode.tcam_chain_out_enable = (chain_rows[tcam.col] >> row.row) & 1;
            if (gress == INGRESS)
                tcam_mode.tcam_ingress = 1;
            else if (gress == EGRESS)
                tcam_mode.tcam_egress = 1;
            else if (gress == GHOST)
                tcam_ghost_enable(regs, row.row, tcam.col);
            // only the last row of a chain (or an unchained row) outputs a match
            tcam_mode.tcam_match_output_enable =
                ((~chain_rows[tcam.col] | ALWAYS_ENABLE_ROW) >> row.row) & 1;
            tcam_mode.tcam_vpn = *vpn++;
            set_tcam_mode_logical_table(tcam_mode.tcam_logical_table, tcam_id, logical_id);
            // dirtcam mode: low 10 bits for data, upper bits for valid-bit handling
            tcam_mode.tcam_data_dirtcam_mode = match[word].dirtcam & 0x3ff;
            tcam_mode.tcam_vbit_dirtcam_mode = match[word].dirtcam >> 10;
            /* TODO -- always disable tcam_validbit_xbar? */
            auto &tcam_vh_xbar = regs.tcams.vh_data_xbar;
            if (options.match_compiler) {
                for (int i = 0; i < 8; i++)
                    tcam_vh_xbar.tcam_validbit_xbar_ctl[tcam.col][row.row / 2][i] |= 15;
            }
            auto &halfbyte_mux_ctl = tcam_vh_xbar.tcam_row_halfbyte_mux_ctl[tcam.col][row.row];
            halfbyte_mux_ctl.tcam_row_halfbyte_mux_ctl_select = match[word].byte_config;
            halfbyte_mux_ctl.tcam_row_halfbyte_mux_ctl_enable = 1;
            halfbyte_mux_ctl.tcam_row_search_thread = timing_thread(gress);
            if (match[word].word_group >= 0)
                setup_muxctl(tcam_vh_xbar.tcam_row_output_ctl[tcam.col][row.row],
                             match[word].word_group);
            if (match[word].byte_group >= 0)
                setup_muxctl(tcam_vh_xbar.tcam_extra_byte_ctl[tcam.col][row.row / 2],
                             match[word].byte_group);
            tcam_table_map(regs, row.row, tcam.col);
        }
        if (++word == match.size()) word = 0;
    }
    if (tcam_id >= 0)
        setup_muxctl(merge.tcam_hit_to_logical_table_ixbar_outputmap[tcam_id], logical_id);
    if (tcam_id >= 0) {
        if (stage->table_use[timing_thread(gress)] & Stage::USE_TCAM)
            merge.tcam_table_prop[tcam_id].tcam_piped = 1;
        merge.tcam_table_prop[tcam_id].thread = timing_thread(gress);
        merge.tcam_table_prop[tcam_id].enabled = 1;
        regs.tcams.tcam_output_table_thread[tcam_id] = 1 << timing_thread(gress);
    }
    if (indirect_bus >= 0) {
        /* FIXME -- factor into corresponding code in MatchTable::write_regs */
        setup_muxctl(merge.match_to_logical_table_ixbar_outputmap[1][indirect_bus], logical_id);
        setup_muxctl(merge.match_to_logical_table_ixbar_outputmap[3][indirect_bus], logical_id);
        if (tcam_id >= 0) {
            setup_muxctl(merge.tcam_match_adr_to_physical_oxbar_outputmap[indirect_bus], tcam_id);
        }
        if (action) {
            /* FIXME -- factor with TernaryIndirect code below */
            // NOTE(review): '<ActionTable>' reconstructed -- template args lost in extraction.
            if (auto adt = action->to<ActionTable>()) {
                merge.mau_actiondata_adr_default[1][indirect_bus] = adt->determine_default(action);
                merge.mau_actiondata_adr_mask[1][indirect_bus] = adt->determine_mask(action);
                merge.mau_actiondata_adr_vpn_shiftcount[1][indirect_bus] =
                    adt->determine_vpn_shiftcount(action);
                merge.mau_actiondata_adr_tcam_shiftcount[indirect_bus] =
                    adt->determine_shiftcount(action, 0, 0, 0);
            }
        }
        attached.write_tcam_merge_regs(regs, this, indirect_bus, 0);
        merge.tind_bus_prop[indirect_bus].tcam_piped = 1;
        merge.tind_bus_prop[indirect_bus].thread = timing_thread(gress);
        merge.tind_bus_prop[indirect_bus].enabled = 1;
        if (idletime)
            merge.mau_idletime_adr_tcam_shiftcount[indirect_bus] = idletime->direct_shiftcount();
    }
    if (actions) actions->write_regs(regs, this);
    if (gateway) gateway->write_regs(regs);
    if (idletime) idletime->write_regs(regs);
    for (auto &hd : hash_dist) hd.write_regs(regs, this);
    merge.exact_match_logical_result_delay |= 1 << logical_id;
    regs.cfg_regs.mau_cfg_movereg_tcam_only |= 1U << logical_id;

    // FIXME -- this is wrong; when should we use the actionbit?  glass never does any more?
    // if (hit_next.size() > 1 && !indirect)
    //     merge.next_table_tcam_actionbit_map_en |= 1 << logical_id;
    // if (!indirect)
    //     merge.mau_action_instruction_adr_tcam_actionbit_map_en |= 1 << logical_id;
}

/**
 * Emit the context-json "memory_resource_allocation" node for this table's
 * tcams: memory units grouped into wide words with their vpns.
 */
// NOTE(review): '<json::map>' / '<Layout>' reconstructed -- template args lost in extraction.
std::unique_ptr<json::map> TernaryMatchTable::gen_memory_resource_allocation_tbl_cfg(
    const char *type, const std::vector<Layout> &layout, bool skip_spare_bank) const {
    if (layout.size() == 0) return nullptr;
    BUG_CHECK(!skip_spare_bank);  // never spares in tcam
    json::map mra{{"memory_type", json::string(type)}};
    json::vector &mem_units_and_vpns = mra["memory_units_and_vpns"];
    json::vector mem_units;
    unsigned word = 0;
    bool done = false;
    unsigned lrow = 0;
    // iterate column-major so the units of one wide word are grouped together
    for (auto colnum = 0U; !done; colnum++) {
        done = true;
        for (auto &row : layout) {
            if (colnum >= row.memunits.size()) continue;
            auto mu = row.memunits[colnum];
            auto vpn = row.vpns[colnum];
            mem_units.push_back(json_memunit(mu));
            lrow = json_memunit(mu);
            if (++word == match.size()) {
                mem_units_and_vpns.push_back(json::map{{"memory_units", std::move(mem_units)},
                                                       {"vpns", json::vector{json::number(vpn)}}});
mem_units = json::vector();
                word = 0;
            }
            done = false;
        }
    }
    // For keyless table, add empty vectors
    if (mem_units_and_vpns.size() == 0)
        mem_units_and_vpns.push_back(
            json::map{{"memory_units", json::vector()}, {"vpns", json::vector()}});
    mra["spare_bank_memory_unit"] = lrow;
    // NOTE(review): '<json::map>' reconstructed -- template args lost in extraction.
    return json::mkuniq<json::map>(std::move(mra));
}

/**
 * Append one context-json pack-format field entry and mark the tcam bits it
 * occupies in tcam_bits (range entries always claim a whole nibble).
 */
void TernaryMatchTable::gen_entry_cfg2(json::vector &out, std::string field_name,
                                       std::string global_name, unsigned lsb_offset,
                                       unsigned lsb_idx, unsigned msb_idx, std::string source,
                                       unsigned start_bit, unsigned field_width,
                                       bitvec &tcam_bits) const {
    json::map entry;
    entry["field_name"] = field_name;
    entry["global_name"] = global_name;
    entry["lsb_mem_word_offset"] = lsb_offset;
    entry["lsb_mem_word_idx"] = lsb_idx;
    entry["msb_mem_word_idx"] = msb_idx;
    entry["source"] = source;
    entry["start_bit"] = start_bit;
    entry["field_width"] = field_width;
    out.push_back(std::move(entry));
    // For a range with field width < nibble width, mark the entire
    // nibble in tcam_bits as used. The driver expects no overlap with other
    // format entries with the unused bits in the nibble.
    int tcam_bit_width = source == "range" ? 4 : field_width;
    tcam_bits.setrange(lsb_offset, tcam_bit_width);
}

// Attach the "range" sub-object to a pack-format entry (type 4 = 4-bit range encoding).
void TernaryMatchTable::gen_entry_range_cfg(json::map &entry, bool duplicate,
                                            unsigned nibble_offset) const {
    json::map &entry_range = entry["range"];
    entry_range["type"] = 4;
    entry_range["is_duplicate"] = duplicate;
    entry_range["nibble_offset"] = nibble_offset;
}

/**
 * Emit pack-format entries for one match field (or pvp/padding pseudo-field),
 * resolving P4 parameter names/slices and expanding range-match fields into
 * per-nibble range entries (plus their duplicates).
 */
void TernaryMatchTable::gen_entry_cfg(json::vector &out, std::string name, unsigned lsb_offset,
                                      unsigned lsb_idx, unsigned msb_idx, std::string source,
                                      unsigned start_bit, unsigned field_width, unsigned index,
                                      bitvec &tcam_bits, unsigned nibble_offset = 0) const {
    LOG3("Adding entry to Ternary Table : name: "
         << name << " lsb_offset: " << lsb_offset << " lsb_idx: " << lsb_idx
         << " msb_idx: " << msb_idx << " source: " << source << " start_bit: " << start_bit
         << " field_width: " << field_width << " index: " << index << " tcam_bits: " << tcam_bits
         << " nibble_offset: " << nibble_offset);
    std::string field_name(name);

    // If the name has a slice in it, remove it and add the lo bit of
    // the slice to field_bit. This takes the place of
    // canon_field_list(), rather than extracting the slice component
    // of the field name, if present, and appending it to the key name.
    int slice_offset = remove_name_tail_range(field_name);
    LOG4(" Field Name: " << field_name << " slice_offset: " << slice_offset);

    // Get the key name, if any.
int param_start_bit = slice_offset + start_bit;
    auto params = find_p4_params(field_name, "", param_start_bit, field_width);
    std::string global_name = "";
    if (params.size() == 0) {
        // no matching P4 parameter -- emit the field as-is
        gen_entry_cfg2(out, field_name, global_name, lsb_offset, lsb_idx, msb_idx, source,
                       param_start_bit, field_width, tcam_bits);
    } else {
        for (auto param : params) {
            if (!param) continue;
            if (!param->key_name.empty()) {
                LOG4(" Found param : " << *param);
                field_name = param->key_name;
                global_name = param->name;
            }
            // For multiple params concatenated within the field width, we only
            // chose the param width which represents the slice.
            field_width = std::min(param->bit_width, field_width);

            // For range match we need bytes to decide which nibble is being used, hence
            // split the field in bytes. For normal match entire slice can be used
            // directly.
            auto *p = find_p4_param(name, "range", param_start_bit, field_width);
            if (p) {
                int lsb_lo = lsb_offset - TCAM_MATCH_BITS_START;
                int lsb_hi = lsb_lo + field_width - 1;
                /**
                 * For each byte of range match, the range match happens over either the lower
                 * nibble or higher nibble given the encoding scheme. The nibble is transformed
                 * into an encoding over a byte. This breaks up the range over each match nibble on
                 * a byte by byte boundary, and outputs the JSON for that nibble
                 *
                 * @seealso bf-p4c/mau/table_format.cpp comments on range
                 * @seealso bf-p4c/mau/resource_estimate.cpp comments on range
                 *
                 * The range context JSON encoding is the following:
                 *     - The lsb_mem_word_offset is always the beginning of the byte (as the
                 *       encoding takes the whole byte)
                 *     - The width is the width of the field in the nibble (up to 4 bits)
                 *     - The nibble_offset is where in the nibble the key starts in the ixbar byte
                 *
                 * A "is_duplicate" nibble is provided.  The driver uses this for not double
                 * counting, maybe.  Henry and I both agree that is really doesn't make any sense
                 * and can be deleted, but remains in there now
                 */
                for (int bit = (lsb_lo / 8) * 8; bit <= (lsb_hi / 8) * 8; bit += 8) {
                    int lsb_lo_bit_in_byte = std::max(lsb_lo, bit) % 8;
                    int lsb_hi_bit_in_byte = std::min(lsb_hi, bit + 7) % 8;
                    auto dirtcam_mode = get_dirtcam_mode(index, (bit / 8));

                    if (!(DIRTCAM_4B_LO == dirtcam_mode || DIRTCAM_4B_HI == dirtcam_mode)) continue;

                    bitvec nibbles_of_range;
                    nibbles_of_range.setbit(lsb_lo_bit_in_byte / 4);
                    nibbles_of_range.setbit(lsb_hi_bit_in_byte / 4);
                    int range_start_bit = start_bit + slice_offset;
                    int range_width;
                    int nibble_offset;

                    // Determine which section of the byte based on which nibble is provided
                    // NOTE(review): 'static_cast<int>' reconstructed -- template args lost.
                    if (dirtcam_mode == DIRTCAM_4B_LO) {
                        BUG_CHECK(nibbles_of_range.getbit(0));
                        // Add the difference from the first bit of this byte and the lowest bit
                        range_start_bit += bit + lsb_lo_bit_in_byte - lsb_lo;
                        range_width =
                            std::min(static_cast<int>(field_width), 4 - lsb_lo_bit_in_byte);
                        range_width = std::min(static_cast<int>(range_width), lsb_hi - bit + 1);
                        nibble_offset = lsb_lo_bit_in_byte % 4;
                    } else {
                        BUG_CHECK(nibbles_of_range.getbit(1));
                        // Because the bit starts at the upper nibble, the start bit is either the
                        // beginning of the nibble or more
                        range_start_bit += bit + std::max(4, lsb_lo_bit_in_byte) - lsb_lo;
                        range_width =
                            std::min(static_cast<int>(field_width), lsb_hi_bit_in_byte - 3);
                        range_width = std::min(static_cast<int>(range_width),
                                               lsb_hi_bit_in_byte - lsb_lo_bit_in_byte + 1);
                        nibble_offset = std::max(4, lsb_lo_bit_in_byte) % 4;
                    }

                    // Add the range entry
                    gen_entry_cfg2(out, field_name, global_name, bit + TCAM_MATCH_BITS_START,
                                   lsb_idx, msb_idx, "range", range_start_bit, range_width,
                                   tcam_bits);
                    // NOTE(review): '->to<json::map>()' reconstructed -- template args lost.
                    auto &last_entry = out.back()->to<json::map>();
                    gen_entry_range_cfg(last_entry, false, nibble_offset);

                    // Adding the duplicate range entry
                    gen_entry_cfg2(out, field_name, global_name, bit + TCAM_MATCH_BITS_START + 4,
                                   lsb_idx, msb_idx, "range", range_start_bit, range_width,
                                   tcam_bits);
                    auto &last_entry_dup = out.back()->to<json::map>();
                    gen_entry_range_cfg(last_entry_dup, true, nibble_offset);
                }

            } else {
                gen_entry_cfg2(out, field_name, global_name, lsb_offset, lsb_idx, msb_idx, source,
                               param_start_bit, field_width, tcam_bits);
            }
            param_start_bit += field_width;
        }
    }
}

/**
 * Emit the payload / version / parity (pvp) pseudo-fields for one tcam word.
 */
void TernaryMatchTable::gen_match_fields_pvp(json::vector &match_field_list, unsigned word,
                                             bool uses_versioning, unsigned version_word_group,
                                             bitvec &tcam_bits) const {
    // Tcam bits are arranged as follows in each tcam word
    // LSB -------------------------------------MSB
    // PAYLOAD BIT - TCAM BITS - [VERSION] - PARITY
    auto start_bit = 0;      // always 0 for fields not on input xbar
    auto dirtcam_index = 0;  // not relevant for fields not on input xbar
    auto payload_name = "--tcam_payload_" + std::to_string(word) + "--";
    auto parity_name = "--tcam_parity_" + std::to_string(word) + "--";
    auto version_name = "--version--";
    gen_entry_cfg(match_field_list, payload_name, TCAM_PAYLOAD_BITS_START, word, word, "payload",
                  start_bit, TCAM_PAYLOAD_BITS, dirtcam_index, tcam_bits);
    if (uses_versioning && (version_word_group == word)) {
        gen_entry_cfg(match_field_list, version_name, TCAM_VERSION_BITS_START, word, word,
                      "version", start_bit, TCAM_VERSION_BITS, dirtcam_index, tcam_bits);
    }
    gen_entry_cfg(match_field_list, parity_name, TCAM_PARITY_BITS_START, word, word, "parity",
                  start_bit, TCAM_PARITY_BITS, dirtcam_index, tcam_bits);
}

/**
 * Emit pack-format entries for all match fields, walking the input xbar groups
 * (ternary word groups and shared mid-byte groups) and mapping each phv slice
 * to its tcam word and bit offset.
 */
// NOTE(review): '<bitvec>' reconstructed -- template args lost in extraction.
void TernaryMatchTable::gen_match_fields(json::vector &match_field_list,
                                         std::vector<bitvec> &tcam_bits) const {
    unsigned match_index = match.size() - 1;
    for (auto &ixb : input_xbar) {
        for (const auto &[field_group, field_phv] : *ixb) {
            switch (field_group.type) {
                case InputXbar::Group::EXACT:
                    continue;
                case InputXbar::Group::TERNARY: {
                    int word = match_index - match_word(field_group.index);
                    if (word < 0) continue;
                    std::string
source = "spec";
                    std::string field_name = field_phv.what.name();
                    unsigned lsb_mem_word_offset = 0;
                    if (field_phv.hi > 40) {
                        // FIXME -- no longer needed if we always convert these to Group::BYTE?
                        // a field in the (mid) byte group, which is shared with the adjacent word
                        // group each word gets only 4 bits of the byte group and is placed at msb
                        // Check mid-byte field does not cross byte boundary (40-47)
                        BUG_CHECK(field_phv.hi < 48);
                        // Check mid-byte field is associated with even group
                        // | == 5 == | == 1 == | == 5 == | == 5 == | == 1 == | == 5 == |
                        // |  Grp 0  | Midbyte0|  Grp 1  |  Grp 2  | Midbyte1|  Grp 3  |
                        BUG_CHECK((field_group.index & 1) == 0);
                        // Find groups to place this byte nibble. Check group which has this
                        // group as the byte_group
                        for (auto &m : match) {
                            if (m.byte_group * 2 == field_group.index) {
                                // Check byte_config to determine where to place the nibble
                                lsb_mem_word_offset = 1 + field_phv.lo;
                                int nibble_offset = 0;
                                int hwidth = 44 - field_phv.lo;
                                int start_bit = 0;
                                if (m.byte_config == MIDBYTE_NIBBLE_HI) {
                                    nibble_offset += 4;
                                    start_bit = hwidth;
                                    hwidth = field_phv.hi - 43;
                                }
                                int midbyte_word_group = match_index - match_word(m.word_group);
                                gen_entry_cfg(match_field_list, field_name, lsb_mem_word_offset,
                                              midbyte_word_group, midbyte_word_group, source,
                                              field_phv.what.lobit() + start_bit, hwidth,
                                              field_group.index, tcam_bits[midbyte_word_group]);
                            }
                        }
                    } else {
                        // ordinary ternary group field: offset by 1 for the payload bit
                        lsb_mem_word_offset = 1 + field_phv.lo;
                        gen_entry_cfg(match_field_list, field_name, lsb_mem_word_offset, word, word,
                                      source, field_phv.what.lobit(),
                                      field_phv.hi - field_phv.lo + 1, field_group.index,
                                      tcam_bits[word], field_phv.what->lo % 4);
                    }
                    break;
                }
                case InputXbar::Group::BYTE:
                    // The byte group represents what goes in top nibble in the tcam
                    // word. Based on the byte config, the corresponding match word is
                    // selected and the field (slice) is placed in the nibble.
                    // byte group 5: { 0: HillTop.Lamona.Whitefish(0..1) ,
                    //                 2: HillTop.RossFork.Adona(0..5) }
                    // match:
                    // - { group: 10, byte_group: 5, byte_config: 0, dirtcam: 0x555 }
                    // - { group: 11, byte_group: 5, byte_config: 1, dirtcam: 0x555 }
                    // Placement
                    // --------------------------
                    // Group 10 - Midbyte Nibble Lo
                    // --------------------------
                    // Word 1 :    41 42 43 44
                    // Whitefish :  0  1  X  X
                    // Adona :      X  X  0  1
                    // --------------------------
                    // Group 11 - Midbyte Nibble Hi
                    // --------------------------
                    // Word 0 :    41 42 43 44
                    // Whitefish :  X  X  X  X
                    // Adona :      2  3  4  5
                    // --------------------------
                    for (size_t word = 0; word < match.size(); word++) {
                        if (match[word].byte_group != field_group.index) continue;
                        auto source = "spec";
                        auto field_name = field_phv.what.name();
                        int byte_lo = field_phv.lo;
                        int field_lo = field_phv.what.lobit();
                        int width = field_phv.what.size();
                        int nibble_lo = byte_lo;
                        if (match[word].byte_config == MIDBYTE_NIBBLE_HI) {
                            if (byte_lo >= 4) {
                                // NIBBLE HI | NIBBLE LO
                                //  7 6 5 4  |  3 2 1 0
                                //    x x x  |
                                // byte_lo = 5 (start of byte)
                                nibble_lo = byte_lo - 4;  // Get nibble_lo from nibble boundary
                                // nibble_lo = 1
                            } else {
                                // NIBBLE HI | NIBBLE LO
                                //  7 6 5 4  |  3 2 1 0
                                //      x x  |  x x
                                // say field f1(3..7)
                                // field_lo = 3
                                // byte_lo = 2 (start of byte)
                                // width = 4
                                width -= 4 - byte_lo;  // Adjust width to what must
                                                       // fit in the nibble
                                if (width <= 0) continue;  // No field in nibble, skip
                                // width = 2
                                nibble_lo = 0;           // Field starts at nibble boundary
                                field_lo += 4 - byte_lo; // Adjust field lo bit to start of nibble
                                // field_lo = 5
                            }
                        } else if (match[word].byte_config == MIDBYTE_NIBBLE_LO) {
                            if (byte_lo >= 4) {
                                // NIBBLE HI | NIBBLE LO
                                //  7 6 5 4  |  3 2 1 0
                                //    x x x  |
                                // byte_lo = 5 (start of byte)
                                continue;  // No field in nibble, skip
                            } else {
                                // NIBBLE HI | NIBBLE LO
                                //  7 6 5 4  |  3 2 1 0
                                //    x x x  |  x x
                                // byte_lo = 2 (start of byte)
                                // width = 5
                                nibble_lo = byte_lo;
                                int nibble_left = 4 - nibble_lo;
                                width = (width > nibble_left) ? nibble_left : width;
                                // width = 2
                            }
                        }
                        gen_entry_cfg(match_field_list, field_name, 41 + nibble_lo,
                                      match_index - word, match_index - word, source, field_lo,
                                      width, match[word].byte_group, tcam_bits[match_index - word]);
                    }
                    break;
                default:
                    BUG("Unknown group type");
            }
        }
    }
}

/**
 * Return the json map that stage/pack config should be added under.  For ALPM
 * tables this table is the pre-classifier of a top-level algorithmic_lpm table,
 * so the pre_classifier sub-map is returned instead of the table itself.
 */
json::map &TernaryMatchTable::get_tbl_top(json::vector &out) const {
    unsigned number_entries = match.size() ? layout_size() / match.size() * 512 : 0;
    // For ALPM tables, this sets up the top level ALPM table and this ternary
    // table as its preclassifier. As the pre_classifier is always in the
    // previous stage as the atcams, this function will be called before the
    // atcam cfg generation. The atcam will check for presence of this table and
    // add the atcam cfg gen
    if (is_alpm()) {
        json::map *alpm_ptr = base_tbl_cfg(out, "match_entry", number_entries);
        json::map &alpm = *alpm_ptr;
        json::map &match_attributes = alpm["match_attributes"];
        match_attributes["match_type"] = "algorithmic_lpm";
        json::map &alpm_pre_classifier = match_attributes["pre_classifier"];
        base_alpm_pre_classifier_tbl_cfg(alpm_pre_classifier, "match_entry", number_entries);
        // top level alpm table has the same key as alpm preclassifier
        add_match_key_cfg(alpm);
        return alpm_pre_classifier;
    } else {
        return *base_tbl_cfg(out, "match_entry", number_entries);
    }
}

// Generate the full context-json table config for this ternary match table.
void TernaryMatchTable::gen_tbl_cfg(json::vector &out) const {
    unsigned number_entries = match.size() ?
layout_size() / match.size() * 512 : 0;
    json::map &tbl = get_tbl_top(out);
    bool uses_versioning = false;
    unsigned version_word_group = -1;
    unsigned match_index = match.size() - 1;
    unsigned index = 0;
    json::vector match_field_list;
    // byte_config == 3 marks the word that carries the version nibble
    for (auto &m : match) {
        if (m.byte_config == 3) {
            uses_versioning = true;
            version_word_group = match_index - index;
            break;
        }
        index++;
    }
    // Determine the zero padding necessary by creating a bitvector (for each
    // word). While creating entries for pack format set bits used. The unused
    // bits must be padded with zero field entries.
    // NOTE(review): '<bitvec>' reconstructed -- template args lost in extraction.
    std::vector<bitvec> tcam_bits(match.size());
    // Set pvp bits for each tcam word
    for (unsigned i = 0; i < match.size(); i++) {
        gen_match_fields_pvp(match_field_list, i, uses_versioning, version_word_group,
                             tcam_bits[i]);
    }
    json::map &match_attributes = tbl["match_attributes"];
    json::vector &stage_tables = match_attributes["stage_tables"];
    json::map &stage_tbl = *add_stage_tbl_cfg(match_attributes, "ternary_match", number_entries);
    // This is a only a glass required field, as it is only required when no default action
    // is specified, which is impossible for Brig through p4-16
    stage_tbl["default_next_table"] = Stage::end_of_pipe();
    json::map &pack_fmt =
        add_pack_format(stage_tbl, Target::TCAM_MEMORY_FULL_WIDTH(), match.size(), 1);
    stage_tbl["memory_resource_allocation"] =
        gen_memory_resource_allocation_tbl_cfg("tcam", layout);
    // FIXME-JSON: If the next table is modifiable then we set it to what it's mapped
    // to. Otherwise, set it to the default next table for this stage.
    // stage_tbl["default_next_table"] = Target::END_OF_PIPE();
    // FIXME: How to deal with multiple next hit tables?
    stage_tbl["default_next_table"] =
        hit_next.size() > 0 ? hit_next[0].next_table_id() : Target::END_OF_PIPE();
    add_result_physical_buses(stage_tbl);
    gen_match_fields(match_field_list, tcam_bits);

    // For keyless table, just add parity & payload bits
    if (p4_params_list.empty()) {
        tcam_bits.resize(1);
        gen_match_fields_pvp(match_field_list, 0, false, -1, tcam_bits[0]);
    }

    // tcam_bits is a vector indexed by tcam word and has all used bits set. We
    // loop through this bitvec for each word and add a zero padding entry for
    // the unused bits.
    // For ternary all unused bits must be marked as source
    // 'zero' for correctness during entry encoding.
    for (unsigned word = 0; word < match.size(); word++) {
        bitvec &pb = tcam_bits[word];
        unsigned start_bit = 0;   // always 0 for padded fields
        int dirtcam_index = -1;   // irrelevant in this context
        if (pb != bitvec(0)) {
            int idx_lo = 0;
            std::string pad_name = "--unused--";
            for (auto p : pb) {
                if (p > idx_lo) {
                    gen_entry_cfg(match_field_list, pad_name, idx_lo, word, word, "zero", start_bit,
                                  p - idx_lo, dirtcam_index, tcam_bits[word]);
                }
                idx_lo = p + 1;
            }
            auto fw = TCAM_VERSION_BITS;
            if (idx_lo < fw) {
                gen_entry_cfg(match_field_list, pad_name, idx_lo, word, word, "zero", start_bit,
                              fw - idx_lo, dirtcam_index, tcam_bits[word]);
            }
        }
    }

    pack_fmt["entries"] = json::vector{
        json::map{{"entry_number", json::number(0)}, {"fields", std::move(match_field_list)}}};
    add_all_reference_tables(tbl);
    json::map &tind = stage_tbl["ternary_indirection_stage_table"] = json::map();
    if (indirect) {
        unsigned fmt_width = 1U << indirect->format->log2size;
        // json::map tind;
        tind["stage_number"] = stage->stageno;
        tind["stage_table_type"] = "ternary_indirection";
        tind["size"] = indirect->layout_size() * 128 / fmt_width * 1024;
        indirect->add_pack_format(tind, indirect->format.get());
        tind["memory_resource_allocation"] =
            indirect->gen_memory_resource_allocation_tbl_cfg("sram", indirect->layout);
        // Add action formats for actions present in table or attached action table
        auto *acts = indirect->get_actions();
        if (acts) acts->add_action_format(this, tind);
        add_all_reference_tables(tbl, indirect);
        if (indirect->actions)
            indirect->actions->gen_tbl_cfg(tbl["actions"]);
        else if (indirect->action && indirect->action->actions)
            indirect->action->actions->gen_tbl_cfg(tbl["actions"]);
        indirect->common_tbl_cfg(tbl);
    } else {
        // FIXME: Add a fake ternary indirect table (as otherwise driver complains)
        // if tind not present - to be removed with update on driver side
        auto *acts = get_actions();
        if (acts) acts->add_action_format(this, tind);
        tind["memory_resource_allocation"] = nullptr;
        json::vector &pack_format = tind["pack_format"] = json::vector();
        json::map pack_format_entry;
        pack_format_entry["memory_word_width"] = 128;
        pack_format_entry["entries_per_table_word"] = 1;
        json::vector &entries = pack_format_entry["entries"] = json::vector();
        entries.push_back(json::map{{"entry_number", json::number(0)}, {"fields", json::vector()}});
        pack_format_entry["table_word_width"] = 0;
        pack_format_entry["number_memory_units_per_table_word"] = 0;
        pack_format.push_back(std::move(pack_format_entry));
        tind["logical_table_id"] = logical_id;
        tind["stage_number"] = stage->stageno;
        tind["stage_table_type"] = "ternary_indirection";
        tind["size"] = 0;
    }
    common_tbl_cfg(tbl);
    if (actions)
        actions->gen_tbl_cfg(tbl["actions"]);
    else if (action && action->actions)
        action->actions->gen_tbl_cfg(tbl["actions"]);
    gen_idletime_tbl_cfg(stage_tbl);
    merge_context_json(tbl, stage_tbl);
    match_attributes["match_type"] = "ternary";
}

/**
 * Parse the assembler source for a ternary indirect table: format (4..64 bits),
 * input xbar, attached stats/meters/statefuls/selector, then allocate srams
 * and a ternary indirect bus.
 */
void TernaryIndirectTable::setup(VECTOR(pair_t) & data) {
    match_table = 0;
    common_init_setup(data, true, P4Table::MatchEntry);
    if (format) {
        if (format->size > 64) error(format->lineno, "ternary indirect format larger than 64 bits");
        if (format->size < 4) {
            /* pad out to minumum size */
            format->size = 4;
format->log2size = 2;
        }
    } else {
        error(lineno, "No format specified in table %s", name());
    }
    for (auto &kv : MapIterChecked(data, {"meter", "stats", "stateful"})) {
        if (common_setup(kv, data, P4Table::MatchEntry)) {
        } else if (kv.key == "input_xbar") {
            if (CHECKTYPE(kv.value, tMAP))
                input_xbar.emplace_back(InputXbar::create(this, false, kv.key, kv.value.map));
        } else if (kv.key == "hash_dist") {
            /* parsed in common_init_setup */
        } else if (kv.key == "selector") {
            attached.selector.setup(kv.value, this);
        } else if (kv.key == "selector_length") {
            attached.selector_length.setup(kv.value, this);
        } else if (kv.key == "meter_color") {
            attached.meter_color.setup(kv.value, this);
        } else if (kv.key == "stats") {
            if (kv.value.type == tVEC)
                for (auto &v : kv.value.vec) attached.stats.emplace_back(v, this);
            else
                attached.stats.emplace_back(kv.value, this);
        } else if (kv.key == "meter") {
            if (kv.value.type == tVEC)
                for (auto &v : kv.value.vec) attached.meters.emplace_back(v, this);
            else
                attached.meters.emplace_back(kv.value, this);
        } else if (kv.key == "stateful") {
            if (kv.value.type == tVEC)
                for (auto &v : kv.value.vec) attached.statefuls.emplace_back(v, this);
            else
                attached.statefuls.emplace_back(kv.value, this);
        } else {
            warning(kv.key.lineno, "ignoring unknown item %s in table %s", value_desc(kv.key),
                    name());
        }
    }
    if (Target::SRAM_GLOBAL_ACCESS())
        alloc_global_srams();
    else
        alloc_rams(false, stage->sram_use, &stage->tcam_indirect_bus_use, Layout::TIND_BUS);
    if (!action.set() && !actions)
        error(lineno, "Table %s has neither action table nor immediate actions", name());
    if (actions && !action_bus) action_bus = ActionBus::create();
}

// Link this indirect table to its (ternary) match table; inherits the match
// table's logical id and p4_table.  Always returns TERNARY_INDIRECT.
Table::table_type_t TernaryIndirectTable::set_match_table(MatchTable *m, bool indirect) {
    if (match_table) {
        error(lineno, "Multiple references to ternary indirect table %s", name());
        // NOTE(review): '<TernaryMatchTable *>' reconstructed -- template args lost in extraction.
    } else if (!(match_table = dynamic_cast<TernaryMatchTable *>(m))) {
        error(lineno, "Trying to link ternary indirect table %s to non-ternary table %s", name(),
              m->name());
    } else {
        if (action.check() && action->set_match_table(m, !action.is_direct_call()) != ACTION)
            error(action.lineno, "%s is not an action table", action->name());
        attached.pass0(m);
        logical_id = m->logical_id;
        p4_table = m->p4_table;
    }
    return TERNARY_INDIRECT;
}

bitvec TernaryIndirectTable::compute_reachable_tables() {
    Table::compute_reachable_tables();
    if (match_table) reachable_tables_ |= match_table->reachable_tables();
    reachable_tables_ |= attached.compute_reachable_tables();
    return reachable_tables_;
}

void TernaryIndirectTable::pass1() {
    LOG1("### Ternary indirect table " << name() << " pass1");
    determine_word_and_result_bus();
    Table::pass1();
    if (action_enable >= 0)
        if (action.args.size() < 1 || action.args[0].size() <= (unsigned)action_enable)
            error(lineno, "Action enable bit %d out of range for action selector", action_enable);
    if (format) format->pass1(this);
    for (auto &hd : hash_dist) {
        hd.pass1(this, HashDistribution::OTHER, false);
    }
}

/**
 * The bus by definition for ternary indirect is the result bus, and all TernaryIndirect tables
 * are at most 64 bits, meaning that all their words are equal to 0.
 */
void TernaryIndirectTable::determine_word_and_result_bus() {
    for (auto &row : layout) {
        row.word = 0;
    }
}

void TernaryIndirectTable::pass2() {
    LOG1("### Ternary indirect table " << name() << " pass2");
    if (logical_id < 0 && match_table) logical_id = match_table->logical_id;
    if (!match_table) error(lineno, "No match table for ternary indirect table %s", name());
    if (actions) actions->pass2(this);
    if (action_bus) action_bus->pass2(this);
    if (format) format->pass2(this);
}

void TernaryIndirectTable::pass3() {
    LOG1("### Ternary indirect table " << name() << " pass3");
    if (action_bus) action_bus->pass3(this);
}

// Program the unit rams and ternary-indirect result bus for this table.
// NOTE(review): '<class REGS>' reconstructed -- template args were lost in extraction.
template <class REGS>
void TernaryIndirectTable::write_regs_vt(REGS &regs) {
    LOG1("### Ternary indirect table " << name() << " write_regs");
    int tcam_id = match_table->tcam_id;
    int tcam_shift = format->log2size - 2;
    if (tcam_id >= 0) regs.tcams.tcam_match_adr_shift[tcam_id] = tcam_shift;
    auto &merge = regs.rams.match.merge;
    for (Layout &row : layout) {
        int bus = row.bus.at(Layout::TIND_BUS);
        auto vpn = row.vpns.begin();
        auto &ram_row = regs.rams.array.row[row.row];
        for (auto &memunit : row.memunits) {
            int col = memunit.col;
            BUG_CHECK(memunit.stage == INT_MIN && memunit.row == row.row, "bogus %s in row %d",
                      memunit.desc(), row.row);
            auto &unit_ram_ctl = ram_row.ram[col].unit_ram_ctl;
            unit_ram_ctl.match_ram_write_data_mux_select = 7; /* disable */
            unit_ram_ctl.match_ram_read_data_mux_select = 7;  /* disable */
            unit_ram_ctl.tind_result_bus_select = 1U << bus;
            auto &mux_ctl =
                regs.rams.map_alu.row[row.row].adrmux.ram_address_mux_ctl[col / 6][col % 6];
            mux_ctl.ram_unitram_adr_mux_select = bus + 2;
            auto &unitram_config =
                regs.rams.map_alu.row[row.row].adrmux.unitram_config[col / 6][col % 6];
            unitram_config.unitram_type = 6;
            unitram_config.unitram_vpn = *vpn++;
            unitram_config.unitram_logical_table = logical_id;
            if (gress == INGRESS || gress == GHOST)
                unitram_config.unitram_ingress = 1;
            else
+ unitram_config.unitram_egress = 1; + unitram_config.unitram_enable = 1; + auto &xbar_ctl = + regs.rams.map_alu.row[row.row].vh_xbars.adr_dist_tind_adr_xbar_ctl[bus]; + if (tcam_id >= 0) setup_muxctl(xbar_ctl, tcam_id); + if (gress == EGRESS) + regs.cfg_regs.mau_cfg_uram_thread[col / 4U] |= 1U << (col % 4U * 8U + row.row); + ram_row.tind_ecc_error_uram_ctl[timing_thread(gress)] |= 1 << (col - 2); + } + int r_bus = row.row * 2 + bus; + merge.tind_ram_data_size[r_bus] = format->log2size - 1; + if (tcam_id >= 0) + setup_muxctl(merge.tcam_match_adr_to_physical_oxbar_outputmap[r_bus], tcam_id); + merge.tind_bus_prop[r_bus].tcam_piped = 1; + merge.tind_bus_prop[r_bus].thread = timing_thread(gress); + merge.tind_bus_prop[r_bus].enabled = 1; + if (instruction) { + int shiftcount = 0; + if (auto field = instruction.args[0].field()) + shiftcount = field->bit(0); + else if (auto field = instruction.args[1].field()) + shiftcount = field->immed_bit(0); + merge.mau_action_instruction_adr_tcam_shiftcount[r_bus] = shiftcount; + } + if (format->immed) merge.mau_immediate_data_tcam_shiftcount[r_bus] = format->immed->bit(0); + if (action) { + if (auto adt = action->to()) { + merge.mau_actiondata_adr_default[1][r_bus] = adt->determine_default(action); + merge.mau_actiondata_adr_mask[1][r_bus] = adt->determine_mask(action); + merge.mau_actiondata_adr_vpn_shiftcount[1][r_bus] = + adt->determine_vpn_shiftcount(action); + merge.mau_actiondata_adr_tcam_shiftcount[r_bus] = + adt->determine_shiftcount(action, 0, 0, tcam_shift); + } + } + if (attached.selector) { + auto sel = get_selector(); + merge.mau_meter_adr_tcam_shiftcount[r_bus] = + sel->determine_shiftcount(attached.selector, 0, 0, format->log2size - 2); + merge.mau_selectorlength_shiftcount[1][r_bus] = + sel->determine_length_shiftcount(attached.selector_length, 0, 0); + merge.mau_selectorlength_mask[1][r_bus] = + sel->determine_length_mask(attached.selector_length); + merge.mau_selectorlength_default[1][r_bus] = + 
sel->determine_length_default(attached.selector_length); + } + if (match_table->idletime) + merge.mau_idletime_adr_tcam_shiftcount[r_bus] = + 66 + format->log2size - match_table->idletime->precision_shift(); + attached.write_tcam_merge_regs(regs, match_table, r_bus, tcam_shift); + } + if (actions) actions->write_regs(regs, this); + for (auto &hd : hash_dist) hd.write_regs(regs, this); +} + +void TernaryIndirectTable::gen_tbl_cfg(json::vector &out) const {} + +void TernaryMatchTable::add_result_physical_buses(json::map &stage_tbl) const { + json::vector &result_physical_buses = stage_tbl["result_physical_buses"] = json::vector(); + if (indirect) { + for (auto l : indirect->layout) { + if (l.bus.count(Layout::TIND_BUS)) { + result_physical_buses.push_back(l.row * 2 + l.bus.at(Layout::TIND_BUS)); + } + } + } else { + result_physical_buses.push_back(indirect_bus); + } +} + +DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(TernaryMatchTable, TARGET_CLASS) +DEFINE_TABLE_TYPE_WITH_SPECIALIZATION(TernaryIndirectTable, TARGET_CLASS) diff --git a/backends/tofino/bf-asm/tofino/CMakeLists.txt b/backends/tofino/bf-asm/tofino/CMakeLists.txt new file mode 100644 index 00000000000..e1d922b3e7e --- /dev/null +++ b/backends/tofino/bf-asm/tofino/CMakeLists.txt @@ -0,0 +1,60 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +# +# SPDX-License-Identifier: Apache-2.0 + +set (GEN_TOFINO + memories.pipe_addrmap + memories.pipe_top_level + memories.prsr_mem_main_rspec + regs.dprsr_hdr + regs.dprsr_inp + regs.ebp_rspec + regs.ibp_rspec + regs.mau_addrmap + regs.pipe_addrmap + regs.prsr_reg_merge_rspec + regs.tofino + ) + +foreach(f IN LISTS GEN_TOFINO) + list (APPEND GEN_TOFINO_SRCS ${BFASM_BINARY_DIR}/gen/tofino/${f}.cpp) + list (APPEND GEN_TOFINO_HDRS ${BFASM_BINARY_DIR}/gen/tofino/${f}.h) +endforeach() + +set_source_files_properties(${GEN_TOFINO_SRCS} ${GEN_TOFINO_HDRS} PROPERTIES GENERATED TRUE) + +add_custom_command(OUTPUT ${GEN_TOFINO_HDRS} ${GEN_TOFINO_SRCS} + COMMAND ${BFASM_WALLE} --schema chip.schema --generate-cpp template_objects.yaml -o ${BFASM_BINARY_DIR}/gen/tofino + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS template_objects.yaml chip.schema ${WALLE_SOURCES} + COMMENT "Generating cpp code for tofino from tofino/chip.schema") + +set (BFAS_TOFINO_SRCS + tofino/exact_match.cpp + tofino/gateway.cpp + tofino/input_xbar.cpp + tofino/parser.cpp + tofino/sram_match.cpp + tofino/stateful.cpp + tofino/ternary_match.cpp + PARENT_SCOPE + ) + + +add_library (regs_tofino ${GEN_TOFINO_SRCS}) +target_link_libraries (regs_tofino p4ctoolkit) +# Disable errors for warnings. FIXME: Get rid of this. +target_compile_options(regs_tofino PUBLIC -Wno-error -Wno-unused-parameter -Wno-unused-variable -Wno-type-limits -Wno-sign-compare) diff --git a/backends/tofino/bf-asm/tofino/action_table.h b/backends/tofino/bf-asm/tofino/action_table.h new file mode 100644 index 00000000000..1c8c39ac142 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/action_table.h @@ -0,0 +1,29 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_ACTION_TABLE_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_ACTION_TABLE_H_ + +#include "backends/tofino/bf-asm/tables.h" + +class Target::Tofino::ActionTable : public ::ActionTable { + friend class ::ActionTable; + ActionTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::ActionTable(line, n, gr, s, lid) {} +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_ACTION_TABLE_H_ */ diff --git a/backends/tofino/bf-asm/tofino/chip.schema b/backends/tofino/bf-asm/tofino/chip.schema new file mode 100644 index 00000000000..c99cc1a9d88 Binary files /dev/null and b/backends/tofino/bf-asm/tofino/chip.schema differ diff --git a/backends/tofino/bf-asm/tofino/counter.h b/backends/tofino/bf-asm/tofino/counter.h new file mode 100644 index 00000000000..484b5f49c52 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/counter.h @@ -0,0 +1,39 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_COUNTER_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_COUNTER_H_ + +#include "backends/tofino/bf-asm/tables.h" + +class Target::Tofino::CounterTable : public ::CounterTable { + friend class ::CounterTable; + CounterTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::CounterTable(line, n, gr, s, lid) {} +}; + +template <> +void CounterTable::setup_teop_regs(Target::Tofino::mau_regs &, int) { + BUG(); // no teop on tofino +} + +template <> +void CounterTable::write_alu_vpn_range(Target::Tofino::mau_regs &) { + BUG(); // not available on tofino +} + +#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_COUNTER_H_ */ diff --git a/backends/tofino/bf-asm/tofino/deparser.cpp b/backends/tofino/bf-asm/tofino/deparser.cpp new file mode 100644 index 00000000000..fa7cefa4f6a --- /dev/null +++ b/backends/tofino/bf-asm/tofino/deparser.cpp @@ -0,0 +1,926 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* deparser template specializations for tofino -- #included directly in top-level deparser.cpp */ + +#define YES(X) X +#define NO(X) + +#define SIMPLE_INTRINSIC(GR, PFX, NAME, IF_SHIFT) \ + DEPARSER_INTRINSIC(Tofino, GR, NAME, 1) { \ + PFX.NAME.phv = intrin.vals[0].val->reg.deparser_id(); \ + IF_SHIFT(PFX.NAME.shft = intrin.vals[0].val->lo;) \ + if (!intrin.vals[0].pov.empty()) \ + error(intrin.vals[0].pov.front().lineno, "No POV support in tofino " #NAME); \ + PFX.NAME.valid = 1; \ + } +#define SIMPLE_INTRINSIC_RENAME(GR, PFX, NAME, REGNAME, IF_SHIFT) \ + DEPARSER_INTRINSIC(Tofino, GR, NAME, 1) { \ + PFX.REGNAME.phv = intrin.vals[0].val->reg.deparser_id(); \ + IF_SHIFT(PFX.REGNAME.shft = intrin.vals[0].val->lo;) \ + PFX.REGNAME.valid = 1; \ + } +#define IIR_MAIN_INTRINSIC(NAME, SHFT) SIMPLE_INTRINSIC(INGRESS, regs.input.iir.main_i, NAME, SHFT) +#define IIR_INTRINSIC(NAME, SHFT) SIMPLE_INTRINSIC(INGRESS, regs.input.iir.ingr, NAME, SHFT) +#define HIR_INTRINSIC(NAME, SHFT) SIMPLE_INTRINSIC(INGRESS, regs.header.hir.ingr, NAME, SHFT) +#define HIR_INTRINSIC_RENAME(NAME, REGNAME, SHFT) \ + SIMPLE_INTRINSIC_RENAME(INGRESS, regs.header.hir.ingr, NAME, REGNAME, SHFT) +#define IER_MAIN_INTRINSIC(NAME, SHFT) SIMPLE_INTRINSIC(EGRESS, regs.input.ier.main_e, NAME, SHFT) +#define HER_INTRINSIC(NAME, SHFT) SIMPLE_INTRINSIC(EGRESS, regs.header.her.egr, NAME, SHFT) + +IIR_MAIN_INTRINSIC(egress_unicast_port, NO) +IIR_MAIN_INTRINSIC(drop_ctl, YES) +IIR_INTRINSIC(copy_to_cpu, YES) +HIR_INTRINSIC_RENAME(egress_multicast_group_0, egress_multicast_group[0], NO) +HIR_INTRINSIC_RENAME(egress_multicast_group_1, egress_multicast_group[1], NO) +HIR_INTRINSIC_RENAME(hash_lag_ecmp_mcast_0, hash_lag_ecmp_mcast[0], NO) +HIR_INTRINSIC_RENAME(hash_lag_ecmp_mcast_1, hash_lag_ecmp_mcast[1], NO) +HIR_INTRINSIC(copy_to_cpu_cos, YES) +DEPARSER_INTRINSIC(Tofino, INGRESS, ingress_port_source, 1) { + regs.header.hir.ingr.ingress_port.phv = 
intrin.vals[0].val->reg.deparser_id(); + regs.header.hir.ingr.ingress_port.sel = 0; +} +HIR_INTRINSIC(deflect_on_drop, YES) +HIR_INTRINSIC(meter_color, YES) +HIR_INTRINSIC(icos, YES) +HIR_INTRINSIC(qid, YES) +HIR_INTRINSIC(xid, NO) +HIR_INTRINSIC(yid, NO) +HIR_INTRINSIC(rid, NO) +HIR_INTRINSIC(bypss_egr, YES) +HIR_INTRINSIC(ct_disable, YES) +HIR_INTRINSIC(ct_mcast, YES) + +IER_MAIN_INTRINSIC(egress_unicast_port, NO) +IER_MAIN_INTRINSIC(drop_ctl, YES) +HER_INTRINSIC(force_tx_err, YES) +HER_INTRINSIC(tx_pkt_has_offsets, YES) +HER_INTRINSIC(capture_tx_ts, YES) +HER_INTRINSIC(coal, NO) +HER_INTRINSIC(ecos, YES) + +#undef SIMPLE_INTRINSIC +#undef IIR_MAIN_INTRINSIC +#undef IIR_INTRINSIC +#undef HIR_INTRINSIC +#undef IER_INTRINSIC +#undef HER_INTRINSIC + +#define TOFINO_DIGEST(GRESS, NAME, CFG, TBL, IFSHIFT, IFID, CNT) \ + DEPARSER_DIGEST(Tofino, GRESS, NAME, CNT, IFSHIFT(can_shift = true;)) { \ + CFG.phv = data.select->reg.deparser_id(); \ + IFSHIFT(CFG.shft = data.shift + data.select->lo;) \ + CFG.valid = 1; \ + if (!data.select.pov.empty()) \ + error(data.select.pov.front().lineno, "No POV bit support in tofino %s digest", \ + #NAME); \ + for (auto &set : data.layout) { \ + int id = set.first >> data.shift; \ + unsigned idx = 0; \ + bool first = true, ok = true; \ + int last = -1; \ + int maxidx = TBL[id].phvs.size() - 1; \ + for (auto ® : set.second) { \ + if (first) { \ + first = false; \ + IFID(TBL[id].id_phv = reg->reg.deparser_id(); continue;) \ + } \ + /* The same 16b/32b container cannot appear consecutively, but 8b can. 
*/ \ + if (last == reg->reg.deparser_id() && reg->reg.size != 8) { \ + error(data.lineno, "%s: %db container %s seen in consecutive locations", \ + #NAME, reg->reg.size, reg->reg.name); \ + continue; \ + } \ + for (int i = reg->reg.size / 8; i > 0; i--) { \ + if (idx > maxidx) { \ + error(data.lineno, "%s digest limited to %d bytes", #NAME, maxidx + 1); \ + ok = false; \ + break; \ + } \ + TBL[id].phvs[idx++] = reg->reg.deparser_id(); \ + } \ + last = reg->reg.deparser_id(); \ + if (!ok) break; \ + } \ + TBL[id].valid = 1; \ + TBL[id].len = idx; \ + } \ + } + +TOFINO_DIGEST(INGRESS, learning, regs.input.iir.ingr.learn_cfg, regs.input.iir.ingr.learn_tbl, NO, + NO, 8) +TOFINO_DIGEST(INGRESS, mirror, regs.header.hir.main_i.mirror_cfg, regs.header.hir.main_i.mirror_tbl, + YES, YES, 8) +TOFINO_DIGEST(EGRESS, mirror, regs.header.her.main_e.mirror_cfg, regs.header.her.main_e.mirror_tbl, + YES, YES, 8) +TOFINO_DIGEST(INGRESS, resubmit, regs.input.iir.ingr.resub_cfg, regs.input.iir.ingr.resub_tbl, YES, + NO, 8) + +void tofino_field_dictionary(checked_array_base &fde_control, + checked_array_base &fde_data, + checked_array_base> &pov_layout, + std::vector &pov_order, + ordered_map ®_pov, + std::vector &dict, json::vector &fd_gress, + json::vector &fd_entries, gress_t gress) { + std::map pov; + json::vector chunk_bytes; + json::vector fd_entry_chunk_bytes; + unsigned pov_byte = 0, pov_size = 0, total_headers = 0; + for (auto &ent : pov_order) + if (pov.count(ent->reg.deparser_id()) == 0) { + total_headers++; + pov[ent->reg.deparser_id()] = pov_size; + pov_size += ent->reg.size; + for (unsigned i = 0; i < ent->reg.size; i += 8) { + if (pov_byte >= Target::Tofino::DEPARSER_MAX_POV_BYTES) { + error(ent.lineno, + "Exceeded hardware limit for POV bits (%d) in deparser. " + "Using %d or more headers. 
Please reduce the number of headers", + Target::Tofino::DEPARSER_MAX_POV_BYTES * 8, total_headers); + return; + } + pov_layout[pov_byte++] = ent->reg.deparser_id(); + } + } + while (pov_byte < Target::Tofino::DEPARSER_MAX_POV_BYTES) pov_layout[pov_byte++] = 0xff; + + int row = -1, prev = -1, prev_pov = -1; + bool prev_is_checksum = false; + unsigned pos = 0; + unsigned total_bytes = 0; + int prev_row = 0; + for (auto &ent : dict) { + unsigned size = ent.what->size(); + total_bytes += size; + int pov_bit = pov[ent.pov.front()->reg.deparser_id()] + ent.pov.front()->lo; + + if (options.match_compiler) { + if (ent.what->is()) { + /* checksum unit -- make sure it gets its own dictionary line */ + prev_pov = -1; + prev_is_checksum = true; + } else { + if (prev_is_checksum) prev_pov = -1; + prev_is_checksum = false; + } + } + + if (ent.what->is() && prev_pov == pov_bit && + int(ent.what->encode()) == prev && ent.what->size() & 6) + error(ent.lineno, "16 and 32-bit container cannot be repeatedly deparsed"); + while (size--) { + if (pov_bit != prev_pov || pos >= 4 /*|| (pos & (size-1)) != 0*/) { + if (row >= 0) { + fde_control[row].num_bytes = pos & 3; + fde_data[row].num_bytes = pos & 3; + } + // Entries used - (192 each in INGRESS & EGRESS for Tofino) + if (++row >= Target::Tofino::DEPARSER_MAX_FD_ENTRIES) { + error(ent.lineno, + "Exceeded hardware limit for " + "deparser field dictionary entries (%d). Using %d headers and %" PRIu64 + " containers. 
Please reduce the number of headers and/or their length.", + Target::Tofino::DEPARSER_MAX_FD_ENTRIES, total_headers, + uint64_t(dict.size())); + return; + } + fde_control[row].pov_sel = pov_bit; + fde_control[row].version = 0xf; + fde_control[row].valid = 1; + pos = 0; + } + if (prev_row != row) { + json::map fd; + json::map fd_entry; + fd["Field Dictionary Number"] = prev_row; + fd_entry["entry"] = prev_row; + auto prevPovReg = Phv::reg(pov_layout[fde_control[prev_row].pov_sel.value / 8]); + auto prevPovBit = fde_control[prev_row].pov_sel.value; + auto prevPovOffset = prevPovBit - reg_pov[prevPovReg]; + Deparser::write_pov_in_json(fd, fd_entry, prevPovReg, prevPovBit, prevPovOffset); + fd["Content"] = std::move(chunk_bytes); + fd_entry["chunks"] = std::move(fd_entry_chunk_bytes); + fd_gress.push_back(std::move(fd)); + fd_entries.push_back(std::move(fd_entry)); + prev_row = row; + } + auto povReg = Phv::reg(pov_layout[fde_control[row].pov_sel.value / 8]); + auto povBit = fde_control[row].pov_sel.value % povReg->size; + json::map chunk_byte; + json::map fd_entry_chunk_byte; + json::map fd_entry_chunk; + chunk_byte["Byte"] = pos; + fd_entry_chunk_byte["chunk_number"] = pos; + auto phvReg = Phv::reg(ent.what->encode()); + if (ent.what->encode() < CHECKSUM_ENGINE_PHVID_TOFINO_LOW || + ent.what->encode() > CHECKSUM_ENGINE_PHVID_TOFINO_HIGH) { + write_field_name_in_json(phvReg, povReg, povBit, chunk_byte, fd_entry_chunk, 11, + gress); + } else { + write_csum_const_in_json(ent.what->encode(), chunk_byte, fd_entry_chunk, gress); + } + fd_entry_chunk_byte["chunk"] = std::move(fd_entry_chunk); + chunk_bytes.push_back(std::move(chunk_byte.clone())); + fd_entry_chunk_bytes.push_back(std::move(fd_entry_chunk_byte.clone())); + fde_data[row].phv[pos++] = ent.what->encode(); + prev_pov = pov_bit; + } + + prev = ent.what->encode(); + } + if (pos) { + fde_control[row].num_bytes = pos & 3; + fde_data[row].num_bytes = pos & 3; + } + + // Compute average occupancy. 
For deparser FDE compression to work, + // need to make sure have certain average occupancy. + // This error check may still be too high level. I think it needs a finer granularity, + // but I'm not sure how to model the allowed variability of packet headers. + + // Tofino deparser has a maximum output header size of 480 bytes. This is done in 2 phases. + // Each phase can do 240 bytes, corresponding to 18 QFDEs (4 * 18 * 4 bytes = 288 bytes) + // This means that average occupancy must be better than 240 / 288 bytes, or roughly 83%. + // This is the value we will check. + // We gate the check on total bytes occupied being greater than 64 bytes in an attempt + // to consider the QFDE constraint that it can only drive four stage 2 buses for compression. + + unsigned max_bytes_for_rows_occupied = 4 * (row + 1); + double occupancy = 0.0; + + if (max_bytes_for_rows_occupied > 0) + occupancy = + static_cast(total_bytes) / static_cast(max_bytes_for_rows_occupied); + + if (total_bytes > 64 && occupancy < (240.0 / 288.0)) { + std::stringstream warn_msg; + warn_msg.precision(4); + warn_msg << "Deparser field dictionary occupancy is too sparse."; + warn_msg << "\nHardware requires an occupancy of " << 100.0 * 240.0 / 288.0 + << " to deparse the output header,"; + warn_msg << "\nbut the PHV layout for the header structures was such that" + " the occupancy was only " + << 100.0 * occupancy << "."; + warn_msg << "\nThis situation is usually caused by a program that has one or" + " more of the following requirements:"; + warn_msg << "\n 1. many 'short' headers that are not guaranteed to coexist" + " (e.g. less than 4 bytes)"; + warn_msg << "\n 2. many packet headers that are not multiples of 4 bytes"; + warn_msg << "\n 3. 
many conditionally updated checksums"; + warning(0, "%s", warn_msg.str().c_str()); + } +} + +template +void tofino_phv_ownership(bitvec phv_use[2], IN_GRP &in_grp, IN_SPLIT &in_split, EG_GRP &eg_grp, + EG_SPLIT &eg_split, unsigned first, unsigned count) { + BUG_CHECK(in_grp.val.size() == eg_grp.val.size()); + BUG_CHECK(in_split.val.size() == eg_split.val.size()); + BUG_CHECK((in_grp.val.size() + 1) * in_split.val.size() == count); + unsigned group_size = in_split.val.size(); + // DANGER -- this only works because tofino Phv::Register uids happend to match + // DANGER -- the deparser encoding of phv containers. + unsigned reg = first; + for (unsigned i = 0; i < in_grp.val.size(); i++, reg += group_size) { + unsigned last = reg + group_size - 1; + int count = 0; + if (phv_use[INGRESS].getrange(reg, group_size)) { + in_grp.val |= 1U << i; + if (i * group_size >= 16 && i * group_size < 32) + error(0, "%s..%s(R%d..R%d) used by ingress deparser but only available to egress", + Phv::reg(reg)->name, Phv::reg(last)->name, reg, last); + else + count++; + } + if (phv_use[EGRESS].getrange(reg, group_size)) { + eg_grp.val |= 1U << i; + if (i * group_size < 16) + error(0, "%s..%s(R%d..R%d) used by egress deparser but only available to ingress", + Phv::reg(reg)->name, Phv::reg(last)->name, reg, last); + else + count++; + } + if (count > 1) + error(0, "%s..%s(R%d..R%d) used by both ingress and egress deparser", + Phv::reg(reg)->name, Phv::reg(last)->name, reg, last); + } + in_split.val = phv_use[INGRESS].getrange(reg, group_size); + eg_split.val = phv_use[EGRESS].getrange(reg, group_size); +} + +static short tofino_phv2cksum[Target::Tofino::Phv::NUM_PHV_REGS][2] = { + // normal {LSWord, MSWord} + {287, 286}, + {283, 282}, + {279, 278}, + {275, 274}, + {271, 270}, + {267, 266}, + {263, 262}, + {259, 258}, + {255, 254}, + {251, 250}, + {247, 246}, + {243, 242}, + {239, 238}, + {235, 234}, + {231, 230}, + {227, 226}, + {223, 222}, + {219, 218}, + {215, 214}, + {211, 210}, + {207, 
206}, + {203, 202}, + {199, 198}, + {195, 194}, + {191, 190}, + {187, 186}, + {183, 182}, + {179, 178}, + {175, 174}, + {171, 170}, + {167, 166}, + {163, 162}, + {285, 284}, + {281, 280}, + {277, 276}, + {273, 272}, + {269, 268}, + {265, 264}, + {261, 260}, + {257, 256}, + {253, 252}, + {249, 248}, + {245, 244}, + {241, 240}, + {237, 236}, + {233, 232}, + {229, 228}, + {225, 224}, + {221, 220}, + {217, 216}, + {213, 212}, + {209, 208}, + {205, 204}, + {201, 200}, + {197, 196}, + {193, 192}, + {189, 188}, + {185, 184}, + {181, 180}, + {177, 176}, + {173, 172}, + {169, 168}, + {165, 164}, + {161, 160}, + {147, -1}, + {145, -1}, + {143, -1}, + {141, -1}, + {127, -1}, + {125, -1}, + {123, -1}, + {121, -1}, + {107, -1}, + {105, -1}, + {103, -1}, + {101, -1}, + {87, -1}, + {85, -1}, + {83, -1}, + {81, -1}, + {67, -1}, + {65, -1}, + {63, -1}, + {61, -1}, + {47, -1}, + {45, -1}, + {43, -1}, + {41, -1}, + {27, -1}, + {25, -1}, + {23, -1}, + {21, -1}, + {7, -1}, + {5, -1}, + {3, -1}, + {1, -1}, + {146, -1}, + {144, -1}, + {142, -1}, + {140, -1}, + {126, -1}, + {124, -1}, + {122, -1}, + {120, -1}, + {106, -1}, + {104, -1}, + {102, -1}, + {100, -1}, + {86, -1}, + {84, -1}, + {82, -1}, + {80, -1}, + {66, -1}, + {64, -1}, + {62, -1}, + {60, -1}, + {46, -1}, + {44, -1}, + {42, -1}, + {40, -1}, + {26, -1}, + {24, -1}, + {22, -1}, + {20, -1}, + {6, -1}, + {4, -1}, + {2, -1}, + {0, -1}, + {159, -1}, + {157, -1}, + {155, -1}, + {153, -1}, + {151, -1}, + {149, -1}, + {139, -1}, + {137, -1}, + {135, -1}, + {133, -1}, + {131, -1}, + {129, -1}, + {119, -1}, + {117, -1}, + {115, -1}, + {113, -1}, + {111, -1}, + {109, -1}, + {99, -1}, + {97, -1}, + {95, -1}, + {93, -1}, + {91, -1}, + {89, -1}, + {79, -1}, + {77, -1}, + {75, -1}, + {73, -1}, + {71, -1}, + {69, -1}, + {59, -1}, + {57, -1}, + {55, -1}, + {53, -1}, + {51, -1}, + {49, -1}, + {39, -1}, + {37, -1}, + {35, -1}, + {33, -1}, + {31, -1}, + {29, -1}, + {19, -1}, + {17, -1}, + {15, -1}, + {13, -1}, + {11, -1}, + {9, -1}, + {158, -1}, + 
{156, -1}, + {154, -1}, + {152, -1}, + {150, -1}, + {148, -1}, + {138, -1}, + {136, -1}, + {134, -1}, + {132, -1}, + {130, -1}, + {128, -1}, + {118, -1}, + {116, -1}, + {114, -1}, + {112, -1}, + {110, -1}, + {108, -1}, + {98, -1}, + {96, -1}, + {94, -1}, + {92, -1}, + {90, -1}, + {88, -1}, + {78, -1}, + {76, -1}, + {74, -1}, + {72, -1}, + {70, -1}, + {68, -1}, + {58, -1}, + {56, -1}, + {54, -1}, + {52, -1}, + {50, -1}, + {48, -1}, + {38, -1}, + {36, -1}, + {34, -1}, + {32, -1}, + {30, -1}, + {28, -1}, + {18, -1}, + {16, -1}, + {14, -1}, + {12, -1}, + {10, -1}, + {8, -1}, + + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + {-1, -1}, + + // tagalong {LSWord, MSWord} + {1, 0}, + {3, 2}, + {5, 4}, + {7, 6}, + {9, 8}, + {11, 10}, + {13, 12}, + {15, 14}, + {17, 16}, + {19, 18}, + {21, 20}, + {23, 22}, + {25, 24}, + {27, 26}, + {29, 28}, + {31, 30}, + {33, 32}, + {35, 34}, + {37, 36}, + {39, 38}, + {41, 40}, + {43, 42}, + {45, 44}, + {47, 46}, + {49, 48}, + {51, 50}, + {53, 52}, + {55, 54}, + {57, 56}, + {59, 58}, + {61, 60}, + {63, 62}, + {64, -1}, + {65, -1}, + {66, -1}, + {67, -1}, + {68, -1}, + {69, -1}, + {70, -1}, + {71, -1}, + {72, -1}, + {73, -1}, + {74, -1}, + {75, -1}, + {76, -1}, + {77, -1}, + {78, -1}, + {79, -1}, + {80, -1}, + {81, -1}, + {82, -1}, + {83, -1}, + {84, -1}, + {85, -1}, + {86, -1}, + {87, -1}, + {88, -1}, + {89, -1}, + {90, -1}, + {91, -1}, + {92, -1}, + {93, -1}, + {94, -1}, + {95, -1}, + {96, -1}, + {97, -1}, + {98, -1}, + {99, -1}, + {100, -1}, + {101, -1}, + {102, -1}, + {103, -1}, + {104, -1}, + {105, -1}, + {106, -1}, + {107, -1}, + {108, -1}, + {109, -1}, + {110, -1}, + {111, -1}, + {112, -1}, + {113, -1}, + {114, -1}, + 
{115, -1}, + {116, -1}, + {117, -1}, + {118, -1}, + {119, -1}, + {120, -1}, + {121, -1}, + {122, -1}, + {123, -1}, + {124, -1}, + {125, -1}, + {126, -1}, + {127, -1}, + {128, -1}, + {129, -1}, + {130, -1}, + {131, -1}, + {132, -1}, + {133, -1}, + {134, -1}, + {135, -1}, + {136, -1}, + {137, -1}, + {138, -1}, + {139, -1}, + {140, -1}, + {141, -1}, + {142, -1}, + {143, -1}}; + +#define TAGALONG_THREAD_BASE \ + (Target::Tofino::Phv::COUNT_8BIT_TPHV + Target::Tofino::Phv::COUNT_16BIT_TPHV + \ + 2 * Target::Tofino::Phv::COUNT_32BIT_TPHV) + +template +static void copy_csum_cfg_entry(DTYPE &dst_unit, STYPE &src_unit) { + BUG_CHECK(dst_unit.size() == src_unit.size()); + + for (unsigned i = 0; i < dst_unit.size(); i++) { + auto &src = src_unit[i]; + auto &dst = dst_unit[i]; + + dst.zero_l_s_b = src.zero_l_s_b; + dst.zero_m_s_b = src.zero_m_s_b; + dst.swap = src.swap; + } +} + +template +static void init_tofino_checksum_entry(ENTRIES &entry) { + entry.zero_l_s_b = 1; + entry.zero_l_s_b.rewrite(); + entry.zero_m_s_b = 1; + entry.zero_m_s_b.rewrite(); + entry.swap = 0; + entry.swap.rewrite(); +} + +template +static void tofino_checksum_units(checked_array_base &main_csum_units, + checked_array_base &tagalong_csum_units, gress_t gress, + Deparser::FullChecksumUnit checksum_unit[]) { + BUG_CHECK(tofino_phv2cksum[Target::Tofino::Phv::NUM_PHV_REGS - 1][0] == 143); + for (int i = 0; i < Target::Tofino::DEPARSER_CHECKSUM_UNITS; i++) { + auto &main_unit = main_csum_units[i].csum_cfg_entry; + auto &tagalong_unit = tagalong_csum_units[i].csum_cfg_entry; + auto &tagalong_unit_zeros_as_ones = tagalong_csum_units[i].zeros_as_ones; + for (auto &ent : main_unit) init_tofino_checksum_entry(ent); + for (auto &ent : tagalong_unit) init_tofino_checksum_entry(ent); + if (checksum_unit[i].entries.empty()) continue; + // Tofino does not support checksum calculation using multiple + // partial checksum unit. 
+ // Full checksum unit and partial checksum unit will always be same + BUG_CHECK(checksum_unit[i].entries.size() == 1); + auto &checksum_unit_entries = checksum_unit[i].entries[i]; + for (auto ® : checksum_unit_entries) { + int mask = reg.mask; + int swap = reg.swap; + int idx = reg->reg.deparser_id(); + if (!reg.pov.empty()) + error(reg.pov.front().lineno, "No POV support in tofino checksum"); + auto cksum_idx0 = tofino_phv2cksum[idx][0]; + auto cksum_idx1 = tofino_phv2cksum[idx][1]; + BUG_CHECK(cksum_idx0 >= 0); + if (idx >= 256) { + write_checksum_entry(tagalong_unit[cksum_idx0], mask & 3, swap & 1, i, + reg->reg.name); + if (cksum_idx1 >= 0) + write_checksum_entry(tagalong_unit[cksum_idx1], mask >> 2, swap >> 1, i, + reg->reg.name); + else + BUG_CHECK((mask >> 2 == 0) && (swap >> 1 == 0)); + } else { + write_checksum_entry(main_unit[cksum_idx0], mask & 3, swap & 1, i, reg->reg.name); + if (cksum_idx1 >= 0) + write_checksum_entry(main_unit[cksum_idx1], mask >> 2, swap >> 1, i, + reg->reg.name); + else + BUG_CHECK((mask >> 2 == 0) && (swap >> 1 == 0)); + } + } + // Thread non-tagalong checksum results through the tagalong unit + int idx = i + TAGALONG_THREAD_BASE + gress * Target::Tofino::DEPARSER_CHECKSUM_UNITS; + write_checksum_entry(tagalong_unit[idx], 0x3, 0x0, i); + // Setting Zeros_As_Ones enable + tagalong_unit_zeros_as_ones.en = checksum_unit[i].zeros_as_ones_en; + main_unit.set_modified(); + tagalong_unit.set_modified(); + } +} + +static void tofino_checksum_units( + Target::Tofino::deparser_regs ®s, + Deparser::FullChecksumUnit full_checksum_unit[2][MAX_DEPARSER_CHECKSUM_UNITS]) { + for (unsigned id = 2; id < MAX_DEPARSER_CHECKSUM_UNITS; id++) { + if (!full_checksum_unit[0][id].entries.empty() && + !full_checksum_unit[1][id].entries.empty()) + error(-1, "deparser checksum unit %d used in both ingress and egress", id); + } + + tofino_checksum_units(regs.input.iim.ii_phv_csum.csum_cfg, + regs.header.him.hi_tphv_csum.csum_cfg, INGRESS, + 
full_checksum_unit[INGRESS]); + tofino_checksum_units(regs.input.iem.ie_phv_csum.csum_cfg, + regs.header.hem.he_tphv_csum.csum_cfg, EGRESS, + full_checksum_unit[EGRESS]); + + // make sure shared units are configured identically + for (unsigned id = 2; id < Target::Tofino::DEPARSER_CHECKSUM_UNITS; id++) { + auto &eg_main_unit = regs.input.iem.ie_phv_csum.csum_cfg[id].csum_cfg_entry; + auto &ig_main_unit = regs.input.iim.ii_phv_csum.csum_cfg[id].csum_cfg_entry; + + auto &eg_tphv_unit = regs.header.hem.he_tphv_csum.csum_cfg[id].csum_cfg_entry; + auto &ig_tphv_unit = regs.header.him.hi_tphv_csum.csum_cfg[id].csum_cfg_entry; + + if (!full_checksum_unit[0][id].entries.empty()) { + copy_csum_cfg_entry(eg_main_unit, ig_main_unit); + copy_csum_cfg_entry(eg_tphv_unit, ig_tphv_unit); + } else if (!full_checksum_unit[1][id].entries.empty()) { + copy_csum_cfg_entry(ig_main_unit, eg_main_unit); + copy_csum_cfg_entry(ig_tphv_unit, eg_tphv_unit); + } + } +} + +template <> +void Deparser::write_config(Target::Tofino::deparser_regs ®s) { + regs.input.icr.inp_cfg.disable(); + regs.input.icr.intr.disable(); + regs.header.hem.he_edf_cfg.disable(); + regs.header.him.hi_edf_cfg.disable(); + + tofino_checksum_units(regs, full_checksum_unit); + json::map field_dictionary_alloc; + json::vector fd_gress; + json::vector fde_entries_i; + json::vector fde_entries_e; + + // Deparser resources + json::vector resources_deparser; + + // Create field dictionaries for ingress + tofino_field_dictionary(regs.input.iim.ii_fde_pov.fde_pov, regs.header.him.hi_fde_phv.fde_phv, + regs.input.iir.main_i.pov.phvs, pov_order[INGRESS], pov[INGRESS], + dictionary[INGRESS], fd_gress, fde_entries_i, INGRESS); + field_dictionary_alloc["ingress"] = std::move(fd_gress); + // Create field dictionaries for egress + tofino_field_dictionary(regs.input.iem.ie_fde_pov.fde_pov, regs.header.hem.he_fde_phv.fde_phv, + regs.input.ier.main_e.pov.phvs, pov_order[EGRESS], pov[EGRESS], + dictionary[EGRESS], fd_gress, fde_entries_e, 
EGRESS); + field_dictionary_alloc["egress"] = std::move(fd_gress); + + if (Log::verbosity() > 0) { + auto json_dump = open_output("logs/field_dictionary.log"); + *json_dump << &field_dictionary_alloc; + } + // Output deparser resources + report_resources_deparser_json(fde_entries_i, fde_entries_e); + + if (Phv::use(INGRESS).intersects(Phv::use(EGRESS))) { + warning(lineno[INGRESS], "Registers used in both ingress and egress in pipeline: %s", + Phv::db_regset(Phv::use(INGRESS) & Phv::use(EGRESS)).c_str()); + /* FIXME -- this only (sort-of) works because 'deparser' comes first in the alphabet, + * FIXME -- so is the first section to have its 'output' method run. Its a hack + * FIXME -- anyways to attempt to correct broken asm that should be an error */ + Phv::unsetuse(INGRESS, phv_use[EGRESS]); + Phv::unsetuse(EGRESS, phv_use[INGRESS]); + } + + tofino_phv_ownership(phv_use, regs.input.iir.ingr.phv8_grp, regs.input.iir.ingr.phv8_split, + regs.input.ier.egr.phv8_grp, regs.input.ier.egr.phv8_split, + Target::Tofino::Phv::FIRST_8BIT_PHV, Target::Tofino::Phv::COUNT_8BIT_PHV); + tofino_phv_ownership(phv_use, regs.input.iir.ingr.phv16_grp, regs.input.iir.ingr.phv16_split, + regs.input.ier.egr.phv16_grp, regs.input.ier.egr.phv16_split, + Target::Tofino::Phv::FIRST_16BIT_PHV, + Target::Tofino::Phv::COUNT_16BIT_PHV); + tofino_phv_ownership(phv_use, regs.input.iir.ingr.phv32_grp, regs.input.iir.ingr.phv32_split, + regs.input.ier.egr.phv32_grp, regs.input.ier.egr.phv32_split, + Target::Tofino::Phv::FIRST_32BIT_PHV, + Target::Tofino::Phv::COUNT_32BIT_PHV); + + for (unsigned i = 0; i < 8; i++) { + if (phv_use[EGRESS].intersects(Target::Tofino::Phv::tagalong_groups[i])) { + regs.input.icr.tphv_cfg.i_e_assign |= 1 << i; + if (phv_use[INGRESS].intersects(Target::Tofino::Phv::tagalong_groups[i])) { + error(lineno[INGRESS], + "tagalong group %d used in both ingress and " + "egress deparser", + i); + } + } + } + + for (auto &intrin : intrinsics) intrin.type->setregs(regs, *this, 
intrin); + + if (!regs.header.hir.ingr.ingress_port.sel.modified()) + regs.header.hir.ingr.ingress_port.sel = 1; + + for (auto &digest : digests) digest.type->setregs(regs, *this, digest); + + // The csum_cfg_entry registers are NOT reset by hardware and must be + // explicitly configured. We remove the disable_if_reset_value() calls on + // these register tree for now, but ideally they should have a flag to indicate no + // reset value is present and the register tree should prune only those regs + // if (options.condense_json) { + // regs.input.disable_if_reset_value(); + // regs.header.disable_if_reset_value(); } + if (error_count == 0 && options.gen_json) { + regs.input.emit_json(*open_output("regs.all.deparser.input_phase.cfg.json")); + regs.header.emit_json(*open_output("regs.all.deparser.header_phase.cfg.json")); + } + TopLevel::regs()->reg_pipe.deparser.hdr.set("regs.all.deparser.header_phase", + ®s.header); + TopLevel::regs()->reg_pipe.deparser.inp.set("regs.all.deparser.input_phase", + ®s.input); +} + +template <> +unsigned Deparser::FDEntry::Checksum::encode() { + return CHECKSUM_ENGINE_PHVID_TOFINO_LOW + (gress * CHECKSUM_ENGINE_PHVID_TOFINO_PER_GRESS) + + unit; +} + +template <> +unsigned Deparser::FDEntry::Constant::encode() { + error(lineno, "Tofino deparser does not support constant entries"); + return -1; +} + +template <> +void Deparser::gen_learn_quanta(Target::Tofino::parser_regs ®s, json::vector &learn_quanta) {} + +template <> +void Deparser::process(Target::Tofino *) { + // Chip-specific code for process method + // None for Tofino +} diff --git a/backends/tofino/bf-asm/tofino/exact_match.cpp b/backends/tofino/bf-asm/tofino/exact_match.cpp new file mode 100644 index 00000000000..dcd8d5d022a --- /dev/null +++ b/backends/tofino/bf-asm/tofino/exact_match.cpp @@ -0,0 +1,37 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with 
the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tofino/exact_match.h" + +void Target::Tofino::ExactMatchTable::setup_ways() { + ::ExactMatchTable::setup_ways(); + for (auto &row : layout) { + int first_way = -1; + for (auto &ram : row.memunits) { + int way = way_map.at(ram).way; + if (first_way < 0) { + first_way = way; + } else if (ways[way].group_xme != ways[first_way].group_xme) { + error(row.lineno, + "Ways %d and %d of table %s share address bus on row %d, " + "but use different hash groups", + first_way, way, name(), row.row); + break; + } + } + } +} diff --git a/backends/tofino/bf-asm/tofino/exact_match.h b/backends/tofino/bf-asm/tofino/exact_match.h new file mode 100644 index 00000000000..a4e6199bb9d --- /dev/null +++ b/backends/tofino/bf-asm/tofino/exact_match.h @@ -0,0 +1,31 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_EXACT_MATCH_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_EXACT_MATCH_H_ + +#include "backends/tofino/bf-asm/tables.h" + +class Target::Tofino::ExactMatchTable : public ::ExactMatchTable { + friend class ::ExactMatchTable; + ExactMatchTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::ExactMatchTable(line, n, gr, s, lid) {} + + void setup_ways() override; +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_EXACT_MATCH_H_ */ diff --git a/backends/tofino/bf-asm/tofino/gateway.cpp b/backends/tofino/bf-asm/tofino/gateway.cpp new file mode 100644 index 00000000000..9be05e17d70 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/gateway.cpp @@ -0,0 +1,320 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tofino/gateway.h" + +#include "backends/tofino/bf-asm/hashexpr.h" +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tofino/ternary_match.h" +#include "lib/hex.h" + +/* Tofino1/2 Gateway table support + * GatewayTable uses the Table::Layout in a somewhat hacky way to track the gateway match + * and payload blocks. Layout may have either one or two entries. 
+ * layout[0] is the layout for the gateway match -- which row and search bus is being used + * layout[1] is the layout for the payload -- which row and result bus is being used. + * if layout.size() == 1, there is no payload. + * The payload result bus is stored as bus[RESULT_BUS] even though it may be either a + * match result bus or a tind bus -- the second bit (so busses 2 and 3) are the tind + * busses as that is the way they they are encode in some registers. It should perhaps be + * changed to use the bus_type_t to track whether it is a match RESULT_BUS or a TIND_BUS + */ + +bool Target::Tofino::GatewayTable::check_match_key(MatchKey &key, const std::vector &vec, + bool is_xor) { + if (!::GatewayTable::check_match_key(key, vec, is_xor)) return false; + if (key.offset < 32 && (key.offset & 7) != (key.val->lo & 7)) + error(key.val.lineno, "Gateway %s key %s misaligned within byte", is_xor ? "xor" : "match", + key.val.name()); + if (key.offset + key.val->size() > (is_xor ? 32 : 44)) { + error(key.val.lineno, "Gateway %s key too big", is_xor ? "xor" : "match"); + return false; + } + if (key.offset >= 32 && !input_xbar.empty()) { + BUG_CHECK(input_xbar.size() == 1, "%s does not have one input xbar", name()); + auto hash = input_xbar[0]->hash_column(key.offset + 8); + if (hash.size() != 1 || hash[0]->bit || !hash[0]->fn || + !hash[0]->fn->match_phvref(key.val)) { + // FIXME: hash.size() maybe zero when key.valid is true. + // which means the key.offset is incorrect. + if (!key.valid) { + error(key.val.lineno, "Gateway %s key %s not in matching hash column", + is_xor ? "xor" : "match", key.val.name()); + return false; + } + } + } + return true; +} + +void Target::Tofino::GatewayTable::pass1() { + ::GatewayTable::pass1(); + /* in a gateway, the layout has one or two rows -- layout[0] specifies the gateway, and + * layout[1] specifies the payload. There will be no columns in either row. 
+ */ + if (layout.empty() || layout[0].row < 0) + error(lineno, "No row specified in gateway"); + else if (!layout[0].bus.count(Layout::SEARCH_BUS) && (!match.empty() || !xor_match.empty())) + error(lineno, "No bus specified in gateway to read from"); + if (payload_unit >= 0 && have_payload < 0 && match_address < 0) + error(lineno, "payload_unit with no payload or match address in gateway"); + if (layout.size() > 1) { + if (layout[1].bus.count(Layout::RESULT_BUS) && (have_payload >= 0 || match_address >= 0)) { + int result_bus = layout[1].bus.at(Layout::RESULT_BUS); + if (payload_unit < 0) { + payload_unit = result_bus & 1; + } else if (payload_unit != (result_bus & 1)) { + error(layout[1].lineno, "payload unit %d cannot write to result bus %d", + payload_unit, result_bus); + } + } + if (layout[1].row < 0) { + error(layout[1].lineno, "payload_bus with no payload_row in gateway"); + } else if (Table *tbl = match_table) { + if (auto *tmatch = dynamic_cast(tbl)) tbl = tmatch->indirect; + if (tbl && !tbl->layout.empty()) { + for (auto &r : tbl->layout) { + if (r.row != layout[1].row) continue; + if (!r.bus.count(Layout::RESULT_BUS)) continue; + int match_rbus = r.bus.at(Layout::RESULT_BUS); + if (payload_unit >= 0 && payload_unit != (match_rbus & 1)) continue; + if (!layout[1].bus.count(Layout::RESULT_BUS)) + layout[1].bus[Layout::RESULT_BUS] = match_rbus; + if (match_rbus == layout[1].bus.at(Layout::RESULT_BUS)) { + if (tbl->to()) layout[1].bus[Layout::RESULT_BUS] |= 2; + break; + } + } + } + } else if (have_payload >= 0 || match_address >= 0) { + if (payload_unit) { + if (auto *old = stage->gw_payload_use[layout[1].row][payload_unit]) + error(layout[1].lineno, "payload %d.%d already in use by table %s", + layout[1].row, payload_unit, old->name()); + else + stage->gw_payload_use[layout[1].row][payload_unit] = this; + } + } else if (payload_unit >= 0) { + error(lineno, "payload_unit with no payload or match address in gateway"); + } + } else if ((have_payload >= 0 || 
match_address >= 0) && !match_table) { + error(have_payload, "payload on standalone gateway requires explicit payload_row"); + } else if (payload_unit >= 0 && match_table) { + bool ternary = false; + Table *tbl = match_table; + if (auto *tmatch = dynamic_cast(tbl)) { + ternary = true; + tbl = tmatch->indirect; + } + if (!tbl || tbl->layout.empty()) { + error(lineno, "No result busses in table %s for gateway payload", match_table->name()); + } else { + for (auto &r : tbl->layout) { + auto match_rbus = r.bus.count(Layout::RESULT_BUS) ? r.bus.at(Layout::RESULT_BUS) + : r.bus.at(Layout::SEARCH_BUS); + if (match_rbus >= 0 && payload_unit != (match_rbus & 1)) continue; + if (!stage->gw_payload_use[r.row][payload_unit]) { + layout.resize(2); + layout[1].row = r.row; + if (r.bus.count(Layout::RESULT_BUS)) + layout[1].bus[Layout::RESULT_BUS] = r.bus.at(Layout::RESULT_BUS); + else + layout[1].bus[Layout::RESULT_BUS] = + r.bus.at(Layout::SEARCH_BUS) | (ternary ? 2 : 0); + stage->gw_payload_use[r.row][payload_unit] = this; + break; + } + } + if (layout.size() < 2) + error(lineno, "No row in table %s has payload unit %d free", tbl->name(), + payload_unit); + } + } + if (layout.size() > 1 && layout[1].bus.count(Layout::RESULT_BUS)) { + int result_bus = layout[1].bus.at(Layout::RESULT_BUS); + Table *tbl = match_table; + if (auto *tmatch = dynamic_cast(tbl)) tbl = tmatch->indirect; + if (!tbl) tbl = this; + auto &bus_use = + (result_bus & 2) ? 
stage->tcam_indirect_bus_use : stage->match_result_bus_use; + auto *old = bus_use[layout[1].row][result_bus & 1]; + if (old && old != tbl) + error(layout[1].lineno, + "Gateway payload result bus %d conflict on row %d between " + "%s and %s", + result_bus, layout[1].row, name(), old->name()); + bus_use[layout[1].row][result_bus & 1] = tbl; + } +} + +void Target::Tofino::GatewayTable::pass2() { + ::GatewayTable::pass2(); + if (gw_unit < 0) { + if (layout[0].bus.count(Layout::SEARCH_BUS) && + !stage->gw_unit_use[layout[0].row][layout[0].bus.at(Layout::SEARCH_BUS)]) { + gw_unit = layout[0].bus.at(Layout::SEARCH_BUS); + } else { + for (int i = 0; i < 2; ++i) { + if (!stage->gw_unit_use[layout[0].row][i] && + !stage->sram_search_bus_use[layout[0].row][i]) { + gw_unit = i; + break; + } + } + } + if (gw_unit < 0) + error(layout[0].lineno, "No gateway units available on row %d", layout[0].row); + else + stage->gw_unit_use[layout[0].row][gw_unit] = this; + } + if (!layout[0].bus.count(Layout::SEARCH_BUS) && gw_unit >= 0) + layout[0].bus[Layout::SEARCH_BUS] = gw_unit; + if (payload_unit < 0 && (have_payload >= 0 || match_address >= 0)) { + if (layout.size() > 1) { + if (!layout[1].bus.count(Layout::RESULT_BUS)) { + if (!stage->gw_payload_use[layout[1].row][0]) + payload_unit = 0; + else if (!stage->gw_payload_use[layout[1].row][1]) + payload_unit = 1; + } else { + int u = layout[1].bus.at(Layout::RESULT_BUS) & 1; + if (!stage->gw_payload_use[layout[1].row][u]) payload_unit = u; + } + if (payload_unit >= 0) + stage->gw_payload_use[layout[1].row][payload_unit] = this; + else + error(lineno, "No payload available on row %d", layout[1].row); + } else if (Table *tbl = match_table) { + bool ternary = false; + if (auto *tmatch = dynamic_cast(tbl)) { + tbl = tmatch->indirect; + ternary = true; + } + if (tbl && !tbl->layout.empty()) { + for (auto &row : tbl->layout) { + auto match_rbus = row.bus.at(ternary ? 
Layout::TIND_BUS : Layout::RESULT_BUS); + BUG_CHECK(match_rbus >= 0); // alloc_busses on the match table must run first + if (stage->gw_payload_use[row.row][match_rbus]) { + continue; + } else { + payload_unit = match_rbus; + } + stage->gw_payload_use[row.row][payload_unit] = this; + layout.resize(2); + layout[1].row = row.row; + layout[1].bus[Layout::RESULT_BUS] = match_rbus | (ternary ? 2 : 0); + break; + } + if (payload_unit < 0) + error(lineno, "No row in table %s has a free payload unit", tbl->name()); + } else { + error(lineno, "No result busses in table %s for gateway payload", + match_table->name()); + } + } + } + if (payload_unit >= 0 && !layout[1].bus.count(Layout::RESULT_BUS)) { + BUG_CHECK(layout.size() > 1); + int row = layout[1].row; + Table *tbl = match_table; + int ternary = tbl ? 0 : -1; + if (auto *tmatch = dynamic_cast(tbl)) { + ternary = 1; + tbl = tmatch->indirect ? tmatch->indirect : tmatch; + } + if (!tbl) tbl = this; + for (int i = payload_unit; i < 4; i += 2) { + if (ternary >= 0 && (i >> 1) != ternary) continue; + auto &result_bus = (i & 2) ? 
stage->tcam_indirect_bus_use : stage->match_result_bus_use; + if (!result_bus[row][i & 1] || result_bus[row][i & 1] == tbl) { + layout[1].bus[Layout::RESULT_BUS] = i; + result_bus[row][i & 1] = tbl; + break; + } + } + if (!layout[1].bus.count(Layout::RESULT_BUS)) { + error(lineno, "No result bus available for gateway payload of table %s on row %d", + name(), layout[1].row); + } + } +} + +void Target::Tofino::GatewayTable::pass3() { + ::GatewayTable::pass3(); + if (layout[0].bus.count(Layout::SEARCH_BUS)) { + int search_bus = layout[0].bus.at(Layout::SEARCH_BUS); + auto *tbl = stage->sram_search_bus_use[layout[0].row][search_bus]; + // Sharing with an exact match -- make sure it is ok + if (!tbl) return; + for (auto &ixb : input_xbar) { + auto *sram_tbl = tbl->to(); + BUG_CHECK(sram_tbl, + "%s is not an SRamMatch table even though it is using a " + "search bus?", + tbl->name()); + SRamMatchTable::WayRam *way = nullptr; + for (auto &row : sram_tbl->layout) { + if (row.row == layout[0].row && row.bus.at(Layout::SEARCH_BUS) == search_bus) { + if (row.memunits.empty()) { + // FIXME -- not really used, so we don't need to check the + // match/hash group. Should this be an asm error? 
+ return; + } + way = &sram_tbl->way_map.at(row.memunits[0]); + break; + } + } + BUG_CHECK(way, "%s claims to use search bus %d.%d, but we can't find it in the layout", + sram_tbl->name(), layout[0].row, search_bus); + if (ixb->hash_group() >= 0 && sram_tbl->ways[way->way].group_xme >= 0 && + ixb->hash_group() != sram_tbl->ways[way->way].group_xme) { + error(layout[0].lineno, + "%s sharing search bus %d.%d with %s, but wants a " + "different hash group", + name(), layout[0].row, search_bus, tbl->name()); + } + if (ixb->match_group() >= 0 && sram_tbl->word_ixbar_group[way->word] >= 0 && + gateway_needs_ixbar_group() && + ixb->match_group() != sram_tbl->word_ixbar_group[way->word]) { + error(layout[0].lineno, + "%s sharing search bus %d.%d with %s, but wants a " + "different match group", + name(), layout[0].row, search_bus, tbl->name()); + } + } + } +} + +template <> +void enable_gateway_payload_exact_shift_ovr(Target::Tofino::mau_regs ®s, int bus) { + // Not supported on tofino + BUG(); +} +template void enable_gateway_payload_exact_shift_ovr(Target::Tofino::mau_regs ®s, int bus); + +void Target::Tofino::GatewayTable::write_next_table_regs(Target::Tofino::mau_regs ®s) { + auto &merge = regs.rams.match.merge; + int idx = 3; + if (need_next_map_lut) error(lineno, "Tofino does not support using next_map_lut in gateways"); + for (auto &line : table) { + BUG_CHECK(idx >= 0); + if (!line.run_table) + merge.gateway_next_table_lut[logical_id][idx] = line.next.next_table_id(); + --idx; + } + if (!miss.run_table) merge.gateway_next_table_lut[logical_id][4] = miss.next.next_table_id(); +} diff --git a/backends/tofino/bf-asm/tofino/gateway.h b/backends/tofino/bf-asm/tofino/gateway.h new file mode 100644 index 00000000000..6035a80dab7 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/gateway.h @@ -0,0 +1,42 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance 
 with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_GATEWAY_H_
+#define BACKENDS_TOFINO_BF_ASM_TOFINO_GATEWAY_H_
+
+#include "backends/tofino/bf-asm/tables.h"
+
+class Target::Tofino::GatewayTable : public ::GatewayTable {
+    friend class ::GatewayTable;
+    GatewayTable(int line, const char *n, gress_t gr, Stage *s, int lid)
+        : ::GatewayTable(line, n, gr, s, lid) {}
+
+    void pass1() override;
+    void pass2() override;
+    void pass3() override;
+
+    bool check_match_key(MatchKey &, const std::vector<MatchKey> &, bool) override;
+    int gw_memory_unit() const override { return layout[0].row * 2 + gw_unit; }
+    REGSETS_IN_CLASS(Tofino, TARGET_OVERLOAD, void write_next_table_regs, (mau_regs &), override)
+};
+
+template <class REGS>
+void enable_gateway_payload_exact_shift_ovr(REGS &regs, int bus);
+template <>
+void enable_gateway_payload_exact_shift_ovr(Target::Tofino::mau_regs &regs, int bus);
+
+#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_GATEWAY_H_ */
diff --git a/backends/tofino/bf-asm/tofino/input_xbar.cpp b/backends/tofino/bf-asm/tofino/input_xbar.cpp
new file mode 100644
index 00000000000..5a3334b5cd4
--- /dev/null
+++ b/backends/tofino/bf-asm/tofino/input_xbar.cpp
@@ -0,0 +1,80 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tofino/input_xbar.h" + +template <> +void InputXbar::write_galois_matrix(Target::Tofino::mau_regs ®s, HashTable id, + const std::map &mat) { + int parity_col = -1; + BUG_CHECK(id.type == HashTable::EXACT, "not an exact hash table %d", id.type); + if (hash_table_parity.count(id) && !options.disable_gfm_parity) { + parity_col = hash_table_parity[id]; + } + auto &hash = regs.dp.xbar_hash.hash; + std::set gfm_rows; + for (auto &col : mat) { + int c = col.first; + // Skip parity column encoding, if parity is set overall parity is + // computed later below + if (c == parity_col) continue; + const HashCol &h = col.second; + for (int word = 0; word < 4; word++) { + unsigned data = h.data.getrange(word * 16, 16); + unsigned valid = (h.valid >> word * 2) & 3; + if (data == 0 && valid == 0) continue; + auto &w = hash.galois_field_matrix[id.index * 4 + word][c]; + w.byte0 = data & 0xff; + w.byte1 = (data >> 8) & 0xff; + w.valid0 = valid & 1; + w.valid1 = (valid >> 1) & 1; + gfm_rows.insert(id.index * 4 + word); + } + } + // A GFM row can be shared by multiple tables. In most cases the columns are + // non overlapping but if they are overlapping the GFM encodings must be the + // same (e.g. ATCAM tables). The input xbar has checks to determine which + // cases are valid. + // The parity must be computed for all columns within the row and set into + // the parity column. 
+    if (parity_col >= 0) {
+        for (auto r : gfm_rows) {
+            int hp_byte0 = 0, hp_byte1 = 0;
+            int hp_valid0 = 0, hp_valid1 = 0;
+            for (auto c = 0; c < 52; c++) {
+                if (c == parity_col) continue;
+                auto &w = hash.galois_field_matrix[r][c];
+                hp_byte0 ^= w.byte0;
+                hp_byte1 ^= w.byte1;
+                hp_valid0 ^= w.valid0;
+                hp_valid1 ^= w.valid1;
+            }
+            auto &w_hp = hash.galois_field_matrix[r][parity_col];
+            w_hp.byte0.rewrite();
+            w_hp.byte1.rewrite();
+            w_hp.valid0.rewrite();
+            w_hp.valid1.rewrite();
+            w_hp.byte0 = hp_byte0;
+            w_hp.byte1 = hp_byte1;
+            w_hp.valid0 = hp_valid0;
+            w_hp.valid1 = hp_valid1;
+        }
+    }
+}
+
+template void InputXbar::write_galois_matrix(Target::Tofino::mau_regs &regs, HashTable id,
+                                             const std::map<int, HashCol> &mat);
diff --git a/backends/tofino/bf-asm/tofino/input_xbar.h b/backends/tofino/bf-asm/tofino/input_xbar.h
new file mode 100644
index 00000000000..5fcf746e7ce
--- /dev/null
+++ b/backends/tofino/bf-asm/tofino/input_xbar.h
@@ -0,0 +1,27 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_INPUT_XBAR_H_
+#define BACKENDS_TOFINO_BF_ASM_TOFINO_INPUT_XBAR_H_
+
+#include "backends/tofino/bf-asm/input_xbar.h"
+
+template <>
+void InputXbar::write_galois_matrix(Target::Tofino::mau_regs &regs, HashTable id,
+                                    const std::map<int, HashCol> &mat);
+
+#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_INPUT_XBAR_H_ */
diff --git a/backends/tofino/bf-asm/tofino/instruction.cpp b/backends/tofino/bf-asm/tofino/instruction.cpp
new file mode 100644
index 00000000000..e2ca6ad7b87
--- /dev/null
+++ b/backends/tofino/bf-asm/tofino/instruction.cpp
@@ -0,0 +1,56 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* Tofino overloads for instructions #included in instruction.cpp
+ * WARNING -- this is included in an anonymous namespace, as VLIWInstruction is
+ * in that anonymous namespace */
+
+void VLIWInstruction::write_regs(Target::Tofino::mau_regs &regs, Table *tbl,
+                                 Table::Actions::Action *act) {
+    if (act != tbl->stage->imem_addr_use[tbl->gress][act->addr]) {
+        LOG3("skipping " << tbl->name() << '.'
<< act->name << " as its imem is used by " + << tbl->stage->imem_addr_use[tbl->gress][act->addr]->name); + return; + } + LOG2(this); + auto &imem = regs.dp.imem; + int iaddr = act->addr / ACTION_IMEM_COLORS; + int color = act->addr % ACTION_IMEM_COLORS; + unsigned bits = encode(); + BUG_CHECK(slot >= 0); + switch (Phv::reg(slot)->size) { + case 8: + imem.imem_subword8[slot - 64][iaddr].imem_subword8_instr = bits; + imem.imem_subword8[slot - 64][iaddr].imem_subword8_color = color; + imem.imem_subword8[slot - 64][iaddr].imem_subword8_parity = parity(bits) ^ color; + break; + case 16: + imem.imem_subword16[slot - 128][iaddr].imem_subword16_instr = bits; + imem.imem_subword16[slot - 128][iaddr].imem_subword16_color = color; + imem.imem_subword16[slot - 128][iaddr].imem_subword16_parity = parity(bits) ^ color; + break; + case 32: + imem.imem_subword32[slot][iaddr].imem_subword32_instr = bits; + imem.imem_subword32[slot][iaddr].imem_subword32_color = color; + imem.imem_subword32[slot][iaddr].imem_subword32_parity = parity(bits) ^ color; + break; + default: + BUG(); + } + auto &power_ctl = regs.dp.actionmux_din_power_ctl; + phvRead([&](const Phv::Slice &sl) { set_power_ctl_reg(power_ctl, sl.reg.mau_id()); }); +} diff --git a/backends/tofino/bf-asm/tofino/match_table.cpp b/backends/tofino/bf-asm/tofino/match_table.cpp new file mode 100644 index 00000000000..57513269851 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/match_table.cpp @@ -0,0 +1,75 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* mau table template specializations for tofino -- #included directly in match_tables.cpp */ + +template <> +void MatchTable::write_next_table_regs(Target::Tofino::mau_regs ®s, Table *tbl) { + auto &merge = regs.rams.match.merge; + // Copies the values directly from the hit map provided by the compiler directly into the + // map + if (!tbl->get_hit_next().empty()) { + merge.next_table_map_en |= (1U << logical_id); + auto &mp = merge.next_table_map_data[logical_id]; + ubits<8> *map_data[8] = {&mp[0].next_table_map_data0, &mp[0].next_table_map_data1, + &mp[0].next_table_map_data2, &mp[0].next_table_map_data3, + &mp[1].next_table_map_data0, &mp[1].next_table_map_data1, + &mp[1].next_table_map_data2, &mp[1].next_table_map_data3}; + int index = 0; + for (auto &n : tbl->get_hit_next()) *map_data[index++] = n.next_table_id(); + } + + merge.next_table_format_data[logical_id].match_next_table_adr_mask = next_table_adr_mask; + + /** + * Unfortunately for the compiler/driver integration, this register is both required + * to be owned by the compiler and the driver. The driver is responsible for programming + * this register when the default action of a table is specified. The value written + * is the next_table_full of that particular action. + * + * However, the compiler owns this register in the following scenarios: + * 1. For match_with_no_key tables, where the pathway is through the hit pathway, + * the driver does not touch this register, as the values are actually reversed + * 2. For a table that is split into multiple tables, the driver only writes the + * last value. Thus the compiler now sets up this register for all tables + * before this. 
+ */ + merge.next_table_format_data[logical_id].match_next_table_adr_miss_value = + tbl->get_miss_next().next_table_id(); + /** + * The next_table_format_data register is built up of three values: + * - match_next_table_adr_miss_value - Configurable at runtime + * - match_next_table_adr_mask - Static Config + * - match_next_table_adr_default - Static Config + * + * In order to reprogram the register at runtime, the driver must have all three values to + * not require a hardware read, even though only one is truly programmable. Thus in the + * context JSON, we provide the two extra values in an extremely poorly named JSON + * + * ERROR: Driver doesn't read the match_next_table_adr_default + * "default_next_table_mask" - match_next_table_adr_mask + * "default_next_table" - Only required if a table has no default_action specified, which is + * only a Glass value. This could always be 0. Perhaps we can remove from Brig through + * compiler version? + * + */ +} + +template <> +void MatchTable::write_regs(Target::Tofino::mau_regs ®s, int type, Table *result) { + write_common_regs(regs, type, result); +} diff --git a/backends/tofino/bf-asm/tofino/meter.h b/backends/tofino/bf-asm/tofino/meter.h new file mode 100644 index 00000000000..f812ecee37c --- /dev/null +++ b/backends/tofino/bf-asm/tofino/meter.h @@ -0,0 +1,39 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_METER_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_METER_H_ + +#include "backends/tofino/bf-asm/tables.h" + +class Target::Tofino::MeterTable : public ::MeterTable { + friend class ::MeterTable; + MeterTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::MeterTable(line, n, gr, s, lid) {} +}; + +template <> +void MeterTable::setup_teop_regs(Target::Tofino::mau_regs &, int) { + BUG(); // no teop on tofino +} + +template <> +void MeterTable::write_alu_vpn_range(Target::Tofino::mau_regs &) { + BUG(); // not available on tofino +} + +#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_METER_H_ */ diff --git a/backends/tofino/bf-asm/tofino/parser.cpp b/backends/tofino/bf-asm/tofino/parser.cpp new file mode 100644 index 00000000000..2ef8f7b510b --- /dev/null +++ b/backends/tofino/bf-asm/tofino/parser.cpp @@ -0,0 +1,1631 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include + +#include "backends/tofino/bf-asm/misc.h" +#include "backends/tofino/bf-asm/parser-tofino-jbay.h" +#include "backends/tofino/bf-asm/target.h" +#include "backends/tofino/bf-asm/top_level.h" + +// ---------------------------------------------------------------------------- +// Slots & Useful constants +// ---------------------------------------------------------------------------- + +// Following constants are used for detection of unused slot (e.g., initial value) and +// minimal/maximal indexes for extractor slots +static unsigned EXTRACT_SLOT_UNUSED = 511; +static unsigned EXTRACT_SLOT_CONSTANT_DIS = 0; +static unsigned EXTRACT_SLOT_CONSTANT_EN = 1; +static unsigned EXTRACT_SLOT_CONSTANT_ZERO = 0; +static unsigned PHV_MIN_INDEX = 0; +static unsigned PHV_MAX_INDEX = 224; +static unsigned TPHV_MIN_INDEX = 256; +static unsigned TPHV_MAX_INDEX = 368; + +/* remapping structure for getting at the config bits for phv output + * programming in a systematic way */ +struct tofino_phv_output_map { + int size; /* 8, 16, or 32 */ + ubits<9> *dst; + ubits_base *src; /* 6 or 8 bits */ + ubits<1> *src_type, *offset_add, *offset_rot; +}; +std::ostream &operator<<(std::ostream &of, const tofino_phv_output_map *om) { + of << om->size << "bit, dst = " << std::to_string(om->dst->value) + << ", src = " << std::to_string(om->src->value); + if (om->src_type) of << ", src_type = " << std::to_string(om->src_type->value); + if (om->offset_add) of << ", offset_add = " << std::to_string(om->offset_add->value); + if (om->offset_rot) of << ", offset_rot = " << std::to_string(om->offset_rot->value); + return of; +} +enum extractor_slots { + /* enum for indexes in the tofino_phv_output_map */ + phv_32b_0, + phv_32b_1, + phv_32b_2, + phv_32b_3, + phv_16b_0, + phv_16b_1, + phv_16b_2, + phv_16b_3, + phv_8b_0, + phv_8b_1, + phv_8b_2, + phv_8b_3, + tofino_phv_output_map_size, +}; + +// 
PHV use slots: ordered list of slots to try +// +// For example, when trying to find a 32b slot: +// 1. First try the 4 x 32b extractors. +// 2. If that fails, try pairs of 16b extractors. +// 3. If that still fails, finally try the 8b extractors together. +// +// For checksums, allocate in the reverse order as we need to fill +// from the last container back due to a HW bug (see MODEL-210). +// +// FIXME: what does "shift" represent??? +static struct phv_use_slots { + int idx; + unsigned usemask, shift, size; +} phv_32b_slots[] = {{phv_32b_0, 1U << phv_32b_0, 0, 32}, {phv_32b_1, 1U << phv_32b_1, 0, 32}, + {phv_32b_2, 1U << phv_32b_2, 0, 32}, {phv_32b_3, 1U << phv_32b_3, 0, 32}, + {phv_16b_0, 3U << phv_16b_0, 16, 16}, {phv_16b_2, 3U << phv_16b_2, 16, 16}, + {phv_8b_0, 0xfU << phv_8b_0, 24, 8}, {0, 0, 0, 0}}, + phv_16b_slots[] = {{phv_16b_0, 1U << phv_16b_0, 0, 16}, + {phv_16b_1, 1U << phv_16b_1, 0, 16}, + {phv_16b_2, 1U << phv_16b_2, 0, 16}, + {phv_16b_3, 1U << phv_16b_3, 0, 16}, + {phv_8b_0, 3U << phv_8b_0, 8, 8}, + {phv_8b_2, 3U << phv_8b_2, 8, 8}, + {0, 0, 0, 0}}, + phv_8b_slots[] = {{phv_8b_0, 1U << phv_8b_0, 0, 8}, + {phv_8b_1, 1U << phv_8b_1, 0, 8}, + {phv_8b_2, 1U << phv_8b_2, 0, 8}, + {phv_8b_3, 1U << phv_8b_3, 0, 8}, + {0, 0, 0, 0}}, + phv_32b_csum_slots[] = {{phv_32b_3, 1U << phv_32b_3, 0, 32}, + {phv_32b_2, 1U << phv_32b_2, 0, 32}, + {phv_32b_1, 1U << phv_32b_1, 0, 32}, + {phv_32b_0, 1U << phv_32b_0, 0, 32}, + {phv_16b_2, 3U << phv_16b_2, 16, 16}, + {phv_16b_0, 3U << phv_16b_0, 16, 16}, + {phv_8b_0, 0xfU << phv_8b_0, 24, 8}, + {0, 0, 0, 0}}, + phv_16b_csum_slots[] = {{phv_16b_3, 1U << phv_16b_3, 0, 16}, + {phv_16b_2, 1U << phv_16b_2, 0, 16}, + {phv_16b_1, 1U << phv_16b_1, 0, 16}, + {phv_16b_0, 1U << phv_16b_0, 0, 16}, + {phv_8b_2, 3U << phv_8b_2, 8, 8}, + {phv_8b_0, 3U << phv_8b_0, 8, 8}, + {0, 0, 0, 0}}, + phv_8b_csum_slots[] = {{phv_8b_3, 1U << phv_8b_3, 0, 8}, + {phv_8b_2, 1U << phv_8b_2, 0, 8}, + {phv_8b_1, 1U << phv_8b_1, 0, 8}, + {phv_8b_0, 1U << 
phv_8b_0, 0, 8}, + {0, 0, 0, 0}}; + +static phv_use_slots *get_phv_use_slots(int size) { + phv_use_slots *usable_slots = nullptr; + + if (size == 32) + usable_slots = phv_32b_slots; + else if (size == 16) + usable_slots = phv_16b_slots; + else if (size == 8) + usable_slots = phv_8b_slots; + else + BUG(); + + return usable_slots; +} + +static phv_use_slots *get_phv_csum_use_slots(int size) { + phv_use_slots *usable_slots = nullptr; + + if (size == 32) + usable_slots = phv_32b_csum_slots; + else if (size == 16) + usable_slots = phv_16b_csum_slots; + else if (size == 8) + usable_slots = phv_8b_csum_slots; + else + BUG(); + + return usable_slots; +} + +// ---------------------------------------------------------------------------- +// Helping classes +// ---------------------------------------------------------------------------- + +/// Helping cache to remember values for a different parser objects +/// based on the type of extraction size. The storage is done into two +/// layers and user is free to specify the values of layer 1 and layer 2 +/// types. The third type specifies the return value +template +class TwoLevelCache { + std::map> m_cache; + + public: + void insert(const T1 key1, T2 key2, T3 val) { m_cache[key1][key2] = val; } + + bool has(const T1 key1, T2 key2) const { + if (!m_cache.count(key1)) return false; + auto level1 = m_cache.at(key1); + + return level1.count(key2); + } + + T3 get(const T1 key1, T2 key2) const { return m_cache.at(key1).at(key2); } +}; + +/** + * @brief This class is used for internal tracking of Tofino output map + * extractor allocation. It is beneficial during the debugging process of this + * functionality because it can provid the answer to question: + * "What is alloacted into this extractor slot?" 
+ */ +class MatchSlotTracker { + using SetMap = + TwoLevelCache; + using SaveMap = + TwoLevelCache; + using CsumMap = TwoLevelCache; + using PaddingMap = TwoLevelCache; + + public: + // Helping caches for tracking of slot occupancy mapping + SetMap setMap; + SaveMap saveMap; + CsumMap csumMap; + PaddingMap padMap; + + /** + * @brief Get the db slots object + * + * @param match Match line to dump + * @param slot_idx Passed index of slot which needs to be dumped + * @return std::string object with dumped data + */ + std::string get_db_slots(const Parser::State::Match *match, const int slot_idx) const { + std::stringstream ss; + ss << "Mapping for state " << match->state->name; + if (match->match) ss << ", match " << match->match; + ss << ", slot " << slot_idx << ": "; + + if (setMap.has(match, slot_idx)) { + auto set = setMap.get(match, slot_idx); + ss << "set, " << set->where; + } else if (saveMap.has(match, slot_idx)) { + auto save = saveMap.get(match, slot_idx); + ss << "save, " << save->where; + } else if (csumMap.has(match, slot_idx)) { + auto csum = csumMap.get(match, slot_idx); + ss << "csum, " << csum->dest; + } else if (padMap.has(match, slot_idx)) { + ss << "fake extraction padding, " << padMap.get(match, slot_idx); + } else { + ss << ""; + } + + return ss.str(); + } +}; + +static MatchSlotTracker matchSlotTracker; + +// ---------------------------------------------------------------------------- +// Parser configuration dump +// ---------------------------------------------------------------------------- + +template <> +void Parser::Checksum::write_config(Target::Tofino::parser_regs ®s, Parser *parser) { + if (unit == 0) + write_tofino_row_config(regs.memory[gress].po_csum_ctrl_0_row[addr]); + else if (unit == 1) + write_tofino_row_config(regs.memory[gress].po_csum_ctrl_1_row[addr]); + else + error(lineno, "invalid unit for parser checksum"); +} + +template <> +void Parser::CounterInit::write_config(Target::Tofino::parser_regs ®s, gress_t gress, int idx) { 
+ auto &ctr_init_ram = regs.memory[gress].ml_ctr_init_ram[idx]; + ctr_init_ram.add = add; + ctr_init_ram.mask = mask; + ctr_init_ram.rotate = rot; + ctr_init_ram.max = max; + ctr_init_ram.src = src; +} + +template <> +void Parser::RateLimit::write_config(::Tofino::regs_pipe ®s, gress_t gress) { + if (gress == INGRESS) { + auto &ctrl = regs.pmarb.parb_reg.parb_group.i_output_rate_ctrl; + ctrl.ratectrl_inc = inc; + ctrl.ratectrl_dec = dec; + ctrl.ratectrl_max = max; + ctrl.ratectrl_ena = 1; + } else if (gress == EGRESS) { + auto &ctrl = regs.pmarb.parb_reg.parb_group.e_output_rate_ctrl; + ctrl.ratectrl_inc = inc; + ctrl.ratectrl_dec = dec; + ctrl.ratectrl_max = max; + ctrl.ratectrl_ena = 1; + } +} + +template <> +void Parser::State::Match::write_lookup_config(Target::Tofino::parser_regs ®s, State *state, + int row) const { + auto &word0 = regs.memory[state->gress].ml_tcam_row_word0[row]; + auto &word1 = regs.memory[state->gress].ml_tcam_row_word1[row]; + match_t lookup = {0, 0}; + unsigned dont_care = 0; + for (int i = 0; i < 4; i++) { + lookup.word0 <<= 8; + lookup.word1 <<= 8; + dont_care <<= 8; + if (state->key.data[i].bit >= 0) { + lookup.word0 |= ((match.word0 >> state->key.data[i].bit) & 0xff); + lookup.word1 |= ((match.word1 >> state->key.data[i].bit) & 0xff); + } else { + dont_care |= 0xff; + } + } + lookup.word0 |= dont_care; + lookup.word1 |= dont_care; + word0.lookup_16 = (lookup.word0 >> 16) & 0xffff; + word1.lookup_16 = (lookup.word1 >> 16) & 0xffff; + word0.lookup_8[0] = (lookup.word0 >> 8) & 0xff; + word1.lookup_8[0] = (lookup.word1 >> 8) & 0xff; + word0.lookup_8[1] = lookup.word0 & 0xff; + word1.lookup_8[1] = lookup.word1 & 0xff; + word0.curr_state = state->stateno.word0; + word1.curr_state = state->stateno.word1; + if (state->key.ctr_zero >= 0) { + word0.ctr_zero = (match.word0 >> state->key.ctr_zero) & 1; + word1.ctr_zero = (match.word1 >> state->key.ctr_zero) & 1; + } else { + word0.ctr_zero = word1.ctr_zero = 1; + } + + if (state->key.ctr_neg >= 
0) { + word0.ctr_neg = (match.word0 >> state->key.ctr_neg) & 1; + word1.ctr_neg = (match.word1 >> state->key.ctr_neg) & 1; + } else { + word0.ctr_neg = word1.ctr_neg = 1; + } + + word0.ver_0 = word1.ver_0 = 1; + word0.ver_1 = word1.ver_1 = 1; +} + +/* FIXME -- combine these next two methods into a single method on MatchKey */ +/* FIXME -- factor Tofino/JBay variation better (most is common) */ +template <> +int Parser::State::write_lookup_config(Target::Tofino::parser_regs ®s, Parser *pa, State *state, + int row, const std::vector &prev) { + LOG2("-- checking match from state " << name << " (" << stateno << ')'); + auto &ea_row = regs.memory[gress].ml_ea_row[row]; + int max_off = -1; + for (int i = 0; i < 4; i++) { + if (i == 1) continue; + if (key.data[i].bit < 0) continue; + bool set = true; + for (State *p : prev) { + if (p->key.data[i].bit >= 0) { + set = false; + if (p->key.data[i].byte != key.data[i].byte) + error(p->lineno, + "Incompatible match fields between states " + "%s and %s, triggered from state %s", + name.c_str(), p->name.c_str(), state->name.c_str()); + } + } + if (set && key.data[i].byte != MatchKey::USE_SAVED) { + int off = key.data[i].byte + ea_row.shift_amt; + if (off < 0 || off >= 32) { + error(key.lineno, + "Match offset of %d in state %s out of range " + "for previous state %s", + key.data[i].byte, name.c_str(), state->name.c_str()); + } else if (i) { + ea_row.lookup_offset_8[(i - 2)] = off; + ea_row.ld_lookup_8[(i - 2)] = 1; + max_off = std::max(max_off, off); + } else { + ea_row.lookup_offset_16 = off; + ea_row.ld_lookup_16 = 1; + max_off = std::max(max_off, off + 1); + } + } + } + return max_off; +} + +template <> +int Parser::State::Match::write_load_config(Target::Tofino::parser_regs ®s, Parser *pa, + State *state, int row) const { + auto &ea_row = regs.memory[state->gress].ml_ea_row[row]; + int max_off = -1; + for (int i = 0; i < 4; i++) { + if (i == 1) continue; + if (load.data[i].bit < 0) continue; + if (load.data[i].byte != 
MatchKey::USE_SAVED) { + int off = load.data[i].byte; + if (off < 0 || off >= 32) { + error(load.lineno, "Load offset of %d in state %s out of range", load.data[i].byte, + state->name.c_str()); + } else if (i) { + ea_row.lookup_offset_8[(i - 2)] = off; + ea_row.ld_lookup_8[(i - 2)] = 1; + max_off = std::max(max_off, off); + } else { + ea_row.lookup_offset_16 = off; + ea_row.ld_lookup_16 = 1; + max_off = std::max(max_off, off + 1); + } + } + } + return max_off; +} + +// Narrow-to-wide extraction alignment needs adjusting when +// 8b/16b checksum validations are written in the same cycle +bool adjust_phv_use_slot(phv_use_slots &slot, int size, int csum_8b, int csum_16b) { + if ((size == 32 && slot.idx >= phv_16b_0) || (size == 16 && slot.idx >= phv_8b_0)) { + if (slot.idx <= phv_16b_3) { + slot.idx -= csum_16b; + slot.usemask >>= csum_16b; + return slot.idx >= phv_16b_0; + } else { + slot.idx -= csum_8b; + slot.usemask >>= csum_8b; + return slot.idx >= phv_8b_0; + } + } + return true; +} + +template <> +void Parser::Checksum::write_output_config(Target::Tofino::parser_regs ®s, Parser *pa, + State::Match *ma, void *_map, unsigned &used) const { + if (type != 0 || !dest) return; + + // checksum verification requires the last extractor to be a dummy (to work around a RTL bug) + // see MODEL-210 for discussion. + + tofino_phv_output_map *map = reinterpret_cast(_map); + + phv_use_slots *usable_slots = get_phv_csum_use_slots(dest->reg.size); + + auto &slot = usable_slots[0]; + + auto id = dest->reg.parser_id(); + *map[slot.idx].dst = id; + matchSlotTracker.csumMap.insert(ma, slot.idx, this); + // The source address is checked for source extract errors whenever the dest + // is not 511. To prevent errors when buf_req = 0 (corresponding to states with no extracts), + // point the source to the version area of the source range which is always valid. 
+ *map[slot.idx].src = PARSER_SRC_MAX_IDX - (dest->reg.size / 8) + 1; + used |= slot.usemask; + + pa->phv_allow_bitwise_or[id] = 1; +} + +template <> +int Parser::State::Match::Save::write_output_config(Target::Tofino::parser_regs ®s, void *_map, + unsigned &used, int csum_8b, + int csum_16b) const { + tofino_phv_output_map *map = reinterpret_cast(_map); + + int slot_size = (hi - lo + 1) * 8; + phv_use_slots *usable_slots = get_phv_use_slots(slot_size); + + for (int i = 0; usable_slots[i].usemask; i++) { + auto slot = usable_slots[i]; + if (!adjust_phv_use_slot(slot, where->reg.size, csum_8b, csum_16b)) continue; + if (used & slot.usemask) continue; + if ((flags & ROTATE) && !map[slot.idx].offset_rot) continue; + + if ((where->reg.size == 32 && slot.idx >= phv_16b_0) || + (where->reg.size == 16 && slot.idx >= phv_8b_0)) { + match->has_narrow_to_wide_extract = true; + + if (where->reg.size == 32 && slot.idx == phv_8b_0) { + match->narrow_to_wide_32b_8.push_back(&where); + } else if (where->reg.size == 32 && slot.idx >= phv_16b_0) { + match->narrow_to_wide_32b_16.push_back(&where); + } else { + match->narrow_to_wide_16b_8.push_back(&where); + } + } + + // swizzle upper/lower pairs of extractors for 4x8->32 + // a 32b value using 8b extractors must use the extractors in this order: [2 3 0 1] + bool swizzle_b1 = where->reg.size == 32 && slot.idx == phv_8b_0; + + int byte = lo; + for (int i = slot.idx; slot.usemask & (1U << i); i++, byte += slot.size / 8U) { + int x = i; + if (swizzle_b1) x ^= 2; + + *map[x].dst = where->reg.parser_id(); + *map[x].src = byte; + matchSlotTracker.saveMap.insert(match, x, this); + if (flags & OFFSET) *map[x].offset_add = 1; + if (flags & ROTATE) *map[x].offset_rot = 1; + } + used |= slot.usemask; + return hi; + } + error(where.lineno, "Ran out of phv output extractor slots"); + return -1; +} + +bool can_slot_extract_constant(int slot) { + return slot != phv_16b_2 && slot != phv_16b_3 && slot != phv_32b_2 && slot != phv_32b_3; +} + +/** + * 
@brief Encode constant @p val for use with extractor slot @p slot. + * + * @param slot Valid value of enum extractor_slot + * @param val Constant to encode + * @return int The encoded constant, or -1 if given @p slot cannot extract a constant or is not a + * valid value of enum extractor_slot + */ +static int encode_constant_for_slot(int slot, unsigned val) { + if (!can_slot_extract_constant(slot)) return -1; + if (val == 0) return val; + switch (slot) { + case phv_32b_0: + case phv_32b_1: + for (int i = 0; i < 32; i++) { + if ((val & 1) && (0x7 & val) == val) return (i << 3) | val; + val = ((val >> 1) | (val << 31)) & 0xffffffffU; + } + return -1; + case phv_16b_0: + case phv_16b_1: + if ((val >> 16) && encode_constant_for_slot(slot, val >> 16) < 0) return -1; + val &= 0xffff; + for (int i = 0; i < 16; i++) { + if ((val & 1) && (0xf & val) == val) return (i << 4) | val; + val = ((val >> 1) | (val << 15)) & 0xffffU; + } + return -1; + case phv_8b_0: + case phv_8b_1: + case phv_8b_2: + case phv_8b_3: + return val & 0xff; + default: + BUG(); + return -1; + } +} + +template <> +void Parser::State::Match::Set::write_output_config(Target::Tofino::parser_regs ®s, void *_map, + unsigned &used, int csum_8b, + int csum_16b) const { + tofino_phv_output_map *map = reinterpret_cast(_map); + + phv_use_slots *usable_slots = get_phv_use_slots(where->reg.size); + + for (int i = 0; usable_slots[i].usemask; i++) { + auto slot = usable_slots[i]; + if (!adjust_phv_use_slot(slot, where->reg.size, csum_8b, csum_16b)) continue; + if (used & slot.usemask) continue; + if (!map[slot.idx].src_type) continue; + if ((flags & ROTATE) && (!map[slot.idx].offset_rot || slot.shift)) continue; + unsigned shift = 0; + bool can_encode = true; + for (int i = slot.idx; slot.usemask & (1U << i); i++) { + if (encode_constant_for_slot(i, (what << where->lo) >> shift) < 0) { + can_encode = false; + break; + } + shift += slot.size; + } + if (!can_encode) continue; + + if ((where->reg.size == 32 && slot.idx 
>= phv_16b_0) || + (where->reg.size == 16 && slot.idx >= phv_8b_0)) { + match->has_narrow_to_wide_extract = true; + + if (where->reg.size == 32 && slot.idx == phv_8b_0) { + match->narrow_to_wide_32b_8.push_back(&where); + } else if (where->reg.size == 32 && slot.idx >= phv_16b_0) { + match->narrow_to_wide_32b_16.push_back(&where); + } else { + match->narrow_to_wide_16b_8.push_back(&where); + } + } + + // swizzle upper/lower pairs of extractors for 4x8->32 + // a 32b value using 8b extractors must use the extractors in this order: [2 3 0 1] + bool swizzle_b1 = where->reg.size == 32 && slot.idx == phv_8b_0; + + // Go from most- to least-significant slice + shift = where->reg.size - slot.size; + for (int i = slot.idx; slot.usemask & (1U << i); i++) { + int x = i; + if (swizzle_b1) x ^= 2; + + *map[x].dst = where->reg.parser_id(); + *map[x].src_type = 1; + auto v = encode_constant_for_slot(x, (what << where->lo) >> shift); + *map[x].src = v; + matchSlotTracker.setMap.insert(match, x, this); + if (flags & OFFSET) *map[x].offset_add = 1; + if (flags & ROTATE) *map[x].offset_rot = 1; + shift -= slot.size; + } + used |= slot.usemask; + return; + } + error(where.lineno, "Ran out of phv output extractor slots"); +} + +/** Tofino1-specific output map management + * Tofino1 has separate 8- 16- and 32-bit extractors with various limitations on extracting + * constants and capability of ganging extractors to extract larger PHVs or extrating adjacent + * pairs of smaller PHVs. They're also addressed via named registers rather than an array, + * so we build an array of pointers into the reg object to simplify things. The `used` + * value ends up begin a simple 12-bit bitmap with 1 bit for each extractor. 
+ */ + +#define OUTPUT_MAP_INIT(MAP, ROW, SIZE, INDEX) \ + MAP[phv_##SIZE##b_##INDEX].size = SIZE; \ + MAP[phv_##SIZE##b_##INDEX].dst = &ROW.phv_##SIZE##b_dst_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].src = &ROW.phv_##SIZE##b_src_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].src_type = &ROW.phv_##SIZE##b_src_type_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].offset_add = &ROW.phv_##SIZE##b_offset_add_dst_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].offset_rot = &ROW.phv_##SIZE##b_offset_rot_imm_##INDEX; +#define OUTPUT_MAP_INIT_PART(MAP, ROW, SIZE, INDEX) \ + MAP[phv_##SIZE##b_##INDEX].size = SIZE; \ + MAP[phv_##SIZE##b_##INDEX].dst = &ROW.phv_##SIZE##b_dst_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].src = &ROW.phv_##SIZE##b_src_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].src_type = 0; \ + MAP[phv_##SIZE##b_##INDEX].offset_add = &ROW.phv_##SIZE##b_offset_add_dst_##INDEX; \ + MAP[phv_##SIZE##b_##INDEX].offset_rot = 0; + +template <> +void *Parser::setup_phv_output_map(Target::Tofino::parser_regs ®s, gress_t gress, int row) { + static tofino_phv_output_map map[tofino_phv_output_map_size]; + auto &action_row = regs.memory[gress].po_action_row[row]; + OUTPUT_MAP_INIT(map, action_row, 32, 0) + OUTPUT_MAP_INIT(map, action_row, 32, 1) + OUTPUT_MAP_INIT_PART(map, action_row, 32, 2) + OUTPUT_MAP_INIT_PART(map, action_row, 32, 3) + OUTPUT_MAP_INIT(map, action_row, 16, 0) + OUTPUT_MAP_INIT(map, action_row, 16, 1) + OUTPUT_MAP_INIT_PART(map, action_row, 16, 2) + OUTPUT_MAP_INIT_PART(map, action_row, 16, 3) + OUTPUT_MAP_INIT(map, action_row, 8, 0) + OUTPUT_MAP_INIT(map, action_row, 8, 1) + OUTPUT_MAP_INIT(map, action_row, 8, 2) + OUTPUT_MAP_INIT(map, action_row, 8, 3) + return map; +} + +template <> +void Parser::mark_unused_output_map(Target::Tofino::parser_regs ®s, void *_map, unsigned used) { + tofino_phv_output_map *map = reinterpret_cast(_map); + for (int i = 0; i < tofino_phv_output_map_size; i++) + if (!(used & (1U << i))) *map[i].dst = 0x1ff; +} + +template <> +void 
Parser::State::Match::HdrLenIncStop::write_config( + Tofino::memories_all_parser_::_po_action_row &) const { + BUG(); // no hdr_len_inc_stop on tofino; should not get here +} + +template <> +void Parser::State::Match::Clot::write_config(Tofino::memories_all_parser_::_po_action_row &, int, + bool) const { + BUG(); // no CLOTs on tofino; should not get here +} + +template <> +void Parser::State::Match::write_counter_config( + Target::Tofino::parser_regs::_memory::_ml_ea_row &ea_row) const { + ea_row.ctr_amt_idx = ctr_instr ? ctr_instr->addr : ctr_imm_amt; + ea_row.ctr_ld_src = ctr_ld_src; + ea_row.ctr_load = ctr_load; +} + +template +void init_common_regs(Parser *p, COMMON ®s, gress_t gress) { + // TODO: fixed config copied from compiler -- needs to be controllable + for (int i = 0; i < 4; i++) { + if (p->start_state[i]) { + regs.start_state.state[i] = p->start_state[i]->stateno.word1; + regs.enable_.enable_[i] = 1; + } + regs.pri_start.pri[i] = p->priority[i]; + regs.pri_thresh.pri[i] = p->pri_thresh[i]; + } + regs.mode = 4; + regs.max_iter.max = 128; + if (p->parser_error.lineno >= 0) { + regs.err_phv_cfg.dst = p->parser_error->reg.parser_id(); + regs.err_phv_cfg.aram_mbe_en = 1; + regs.err_phv_cfg.ctr_range_err_en = 1; + regs.err_phv_cfg.dst_cont_err_en = 1; + regs.err_phv_cfg.fcs_err_en = 1; + regs.err_phv_cfg.multi_wr_err_en = 1; + regs.err_phv_cfg.no_tcam_match_err_en = 1; + regs.err_phv_cfg.partial_hdr_err_en = 1; + regs.err_phv_cfg.phv_owner_err_en = 1; + regs.err_phv_cfg.src_ext_err_en = 1; + regs.err_phv_cfg.timeout_cycle_err_en = 1; + regs.err_phv_cfg.timeout_iter_err_en = 1; + } +} + +enum class AnalysisType { BIT8, BIT16 }; +using extractor_slots_list = std::initializer_list; +const extractor_slots_list phv_8bit_extractors = {phv_8b_0, phv_8b_1, phv_8b_2, phv_8b_3}; +const extractor_slots_list phv_16bit_extractors = {phv_16b_0, phv_16b_1, phv_16b_2, phv_16b_3}; +// Declare a helping type for the count cache class +using ExtractionCountCache = 
TwoLevelCache; + +/// Count the number of extractions for a given @p match. +/// The method takes the @p elems list which holds PHV indexes to check +/// (accepted lists are @p phv_8bit_extractors and @p phv_16_bit_extractors). +int count_number_of_extractions(Parser *parser, Target::Tofino::parser_regs ®s, + Parser::State::Match *match, const AnalysisType type) { + int used = 0; + int row = parser->match_to_row.at(match); + auto map = reinterpret_cast( + parser->setup_phv_output_map(regs, parser->gress, row)); + + auto elems = type == AnalysisType::BIT8 ? phv_8bit_extractors : phv_16bit_extractors; + for (auto i : elems) { + if (map[i].dst->value != EXTRACT_SLOT_UNUSED) { + used++; + } + } + + return used; +} + +/// Pad collector object which provides mapping from a narrow-to-wide match +/// to added padding +class PaddingInfoCollector { + public: + struct PadInfo { + /// The number of added extractors to work correctly + int m_count8; + int m_count16; + + PadInfo() { + m_count8 = 0; + m_count16 = 0; + } + + void add(const AnalysisType type, const int val) { + if (type == AnalysisType::BIT8) { + m_count8 += val; + } else { + m_count16 += val; + } + } + }; + + /// Information for one parser state where the padding + // is being added + struct PadState { + /// Added padding information into parser states (successors or predecessors) + std::map m_padding; + + void addPadInfo(Parser::State::Match *match, AnalysisType pad, int count) { + if (count == 0) return; + + if (!m_padding.count(match)) { + m_padding[match] = new PadInfo; + } + + m_padding[match]->add(pad, count); + } + + bool hasPadInfo() { return m_padding.size() != 0; } + + void print() { + std::stringstream message; + message << " Pad State Info : " << std::endl; + for (auto &m : m_padding) { + message << " \t State(match) : " << m.first->state->name << "(" << m.first->match + << ")" << " -> { m_count8 : " << m.second->m_count8 + << ", m_count16 : " << m.second->m_count16 << " }" << std::endl; + } + 
LOG1(message.str()); + } + }; + + PadState *getPadState(Parser::State::Match *match) { + if (!m_nrw_matches.count(match)) { + m_nrw_matches[match] = new PadState; + } + + return m_nrw_matches[match]; + } + + void printPadInfo() { + for (auto s : m_nrw_matches) { + auto nrw_match = s.first; + auto info_collector = s.second; + // Skip the info if we don't have any stored padding + if (!info_collector->hasPadInfo()) { + continue; + } + + std::stringstream message; + message << "State " << nrw_match->state->name; + if (nrw_match->match == true) { + message << ", match " << nrw_match->match; + } + + message << " is using the narrow-to-wide extraction: " << std::endl; + + if (nrw_match->narrow_to_wide_32b_16.size() != 0) { + message << "\t* 32 bit extractors are replaced by 2 x 16 bit extractors: "; + for (auto ref : nrw_match->narrow_to_wide_32b_16) { + message << ref->name() << " "; + } + message << std::endl; + } + + if (nrw_match->narrow_to_wide_32b_8.size() != 0) { + message << "\t* 32 bit extractors are replaced by 4 x 8 bit extractors: "; + for (auto ref : nrw_match->narrow_to_wide_32b_8) { + message << ref->name() << " "; + } + message << std::endl; + } + + if (nrw_match->narrow_to_wide_16b_8.size() != 0) { + message << "\t* 16 bit extractors are replaced by 2 x 8 bit extractors: "; + for (auto ref : nrw_match->narrow_to_wide_16b_8) { + message << ref->name() << " "; + } + message << std::endl; + } + + message + << "The following extractions need to be added to parser states to work correctly:" + << std::endl; + + for (auto pad : info_collector->m_padding) { + auto match = pad.first; + auto pad_info = pad.second; + + message << "\t* State " << match->state->name; + if (match->match == true) { + message << ", match " << match->match; + } + + message << " needs " << pad_info->m_count8 << " x 8 bit and " << pad_info->m_count16 + << " x 16 bit extractions to be added" << std::endl; + } + + LOG1("WARNING: " << message.str()); + } + } + + private: + /// This provides 
mapping between the narrow-to-wide (NRW) match and collected + /// padding information + std::map m_nrw_matches; +}; + +/// Size of internal parser FIFO +static const int parser_fifo_size = 32; + +/// Compute the @p val / @p div and ceil it to the nearest upper value. The result +/// will be wrapped to the FIFO size in parser. +int ceil_and_wrap_to_fifo_size(int val, int div) { + int fifo_items = val > parser_fifo_size ? parser_fifo_size : val; + return (fifo_items + div - 1) / div; +} + +int analyze_worst_extractor_path(Parser *parser, Target::Tofino::parser_regs ®s, + Parser::State::Match *match, AnalysisType type, + std::set &visited, ExtractionCountCache &cache) { + if (visited.count(match->state)) { + // We have found a node in a loop --> we will get via our predessors into the same state. + // This means that we can take this loop many times and that we need to distribute the + // maximal FIFO value to our parets (we are predecessors of our parents). + // + // IN SUCH CASE THE NUMBER OF EXTRACTIONS FOR ALL SUCCESSORS DOESN'T CATCH + // THE REALITY. IT IS JUST FOR THE SIMULATION OF FULL PARSER FIFO BLOCKS. 
+ // REALITY IS COVERED WHEN THE GRAPH DOESN'T HAVE LOOPS + return parser_fifo_size; + } + + if (LOGGING(3)) { + std::stringstream ss; + ss << "Processing match " << match->state->name << ", gress = " << match->state->gress; + if (match->match) { + ss << ", match " << match->match; + } + LOG3(ss.str()); + } + + // Check the cache if we know the result + if (cache.has(match, type)) { + return cache.get(match, type); + } + + // Mark node as visited and run the analysis + visited.insert(match->state); + int extractions = count_number_of_extractions(parser, regs, match, type); + int pred_extractions = 0; + for (auto pred : match->state->pred) { + pred_extractions = + std::max(pred_extractions, + analyze_worst_extractor_path(parser, regs, pred, type, visited, cache)); + } + + // Insert the result into the cache and unmark the node as visited + visited.erase(match->state); + int extraction_result = extractions + pred_extractions; + cache.insert(match, type, extraction_result); + + return extraction_result; +} + +/** + * @brief Dump the occupancy of extraction slots in output map + * + * @param match Current match which is being processed + * @param indexes Indexes to inspect + * @param prefix Prefix to add before the print + */ +static void print_slot_occupancy(const Parser::State::Match *match, + const std::initializer_list indexes, + const std::string prefix = "") { + // Print the prefix if not empty, iterate over checked indexes and + // print slot occupancy information. 
+ std::stringstream ss; + std::string sep; + if (prefix != "") { + ss << prefix << " : " << std::endl; + sep = "\t* "; + } + + for (auto idx : indexes) { + ss << sep << matchSlotTracker.get_db_slots(match, idx) << std::endl; + } + + auto output = ss.str(); + if (output.size() == 0) return; + LOG5(output); +} + +/** + * @brief Set the @p pad_idx or @p from_idx based on the used extractor scenario + * + * @param pad_idx Input/output for padding index + * @param from_idx Input/output for source index + * @param used Number of used extractors + * @param has_csum State is using the VERIFY checksum + * @param match State match which is being processed + * @param map Pointer on the output map configuration + */ +static void set_idx_for_16b_extractions(unsigned &pad_idx, unsigned &from_idx, const unsigned used, + const bool has_csum, const Parser::State::Match *match, + struct tofino_phv_output_map *map) { + if (used == 1) { + // One extractor is being used and the index is stored in from_idx. The allocation + // strategy here is to keep data in tuples {0,1} and {2,3}. + if (from_idx == phv_16b_0 || from_idx == phv_16b_2) { + pad_idx = from_idx + 1; + } else if (from_idx == phv_16b_1 || from_idx == phv_16b_3) { + pad_idx = from_idx - 1; + } else { + // We should never reach this point + error(match->lineno, + "Cannot identify index for 16bit extractor padding (1 extractor)!"); + } + } else { + // Three extractors are used and the unused extractor index is stored in + // the pad_idx variable. 
We can keep indexes in tuples + if (has_csum) { + from_idx = phv_16b_3; + } else if (pad_idx == phv_16b_0 || pad_idx == phv_16b_2) { + from_idx = pad_idx + 1; + } else if (pad_idx == phv_16b_1 || pad_idx == phv_16b_3) { + from_idx = pad_idx - 1; + } else { + // We should never reach this point + error(match->lineno, + "Cannot identify index for 16bit extractor padding (3 extractors)!"); + } + } +} + +/** + * @brief Verify all invariants for the extractor padding configuration + * + * @param pad_idx Input/output for padding index + * @param from_idx Input/output for source index + * @param used Number of used extractors + * @param has_csum State is using the VERIFY checksum + * @param match State match which is being processed + * @param map Pointer on the output map configuration + */ +static void check_16b_extractor_configuration(const unsigned pad_idx, const unsigned from_idx, + const unsigned used, const bool has_csum, + const struct tofino_phv_output_map *map) { + // 1] Indexes are kept in tuples {0,1} and {2,3}. Checksum means that from_idx is + // set to the last extractor. + bool csum = has_csum && (from_idx == phv_16b_3); + bool first_tuple = (from_idx == phv_16b_0 || from_idx == phv_16b_1) && (pad_idx <= phv_16b_1); + bool second_tuple = (from_idx == phv_16b_2 || from_idx == phv_16b_3) && (pad_idx >= phv_16b_2); + BUG_CHECK(has_csum || first_tuple || second_tuple, + "Source and destination index are not configured correctly for 16bit 2n extractor " + "padding!"); + + // 2] All indexes are sourced from global version field which is tied to zeros. 
+ // The Checksum case means that we need to set the from index on the last 16b extractor + BUG_CHECK(map[pad_idx].dst->value != EXTRACT_SLOT_UNUSED, + "Invalid extractor destination for 16bit 2n padding!"); + if (has_csum) { + BUG_CHECK(from_idx == phv_16b_3, + "Invalid from_idx for the 16bit 2n padding with checksum!"); + } + + // Check the slot configuration - sourcing from global field and no constant for {0,1} + BUG_CHECK( + *map[pad_idx].src >= PARSER_SRC_MAX_IDX - 3 && *map[pad_idx].src != EXTRACT_SLOT_UNUSED, + "Field is not sourcing from the global version field!"); + if (pad_idx == phv_16b_0 || pad_idx == phv_16b_1) { + BUG_CHECK(*map[pad_idx].src_type == 0, + "Invalid configuration of the source type for 16b 2n padding!"); + } +} + +// +// uArch for Tofino Parser: +// * Parser Output section +// * Checksum section +// * Parse Merge section + +/** + * @brief Perform the 16b padding onto map and computed index. + * + * @param regs Register configuration instance + * @param map Tofino output map pointer + * @param pad_idx Index which needs to be padded + * @param from_idx Index which is the source of the slot configuration + */ +static void do_16b_padding(Target::Tofino::parser_regs ®s, tofino_phv_output_map *map, + const unsigned pad_idx, const unsigned from_idx) { + // Add fake extractors to reach 2n constraint, we need to copy destination and source from + // the global version field which is tied to zeros in RTL. + // We are keeping both indexes in tuples {0,1} or {2,3}. 
+ map[pad_idx].dst->rewrite(); + *map[pad_idx].dst = *map[from_idx].dst; + *map[pad_idx].src = PARSER_SRC_MAX_IDX - 1; + if (pad_idx < phv_16b_2) { + // Even though extractors {0, 1} can extract from constants, we want to extract + // from the tied-to-zero global version field for consistency across all 16b extractors + *map[pad_idx].src_type = EXTRACT_SLOT_CONSTANT_DIS; + } + + // Mark the dummy write dest as multi-write, we need to distinguish between PHV and TPHV + if (*map[from_idx].dst >= PHV_MIN_INDEX && *map[from_idx].dst < PHV_MAX_INDEX) { + regs.ingress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst].rewrite(); + regs.egress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst].rewrite(); + + regs.ingress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst] = 0; + regs.egress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst] = 0; + } else if (*map[from_idx].dst >= TPHV_MIN_INDEX && *map[from_idx].dst < TPHV_MAX_INDEX) { + auto tphv_idx = *map[from_idx].dst - TPHV_MIN_INDEX; + regs.ingress.prsr_reg.no_multi_wr.t_nmw[tphv_idx].rewrite(); + regs.egress.prsr_reg.no_multi_wr.t_nmw[tphv_idx].rewrite(); + + regs.ingress.prsr_reg.no_multi_wr.t_nmw[tphv_idx] = 0; + regs.egress.prsr_reg.no_multi_wr.t_nmw[tphv_idx] = 0; + } +} + +/// Add the fake extractions to have 2n 16b extractions. The function returns +/// the number of added extractions. +static int pad_to_16b_extracts_to_2n(Parser *parser, Target::Tofino::parser_regs ®s, + Parser::State::Match *match) { + // Obtain the slot configuration for given row + int row = parser->match_to_row.at(match); + auto map = reinterpret_cast( + parser->setup_phv_output_map(regs, parser->gress, row)); + if (LOGGING(5)) { + print_slot_occupancy(match, phv_16bit_extractors, "Before 16bit padding"); + } + + // Count number of used extractors - the number of extractions/constant set operations and + // checksums should be the padded to 2n. 
+ unsigned used = 0; + unsigned pad_idx = 0; + unsigned from_idx = 0; + bool from_idx_is_8b_16b = false; + for (auto i : phv_16bit_extractors) { + if (map[i].dst->value == EXTRACT_SLOT_UNUSED) { + pad_idx = i; + } else { + used++; + // Try to get an 8b or 16b index. + // If we use a single 32b index to pad then we'll hit problems + // because we need 2 x 16b to fill a 32b container. + if (!from_idx_is_8b_16b) { + from_idx = i; + auto *reg = Phv::reg(map[i].dst->value); + from_idx_is_8b_16b = reg->size == 8 || reg->size == 16; + } + } + } + + // Check if csum equals VERIFY type and destination register size is 16 + bool has_csum = false; + for (auto &c : match->csum) { + if (c.type == 0 && c.dest && c.dest->reg.size == 16) { + has_csum = true; + break; + } + } + + // Identify indexes for source and destination slots for the padding + if (used == 1 || used == 3) { + set_idx_for_16b_extractions(pad_idx, from_idx, used, has_csum, match, map); + } else { + // Value is 0,2 or 4, we are good! + if (LOGGING(5)) { + LOG5("No 16bit padding is needed to add in " << match->state->name << " state."); + } + + return 0; + } + + // Add fake extractors to reach 2n constraint, we need to copy destination and source from + // the global version field which is tied to zeros in RTL. + // We are keeping both indexes in tuples {0,1} or {2,3}. + do_16b_padding(regs, map, pad_idx, from_idx); + matchSlotTracker.padMap.insert(match, pad_idx, &map[pad_idx]); + check_16b_extractor_configuration(pad_idx, from_idx, used, has_csum, map); + + // Match can also have a value set which means we need to do the initialization for + // other rows. 
The first row is being initialized, other rows needs the initialization + // and padding configuration + for (int vs_offset = 1; vs_offset < match->value_set_size; ++vs_offset) { + int nrow = row + vs_offset; + LOG5("Adding the padding for value_set " + << match->value_set_name << " offset = " << vs_offset << " (row = " << nrow << ")"); + map = reinterpret_cast( + parser->setup_phv_output_map(regs, parser->gress, nrow)); + do_16b_padding(regs, map, pad_idx, from_idx); + } + + if (LOGGING(5)) { + print_slot_occupancy(match, phv_16bit_extractors, "After 16bit padding"); + } + + return 1; +} + +/** + * @brief Perform the 8b padding onto map and computed index. + * + * @param regs Register configuration instance + * @param map Tofino output map pointer + * @param from_idx Index which is the source of the slot configuration + */ +static void do_8b_padding(Target::Tofino::parser_regs ®s, tofino_phv_output_map *map, + const unsigned from_idx) { + for (auto pad_idx : phv_8bit_extractors) { + if (map[pad_idx].dst->value != EXTRACT_SLOT_UNUSED) continue; + + // Extraction slot is not used and we need to put padding there. 
The main idea + // is to source from the zero constant + map[pad_idx].dst->rewrite(); + *map[pad_idx].dst = *map[from_idx].dst; + *map[pad_idx].src = EXTRACT_SLOT_CONSTANT_ZERO; + *map[pad_idx].src_type = EXTRACT_SLOT_CONSTANT_EN; + } + + // Mark the dummy write dest as multi-write, we need to distinguish between PHV and TPHV + if (*map[from_idx].dst >= PHV_MIN_INDEX && *map[from_idx].dst < PHV_MAX_INDEX) { + regs.ingress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst].rewrite(); + regs.egress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst].rewrite(); + + regs.ingress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst] = 0; + regs.egress.prsr_reg.no_multi_wr.nmw[*map[from_idx].dst] = 0; + } else if (*map[from_idx].dst >= TPHV_MIN_INDEX && *map[from_idx].dst < TPHV_MAX_INDEX) { + auto tphv_idx = *map[from_idx].dst - TPHV_MIN_INDEX; + regs.ingress.prsr_reg.no_multi_wr.t_nmw[tphv_idx].rewrite(); + regs.egress.prsr_reg.no_multi_wr.t_nmw[tphv_idx].rewrite(); + + regs.ingress.prsr_reg.no_multi_wr.t_nmw[tphv_idx] = 0; + regs.egress.prsr_reg.no_multi_wr.t_nmw[tphv_idx] = 0; + } +} + +/// Add the fake extractions to have 4n 8b extractions. The function returns +/// the number of added extractions. +static int pad_to_8b_extracts_to_4n(Parser *parser, Target::Tofino::parser_regs ®s, + Parser::State::Match *match) { + // Obtain the slot configuration for given row + int row = parser->match_to_row.at(match); + auto map = reinterpret_cast( + parser->setup_phv_output_map(regs, parser->gress, row)); + if (LOGGING(5)) { + print_slot_occupancy(match, phv_8bit_extractors, "Before 8bit padding"); + } + + // Count number of used extractors - the number of extraction/constant set operations and + // checksums should be padded to 4n. The source of the added padding will be stored in + // the from_idx variable. 
+ unsigned used = 0; + unsigned from_idx = 0; + bool from_idx_is_8b = false; + for (auto i : phv_8bit_extractors) { + if (map[i].dst->value == EXTRACT_SLOT_UNUSED) continue; + // Update the used counter and remember the used slot + used++; + // Try to get an 8b index. + // If we use a single 16b index to pad then we'll hit problems + // because we need 2 x 8b to fill a 16b container. + if (!from_idx_is_8b) { + from_idx = i; + from_idx_is_8b = Phv::reg(map[i].dst->value)->size == 8; + } + } + + if (used % 4 == 0) { + if (LOGGING(5)) { + LOG5("No 8bit padding is needed to add in " << match->state->name << " state."); + } + + return 0; + } + + // Add fake extractions to meet the 4n constraint and setup tracking + do_8b_padding(regs, map, from_idx); + for (auto pad_idx : phv_8bit_extractors) { + matchSlotTracker.padMap.insert(match, pad_idx, &map[pad_idx]); + } + + // Match can also have a value set which means we need to do the initialization for + // other rows. The first row is being initialized, other rows needs the initialization + // and padding configuration + for (int vs_offset = 1; vs_offset < match->value_set_size; ++vs_offset) { + int nrow = row + vs_offset; + LOG5("Adding the padding for value_set " + << match->value_set_name << " offset = " << vs_offset << " (row = " << nrow << ")"); + map = reinterpret_cast( + parser->setup_phv_output_map(regs, parser->gress, nrow)); + do_8b_padding(regs, map, from_idx); + } + + if (LOGGING(5)) { + print_slot_occupancy(match, phv_8bit_extractors, "After 8bit padding"); + } + + return 4 - (used % 4); +} + +/// Add padding extracts to a parser state and its children. +/// +/// @tparam use_8bit Apply to 8b extracts (true) or 16b extracts (false) +/// @param parser Parser containing the state being padded +/// @param regs +/// @param node_count Number of states to pad, including this state. States with zero extracts are +/// not counted in @p node_count. 
+/// @param visited
+/// @param pstate
+template <bool use_8bit>
+void pad_nodes_extracts(Parser *parser, Target::Tofino::parser_regs &regs, int node_count,
+                        Parser::State::Match *match, std::set<Parser::State *> &visited,
+                        PaddingInfoCollector::PadState *pstate,
+                        std::map<Parser::State::Match *, int> &cache) {
+    if (node_count == 0 || !match) {
+        LOG4("Node count or nullptr match was reached");
+        return;
+    }
+
+    const std::string log_pad = use_8bit ? "8b" : "16b";
+    if (visited.count(match->state)) {
+        LOG4("State " << match->state << " was already visited in " << log_pad << " padding.");
+        return;
+    }
+
+    visited.insert(match->state);
+    LOG3("Padding " << log_pad << " extracts - state = " << match->state->name << ", "
+                    << "remaining " << log_pad << " states to pad is " << node_count);
+
+    pstate->print();
+    // Memoization to minimize path visits
+    // If a state was visited before with the same or higher node count, don't visit it
+    // again. Cache holds a map from state to node count
+    if (cache[match] >= node_count && node_count > 0) {
+        LOG5(" Using cached state(match) : " << match->state->name << "(" << match->match
+                                             << ") -> node count " << cache[match]);
+        visited.erase(match->state);
+        return;
+    }
+    cache[match] = node_count;
+    LOG5(" Caching state(match) : " << match->state->name << "(" << match->match
+                                    << ") -> node count " << cache[match]);
+
+    // We need to be sure that we will not be passing data to 2x16bit busses. Therefore, we have
+    // to pad the bus entirely for 16bit extractions. In addition, we need to see node_count 16bit
+    // extractions - due to possible FIFO stalls. If the node doesn't contain the required
+    // extraction, we don't decrement the node_count value.
+    int new_node_count = node_count;
+    auto phv_type = use_8bit ? 
AnalysisType::BIT8 : AnalysisType::BIT16; + if (count_number_of_extractions(parser, regs, match, phv_type)) { + if (use_8bit) { + int pad = pad_to_8b_extracts_to_4n(parser, regs, match); + pstate->addPadInfo(match, AnalysisType::BIT8, pad); + } else { + int pad = pad_to_16b_extracts_to_2n(parser, regs, match); + pstate->addPadInfo(match, AnalysisType::BIT16, pad); + } + + new_node_count--; + } + if (LOGGING(5)) pstate->print(); + + for (auto state : match->next) { + for (auto next_match : state->match) { + pad_nodes_extracts(parser, regs, new_node_count, next_match, visited, pstate, + cache); + } + } + + visited.erase(match->state); +} + +// Helping aliases for padding functions +const auto pad_nodes_8b_extracts = pad_nodes_extracts; +const auto pad_nodes_16b_extracts = pad_nodes_extracts; + +void handle_narrow_to_wide_constraint(Parser *parser, Target::Tofino::parser_regs ®s) { + // 1] Apply narrow-to-wide constraints to all predecessors + std::set narrow_to_wide_matches; + PaddingInfoCollector pad_collector; + + for (auto &kv : parser->match_to_row) { + if (kv.first->has_narrow_to_wide_extract) narrow_to_wide_matches.insert(kv.first); + } + + if (narrow_to_wide_matches.size() == 0) { + LOG2("No narrow to wide matches has been detected."); + return; + } + + // Pad all predecessors + std::set all_preds; + for (auto m : narrow_to_wide_matches) { + auto states = m->get_all_preds(); + states.insert(m); + auto pstate = pad_collector.getPadState(m); + + for (auto p : states) { + if (all_preds.count(p)) continue; + + all_preds.insert(p); + int pad = pad_to_16b_extracts_to_2n(parser, regs, p); + pstate->addPadInfo(p, AnalysisType::BIT16, pad); + pad = pad_to_8b_extracts_to_4n(parser, regs, p); + pstate->addPadInfo(p, AnalysisType::BIT8, pad); + } + } + + // 2] Apply the narrow-to-wide constraints to a given number + // of child nodes. 
+ ExtractionCountCache cache; + for (auto m : narrow_to_wide_matches) { + auto pstate = pad_collector.getPadState(m); + std::set visited_states; + int extracts_16b = analyze_worst_extractor_path(parser, regs, m, AnalysisType::BIT16, + visited_states, cache); + int extracts_8b = analyze_worst_extractor_path(parser, regs, m, AnalysisType::BIT8, + visited_states, cache); + + if (LOGGING(3)) { + std::stringstream ss; + ss << "INFO: Used extractors for " << m->state->gress << "," << m->state->name; + if (m->match) { + ss << "," << m->match; + } + ss << " - " << "8bit:" << extracts_8b << ", 16bit:" << extracts_16b; + LOG3(ss.str()); + } + + // Count the number of nodes we need to take, the arbiter is taking 4x8bit chunks + // and 2x16b chunks of data. The result will be ceiled to 16 because that is the + // depth of the FIFO. After that, we need to apply the 16b padding to a computed number + // of nodes and the same for 8b padding. + int pass_16b_nodes = ceil_and_wrap_to_fifo_size(extracts_16b, 2); + int pass_8b_nodes = ceil_and_wrap_to_fifo_size(extracts_8b, 4); + + // The state counts represent the states _after_ the n2w state. Increment by 1 to account + // for n2w state. + if (pass_8b_nodes) pass_8b_nodes++; + if (pass_16b_nodes) pass_16b_nodes++; + + // Pad extracts: 8b extracts should be padded based on the number of states to flush 16b + // narrow-to-wide extracts, and 16b extracts should be padded based on the number + // of states to flush 8b narrow-to-wide extracts. 
+ std::map cacheNodeCount; + pad_nodes_16b_extracts(parser, regs, pass_8b_nodes, m, visited_states, pstate, + cacheNodeCount); + cacheNodeCount.clear(); + pad_nodes_8b_extracts(parser, regs, pass_16b_nodes, m, visited_states, pstate, + cacheNodeCount); + } + + if (LOGGING(1)) { + pad_collector.printPadInfo(); + } +} + +template <> +void Parser::write_config(Target::Tofino::parser_regs ®s, json::map &ctxt_json, + bool single_parser) { + /// remove after 8.7 release + if (single_parser) { + for (auto st : all) { + st->write_config(regs, this, ctxt_json[st->gress == EGRESS ? "egress" : "ingress"]); + } + } else { + ctxt_json["states"] = json::vector(); + for (auto st : all) st->write_config(regs, this, ctxt_json["states"]); + } + + if (error_count > 0) return; + + int i = 0; + for (auto ctr : counter_init) { + if (ctr) ctr->write_config(regs, gress, i); + ++i; + } + + for (i = 0; i < checksum_use.size(); i++) { + for (auto csum : checksum_use[i]) + if (csum) csum->write_config(regs, this); + } + + if (gress == INGRESS) { + init_common_regs(this, regs.ingress.prsr_reg, INGRESS); + // regs.ingress.ing_buf_regs.glb_group.disable(); + // regs.ingress.ing_buf_regs.chan0_group.chnl_drop.disable(); + // regs.ingress.ing_buf_regs.chan0_group.chnl_metadata_fix.disable(); + // regs.ingress.ing_buf_regs.chan1_group.chnl_drop.disable(); + // regs.ingress.ing_buf_regs.chan1_group.chnl_metadata_fix.disable(); + // regs.ingress.ing_buf_regs.chan2_group.chnl_drop.disable(); + // regs.ingress.ing_buf_regs.chan2_group.chnl_metadata_fix.disable(); + // regs.ingress.ing_buf_regs.chan3_group.chnl_drop.disable(); + // regs.ingress.ing_buf_regs.chan3_group.chnl_metadata_fix.disable(); + + regs.ingress.prsr_reg.hdr_len_adj.amt = hdr_len_adj; + } + + if (gress == EGRESS) { + init_common_regs(this, regs.egress.prsr_reg, EGRESS); + for (int i = 0; i < 4; i++) regs.egress.epb_prsr_port_regs.chnl_ctrl[i].meta_opt = meta_opt; + + int prsr_max_dph = get_prsr_max_dph(); + if (prsr_max_dph * 16 > 
Target::PARSER_DEPTH_MAX_BYTES_MULTITHREADED_EGRESS()) { + if (!options.tof1_egr_parse_depth_checks_disabled) + warning(lineno, + "Egress parser max depth exceeds %d, which requires disabling " + "multithreading in the parser", + Target::PARSER_DEPTH_MAX_BYTES_MULTITHREADED_EGRESS()); + options.tof1_egr_parse_depth_checks_disabled = true; + } + regs.egress.epb_prsr_port_regs.multi_threading.prsr_dph_max = prsr_max_dph; + regs.egress.prsr_reg.hdr_len_adj.amt = hdr_len_adj; + } + + // FIXME: The "|| 1" causes the PHV use information to be unconditionally copied + // into the PHV ownership. This forces the parser ownership to be identical to that + // in the pipe. + // Remove to allow different ownership, but make sure that header stacks + // are processed correctly. All stack elements writeable by the parser must + // be owned by the parser. + if (options.match_compiler || 1) { + phv_use[INGRESS] |= Phv::use(INGRESS); + phv_use[EGRESS] |= Phv::use(EGRESS); + } + + for (int i : phv_use[EGRESS]) { + auto id = Phv::reg(i)->parser_id(); + if (id >= 256) { + regs.merge.phv_owner.t_owner[id - 256] = 1; + regs.ingress.prsr_reg.phv_owner.t_owner[id - 256] = 1; + regs.egress.prsr_reg.phv_owner.t_owner[id - 256] = 1; + } else if (id < 224) { + regs.merge.phv_owner.owner[id] = 1; + regs.ingress.prsr_reg.phv_owner.owner[id] = 1; + regs.egress.prsr_reg.phv_owner.owner[id] = 1; + } + } + + for (int i = 0; i < 224; i++) { + if (!phv_allow_bitwise_or[i]) { + regs.ingress.prsr_reg.no_multi_wr.nmw[i] = 1; + regs.egress.prsr_reg.no_multi_wr.nmw[i] = 1; + } + if (phv_allow_bitwise_or[i] || phv_init_valid[i]) regs.merge.phv_valid.vld[i] = 1; + } + + for (int i = 0; i < 112; i++) + if (!phv_allow_bitwise_or[256 + i]) { + regs.ingress.prsr_reg.no_multi_wr.t_nmw[i] = 1; + regs.egress.prsr_reg.no_multi_wr.t_nmw[i] = 1; + } + + // if (options.condense_json) { + // // FIXME -- removing the uninitialized memory causes problems? + // // FIXME -- walle gets the addresses wrong. 
Might also require explicit + // // FIXME -- zeroing in the driver on real hardware + // // regs.memory[INGRESS].disable_if_reset_value(); + // // regs.memory[EGRESS].disable_if_reset_value(); + // regs.ingress.disable_if_reset_value(); + // regs.egress.disable_if_reset_value(); + // regs.merge.disable_if_reset_value(); + // } + + // Handles the constraint when using narrow extractors to generate wide values + // (either extracted from the packet or using the constants), then you need to + // follow the rule the _every_ preceding cycle must do: + // 0 or 4 8b extractions + // 0 or 2 or 4 16b extractions + handle_narrow_to_wide_constraint(this, regs); + + if (error_count == 0 && options.gen_json) { + /// TODO remove after 8.7 release + /// TODO Needs fix to simple test harness for parsers node + /// support + if (single_parser) { + if (gress == INGRESS) { + regs.memory[INGRESS].emit_json(*open_output("memories.all.parser.ingress.cfg.json"), + "ingress"); + regs.ingress.emit_json(*open_output("regs.all.parser.ingress.cfg.json")); + } else if (gress == EGRESS) { + regs.memory[EGRESS].emit_json(*open_output("memories.all.parser.egress.cfg.json"), + "egress"); + regs.egress.emit_json(*open_output("regs.all.parser.egress.cfg.json")); + } + regs.merge.emit_json(*open_output("regs.all.parse_merge.cfg.json")); + } else { + if (gress == INGRESS) { + regs.memory[INGRESS].emit_json( + *open_output("memories.all.parser.ingress.%02x.cfg.json", parser_no), + "ingress"); + regs.ingress.emit_json( + *open_output("regs.all.parser.ingress.%02x.cfg.json", parser_no)); + } + if (gress == EGRESS) { + regs.memory[EGRESS].emit_json( + *open_output("memories.all.parser.egress.%02x.cfg.json", parser_no), "egress"); + regs.egress.emit_json( + *open_output("regs.all.parser.egress.%02x.cfg.json", parser_no)); + } + regs.merge.emit_json(*open_output("regs.all.parse_merge.cfg.json")); + } + } + + /// TODO remove after 8.7 release + if (single_parser) { + for (int i = 0; i < 18; i++) { + if 
(gress == INGRESS) { + TopLevel::regs()->mem_pipe.i_prsr[i].set( + "memories.all.parser.ingress", ®s.memory[INGRESS]); + TopLevel::regs()->reg_pipe.pmarb.ibp18_reg.ibp_reg[i].set( + "regs.all.parser.ingress", ®s.ingress); + } else if (gress == EGRESS) { + TopLevel::regs()->mem_pipe.e_prsr[i].set( + "memories.all.parser.egress", ®s.memory[EGRESS]); + TopLevel::regs()->reg_pipe.pmarb.ebp18_reg.ebp_reg[i].set( + "regs.all.parser.egress", ®s.egress); + } + } + } else { + if (gress == INGRESS) { + TopLevel::regs()->parser_ingress.emplace( + ctxt_json["handle"]->as_number()->val, ®s.ingress); + TopLevel::regs()->parser_memory[INGRESS].emplace( + ctxt_json["handle"]->as_number()->val, ®s.memory[INGRESS]); + } else if (gress == EGRESS) { + TopLevel::regs()->parser_egress.emplace( + ctxt_json["handle"]->as_number()->val, ®s.egress); + TopLevel::regs()->parser_memory[EGRESS].emplace( + ctxt_json["handle"]->as_number()->val, ®s.memory[EGRESS]); + } + +#if 0 + /// for initiliazing the parser registers in default configuration. + int start_bit = port_use.ffs(); + do { + int end_bit = port_use.ffz(start_bit); + std::cout << "set memories and regs from " << start_bit + << " to " << end_bit - 1 << std::endl; + for (auto i = start_bit; i <= end_bit - 1; i++) { + TopLevel::regs()->mem_pipe.i_prsr[i] + .set("memories.all.parser.ingress", ®s.memory[INGRESS]); + TopLevel::regs()->reg_pipe.pmarb.ibp18_reg.ibp_reg[i] + .set("regs.all.parser.ingress", ®s.ingress); + TopLevel::regs()->mem_pipe.e_prsr[i] + .set("memories.all.parser.egress", ®s.memory[EGRESS]); + TopLevel::regs()->reg_pipe.pmarb.ebp18_reg.ebp_reg[i] + .set("regs.all.parser.egress", ®s.egress); + } + start_bit = port_use.ffs(end_bit); + } while (start_bit >= 0); +#endif + } + // all parsers share the same parser_merge configuration. 
+ TopLevel::regs()->reg_pipe.pmarb.prsr_reg.set("regs.all.parse_merge", + ®s.merge); +} + +template <> +void Parser::gen_configuration_cache(Target::Tofino::parser_regs ®s, json::vector &cfg_cache) { + std::string reg_fqname; + std::string reg_name; + unsigned reg_value; + std::string reg_value_str; + unsigned reg_width = 8; + + if (gress == EGRESS) { + // epb_prsr_port_regs.chnl_ctrl + for (int i = 0; i < 4; i++) { + reg_fqname = "pmarb.ebp18_reg.ebp_reg[0].epb_prsr_port_regs.chnl_ctrl[" + + std::to_string(i) + "]"; + reg_name = "parser0_chnl_ctrl_" + std::to_string(i); + reg_value = regs.egress.epb_prsr_port_regs.chnl_ctrl[i]; + if ((reg_value != 0) || (options.match_compiler)) { + reg_value_str = int_to_hex_string(reg_value, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + } + + // epb_prsr_port_regs.multi_threading + reg_fqname = "pmarb.ebp18_reg.ebp_reg[0].epb_prsr_port_regs.multi_threading"; + reg_name = "parser0_multi_threading"; + reg_value = regs.egress.epb_prsr_port_regs.multi_threading; + if ((reg_value != 0) || (options.match_compiler)) { + reg_value_str = int_to_hex_string(reg_value, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + } +} diff --git a/backends/tofino/bf-asm/tofino/phv.cpp b/backends/tofino/bf-asm/tofino/phv.cpp new file mode 100644 index 00000000000..7e8c19f8b45 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/phv.cpp @@ -0,0 +1,66 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tofino/phv.h" + +void Target::Tofino::Phv::init_regs(::Phv &phv) { + // Allocating Tofino registers so the uids map to register encodings + static const struct { + char code[4]; + unsigned size, count; + } sizes[] = {{"W", 32, 64}, {"B", 8, 64}, {"H", 16, 96}, {"", 0, 32}, + {"TW", 32, 32}, {"TB", 8, 32}, {"TH", 16, 48}}; + unsigned uid = 0; + phv.regs.resize(NUM_PHV_REGS); + for (unsigned i = 0; i < sizeof sizes / sizeof *sizes; i++) { + for (unsigned j = 0; j < sizes[i].count; j++, uid++) { + auto reg = phv.regs[uid] = new Register; + memset(reg->name, 0, sizeof(reg->name)); + reg->type = (uid >= FIRST_TPHV) ? Register::TAGALONG : Register::NORMAL; + reg->index = j; + reg->uid = uid; + reg->size = sizes[i].size; + if (sizes[i].size) { + char buf[8]; + snprintf(buf, sizeof(buf), "R%d", uid); + phv.names[INGRESS][buf][0].slice = ::Phv::Slice(*reg, 0, sizes[i].size - 1); + phv.names[EGRESS][buf][0].slice = ::Phv::Slice(*reg, 0, sizes[i].size - 1); + snprintf(reg->name, sizeof(reg->name), "%.2s%d", sizes[i].code, j); + phv.names[INGRESS][reg->name][0].slice = ::Phv::Slice(*reg, 0, sizes[i].size - 1); + phv.names[EGRESS][reg->name][0].slice = ::Phv::Slice(*reg, 0, sizes[i].size - 1); + } + } + } + BUG_CHECK(uid == phv.regs.size()); +} + +static bitvec tagalong_group(int n) { + bitvec rv; + rv.setrange( + Target::Tofino::Phv::FIRST_8BIT_TPHV + n * (Target::Tofino::Phv::COUNT_8BIT_TPHV / 8), + Target::Tofino::Phv::COUNT_8BIT_TPHV / 8); + rv.setrange( + Target::Tofino::Phv::FIRST_16BIT_TPHV + n * (Target::Tofino::Phv::COUNT_16BIT_TPHV / 8), + Target::Tofino::Phv::COUNT_16BIT_TPHV / 8); + rv.setrange( + Target::Tofino::Phv::FIRST_32BIT_TPHV + n * (Target::Tofino::Phv::COUNT_32BIT_TPHV / 8), + Target::Tofino::Phv::COUNT_32BIT_TPHV / 8); + return rv; +} +const bitvec 
Target::Tofino::Phv::tagalong_groups[8] = { + tagalong_group(0), tagalong_group(1), tagalong_group(2), tagalong_group(3), + tagalong_group(4), tagalong_group(5), tagalong_group(6), tagalong_group(7)}; diff --git a/backends/tofino/bf-asm/tofino/phv.h b/backends/tofino/bf-asm/tofino/phv.h new file mode 100644 index 00000000000..1f16dc391d6 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/phv.h @@ -0,0 +1,55 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_PHV_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_PHV_H_ + +#include "backends/tofino/bf-asm/phv.h" + +class Target::Tofino::Phv : public Target::Phv { + friend class ::Phv; + struct Register : public ::Phv::Register { + int parser_id() const override { return uid; } + int mau_id() const override { return uid < FIRST_TPHV ? uid : -1; } + int ixbar_id() const override { return uid < FIRST_TPHV ? 
uid : -1; }
+        int deparser_id() const override { return uid; }
+    };
+    void init_regs(::Phv &phv) override;
+    target_t type() const override { return TOFINO; }
+    unsigned mau_groupsize() const override { return 16; }
+
+ public:
+    enum {
+        NUM_PHV_REGS = 368,
+        FIRST_8BIT_PHV = 64,
+        COUNT_8BIT_PHV = 64,
+        FIRST_16BIT_PHV = 128,
+        COUNT_16BIT_PHV = 96,
+        FIRST_32BIT_PHV = 0,
+        COUNT_32BIT_PHV = 64,
+        FIRST_TPHV = 256,
+        FIRST_8BIT_TPHV = 288,
+        COUNT_8BIT_TPHV = 32,
+        FIRST_16BIT_TPHV = 320,
+        COUNT_16BIT_TPHV = 48,
+        FIRST_32BIT_TPHV = 256,
+        COUNT_32BIT_TPHV = 32,
+    };
+    static const bitvec tagalong_groups[8];
+};
+
+#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_PHV_H_ */
diff --git a/backends/tofino/bf-asm/tofino/salu_inst.cpp b/backends/tofino/bf-asm/tofino/salu_inst.cpp
new file mode 100644
index 00000000000..31205779b3a
--- /dev/null
+++ b/backends/tofino/bf-asm/tofino/salu_inst.cpp
@@ -0,0 +1,194 @@
+/**
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* Tofino template specializations for instructions #included in salu_inst.cpp + * WARNING -- this is included in an anonymous namespace, as these SaluInstruction + * subclasses are all defined in that anonymous namespace */ + +template <> +void AluOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_state_alu[act->code][slot - ALU2LO]; + auto &salu_instr_common = meter_group.stateful.salu_instr_common[act->code]; + salu.salu_op = opc->opcode & 0xf; + salu.salu_arith = opc->opcode >> 4; + salu.salu_pred = predication_encode & Target::Tofino::STATEFUL_PRED_MASK; + const int alu_const_min = Target::STATEFUL_ALU_CONST_MIN(); + const int alu_const_max = Target::STATEFUL_ALU_CONST_MAX(); + if (srca) { + if (auto m = srca.to()) { + salu.salu_asrc_memory = 1; + salu.salu_asrc_memory_index = m->field->bit(0) > 0; + } else if (auto k = srca.to()) { + salu.salu_asrc_memory = 0; + if (k->value >= alu_const_min && k->value <= alu_const_max) { + salu.salu_const_src = k->value & Target::STATEFUL_ALU_CONST_MASK(); + salu.salu_regfile_const = 0; + } else { + salu.salu_const_src = tbl->get_const(k->lineno, k->value); + salu.salu_regfile_const = 1; + } + } else if (auto r = srca.to()) { + salu.salu_asrc_memory = 0; + salu.salu_const_src = r->index; + salu.salu_regfile_const = 1; + } else { + BUG(); + } + } + if (srcb) { + if (auto f = srcb.to()) { + salu.salu_bsrc_phv = 1; + salu.salu_bsrc_phv_index = f->phv_index(tbl); + } else if (auto m = srcb.to()) { + salu_instr_common.salu_alu2_lo_bsrc_math = 1; + if (auto b = m->of.to()) { + salu_instr_common.salu_alu2_lo_math_src = b->phv_index(tbl); + } else if (auto b = m->of.to()) { + 
salu_instr_common.salu_alu2_lo_math_src = b->field->bit(0) > 0 ? 3 : 2; + } else { + BUG(); + } + } else if (auto k = srcb.to()) { + salu.salu_bsrc_phv = 0; + if (k->value >= alu_const_min && k->value <= alu_const_max) { + salu.salu_const_src = k->value & Target::STATEFUL_ALU_CONST_MASK(); + salu.salu_regfile_const = 0; + } else { + salu.salu_const_src = tbl->get_const(k->lineno, k->value); + salu.salu_regfile_const = 1; + } + } else if (auto r = srcb.to()) { + salu.salu_bsrc_phv = 0; + salu.salu_const_src = r->index; + salu.salu_regfile_const = 1; + } else { + BUG(); + } + } +} +void AluOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +template <> +void BitOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + LOG2(this); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_state_alu[act->code][slot - ALU2LO]; + salu.salu_op = opc->opcode & 0xf; + salu.salu_pred = predication_encode & Target::Tofino::STATEFUL_PRED_MASK; + // 1b instructions are from mem-lo to alu1-lo + salu.salu_asrc_memory = 1; + salu.salu_asrc_memory_index = 0; +} +void BitOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +template <> +void CmpOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_cmp_alu[act->code][slot]; + if (srca) { + salu.salu_cmp_asrc_input = srca->field->bit(0) > 0; + salu.salu_cmp_asrc_sign = srca_neg; + salu.salu_cmp_asrc_enable = 1; + } + if (srcb) { + salu.salu_cmp_bsrc_input = srcb->phv_index(tbl); + 
salu.salu_cmp_bsrc_sign = srcb_neg; + salu.salu_cmp_bsrc_enable = 1; + } + if (srcc) { + if (auto k = dynamic_cast(srcc)) { + const int cmp_const_min = Target::STATEFUL_CMP_CONST_MIN(); + const int cmp_const_max = Target::STATEFUL_CMP_CONST_MAX(); + if (k->value >= cmp_const_min && k->value <= cmp_const_max) { + salu.salu_cmp_const_src = k->value & Target::STATEFUL_CMP_CONST_MASK(); + salu.salu_cmp_regfile_const = 0; + } else { + salu.salu_cmp_const_src = tbl->get_const(srcc->lineno, k->value); + salu.salu_cmp_regfile_const = 1; + } + } else if (auto r = dynamic_cast(srcc)) { + salu.salu_cmp_const_src = r->index; + salu.salu_cmp_regfile_const = 1; + } + } else { + salu.salu_cmp_const_src = 0; + salu.salu_cmp_regfile_const = 0; + } + salu.salu_cmp_opcode = opc->opcode | (type << 2); +} +void CmpOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} + +void TMatchOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + BUG(); // should never be called +} + +void OutOP::decode_output_mux(Target::Tofino, Table *tbl, value_t &op) { + static const std::map ops_mux_lookup = { + {"mem_hi", 0}, {"mem_lo", 1}, {"memory_hi", 0}, {"memory_lo", 1}, + {"phv_hi", 2}, {"phv_lo", 3}, {"alu_hi", 4}, {"alu_lo", 5}, + {"alu_hi_out", 4}, {"alu_lo_out", 5}, {"predicate", 6}}; + if (op.type == tCMD && ops_mux_lookup.count(op[0].s)) + output_mux = ops_mux_lookup.at(op[0].s); + else if (op.type == tSTR && ops_mux_lookup.count(op.s)) + output_mux = ops_mux_lookup.at(op.s); + else + output_mux = -1; + if (src) { + int tmp = output_mux; + if (auto *phv = src.to()) + output_mux = 3 - phv->phv_index(tbl->to()); + else if (auto *mem = src.to()) + output_mux = mem->field->bit(0) > 0 ? 
0 : 1; + BUG_CHECK(tmp < 0 || tmp == output_mux, "inconsistent output mux decode"); + } +} +int OutOP::decode_output_option(Target::Tofino, value_t &op) { return -1; } + +template <> +void OutOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl_, Table::Actions::Action *act) { + LOG2(this); + auto tbl = dynamic_cast(tbl_); + BUG_CHECK(tbl); + int logical_home_row = tbl->layout[0].row; + auto &meter_group = regs.rams.map_alu.meter_group[logical_home_row / 4U]; + auto &salu = meter_group.stateful.salu_instr_output_alu[act->code]; + if (predication_encode) { + salu.salu_output_cmpfn = predication_encode & Target::Tofino::STATEFUL_PRED_MASK; + } else { + salu.salu_output_cmpfn = STATEFUL_PREDICATION_ENCODE_UNCOND; + } + salu.salu_output_asrc = output_mux; +} +void OutOP::write_regs(Target::Tofino::mau_regs ®s, Table *tbl, Table::Actions::Action *act) { + write_regs(regs, tbl, act); +} diff --git a/backends/tofino/bf-asm/tofino/sram_match.cpp b/backends/tofino/bf-asm/tofino/sram_match.cpp new file mode 100644 index 00000000000..b956e13e5b9 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/sram_match.cpp @@ -0,0 +1,96 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/stage.h" +#include "backends/tofino/bf-asm/tables.h" +#include "lib/log.h" + +static int find_in_ixbar(Table *table, std::vector &match) { + // It would seem like it would be possible to simplify this code by refactoring it + // to use one loop calling Table::find_on_ixbar (which does much of what this does),r + // but it is important to prefer a group defined in this table to one defined in other + // tables, which the two loops does. Could perhaps have a variant of find_on_ixbar that + // return *all* groups where the Phv::Ref is present (in priority order), so we could + // do the intersection (preserving priority order) rather than this repeated looping? + int max_i = -1; + LOG3("find_in_ixbar " << match); + for (unsigned group = 0; group < EXACT_XBAR_GROUPS; group++) { + LOG3(" looking in table in group " << group); + bool ok = true; + for (auto &r : match) { + LOG3(" looking for " << r); + for (auto &ixb : table->input_xbar) { + if (!ixb->find_exact(*r, group)) { + LOG3(" -- not found"); + ok = false; + break; + } + } + } + if (ok) { + LOG3(" success"); + return group; + } + } + for (unsigned group = 0; group < EXACT_XBAR_GROUPS; group++) { + LOG3(" looking in group " << group); + bool ok = true; + for (auto &r : match) { + LOG3(" looking for " << r); + bool found = false; + InputXbar::Group ixbar_group(InputXbar::Group::EXACT, group); + for (auto *in : table->stage->ixbar_use[ixbar_group]) { + if (in->find_exact(*r, group)) { + found = true; + break; + } + } + if (!found) { + LOG3(" -- not found"); + if (&r - &match[0] > max_i) max_i = &r - &match[0]; + ok = false; + break; + } + } + if (ok) { + LOG3(" success"); + return group; + } + } + if (max_i > 0) + error(match[max_i].lineno, "%s: Can't find %s and %s in same input xbar group", + table->name(), match[max_i].name(), match[0].name()); + else + error(match[0].lineno, "%s: Can't find %s in any input xbar group", 
table->name(), + match[0].name()); + return -1; +} + +void SRamMatchTable::setup_word_ixbar_group(Target::Tofino) { + word_ixbar_group.resize(match_in_word.size()); + unsigned i = 0; + for (auto &match : match_in_word) { + std::vector phv_ref_match; + for (auto *source : match) { + auto phv_ref = dynamic_cast(source); + BUG_CHECK(phv_ref); + BUG_CHECK(*phv_ref); + phv_ref_match.push_back(*phv_ref); + } + word_ixbar_group[i++] = phv_ref_match.empty() ? -1 : find_in_ixbar(this, phv_ref_match); + } +} diff --git a/backends/tofino/bf-asm/tofino/stage.cpp b/backends/tofino/bf-asm/tofino/stage.cpp new file mode 100644 index 00000000000..2c906e858a1 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/stage.cpp @@ -0,0 +1,140 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* mau stage template specializations for tofino -- #included directly in top-level stage.cpp */ + +template <> +void Stage::write_regs(Target::Tofino::mau_regs ®s, bool) { + write_common_regs(regs); + auto &merge = regs.rams.match.merge; + for (gress_t gress : Range(INGRESS, EGRESS)) { + if (stageno == 0) { + merge.predication_ctl[gress].start_table_fifo_delay0 = pred_cycle(gress) - 1; + merge.predication_ctl[gress].start_table_fifo_delay1 = 0; + merge.predication_ctl[gress].start_table_fifo_enable = 1; + } else { + switch (stage_dep[gress]) { + case MATCH_DEP: + merge.predication_ctl[gress].start_table_fifo_delay0 = + this[-1].pipelength(gress) - this[-1].pred_cycle(gress) + + pred_cycle(gress) - 1; + merge.predication_ctl[gress].start_table_fifo_delay1 = + this[-1].pipelength(gress) - this[-1].pred_cycle(gress); + merge.predication_ctl[gress].start_table_fifo_enable = 3; + break; + case ACTION_DEP: + merge.predication_ctl[gress].start_table_fifo_delay0 = 1; + merge.predication_ctl[gress].start_table_fifo_delay1 = 0; + merge.predication_ctl[gress].start_table_fifo_enable = 1; + break; + case CONCURRENT: + merge.predication_ctl[gress].start_table_fifo_enable = 0; + break; + default: + BUG(); + } + } + if (stageno != 0) { + regs.dp.cur_stage_dependency_on_prev[gress] = MATCH_DEP - stage_dep[gress]; + if (stage_dep[gress] == CONCURRENT) regs.dp.stage_concurrent_with_prev |= 1U << gress; + } + if (stageno != AsmStage::numstages() - 1) + regs.dp.next_stage_dependency_on_cur[gress] = MATCH_DEP - this[1].stage_dep[gress]; + else if (AsmStage::numstages() < Target::NUM_MAU_STAGES()) + regs.dp.next_stage_dependency_on_cur[gress] = 2; + auto &deferred_eop_bus_delay = regs.rams.match.adrdist.deferred_eop_bus_delay[gress]; + deferred_eop_bus_delay.eop_internal_delay_fifo = pred_cycle(gress) + 3; + /* FIXME -- making this depend on the dependency of the next stage seems wrong */ + if (stageno == AsmStage::numstages() - 
1) { + if (AsmStage::numstages() < Target::NUM_MAU_STAGES()) + deferred_eop_bus_delay.eop_output_delay_fifo = 0; + else + deferred_eop_bus_delay.eop_output_delay_fifo = pipelength(gress) - 1; + } else if (this[1].stage_dep[gress] == MATCH_DEP) + deferred_eop_bus_delay.eop_output_delay_fifo = pipelength(gress) - 1; + else if (this[1].stage_dep[gress] == ACTION_DEP) + deferred_eop_bus_delay.eop_output_delay_fifo = 1; + else + deferred_eop_bus_delay.eop_output_delay_fifo = 0; + deferred_eop_bus_delay.eop_delay_fifo_en = 1; + } + + for (gress_t gress : Range(INGRESS, EGRESS)) + if (table_use[gress] & USE_TCAM) + regs.tcams.tcam_piped |= options.match_compiler ? 3 : 1 << gress; + + bitvec in_use = match_use[INGRESS] | action_use[INGRESS] | action_set[INGRESS]; + bitvec eg_use = match_use[EGRESS] | action_use[EGRESS] | action_set[EGRESS]; + if (options.match_compiler) { + /* the glass compiler occasionally programs extra uses of random registers on + * busses where it doesn't actually use them. Sometimes, these regs + * are in use by the other thread, so rely on the deparser to correctly + * set the Phv::use info and strip out registers it says are used by + * the other thread */ + in_use -= Deparser::PhvUse(EGRESS); + eg_use -= Deparser::PhvUse(INGRESS); + } + /* FIXME -- if the regs are live across a stage (even if not used in that stage) they + * need to be set in the thread registers. For now we just assume if they are used + * anywhere, they need to be marked as live */ + in_use |= Phv::use(INGRESS); + eg_use |= Phv::use(EGRESS); + static const int phv_use_transpose[2][14] = { + {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21}, + {4, 5, 6, 7, 12, 13, 14, 15, 22, 23, 24, 25, 26, 27}}; + // FIXME -- this code depends on the Phv::Register uids matching the + // FIXME -- mau encoding of phv containers. 
(FIXME-PHV) + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 14; j++) { + regs.dp.phv_ingress_thread_alu[i][j] = regs.dp.phv_ingress_thread_imem[i][j] = + regs.dp.phv_ingress_thread[i][j] = in_use.getrange(8 * phv_use_transpose[i][j], 8); + regs.dp.phv_egress_thread_alu[i][j] = regs.dp.phv_egress_thread_imem[i][j] = + regs.dp.phv_egress_thread[i][j] = eg_use.getrange(8 * phv_use_transpose[i][j], 8); + } + } +} + +template <> +void Stage::gen_configuration_cache(Target::Tofino::mau_regs ®s, json::vector &cfg_cache) { + Stage::gen_configuration_cache_common(regs, cfg_cache); + + unsigned reg_width = 8; // this means number of hex characters + std::string reg_fqname; + std::string reg_name; + unsigned reg_value; + std::string reg_value_str; + + // meter_ctl + auto &meter_ctl = regs.rams.map_alu.meter_group; + for (int i = 0; i < 4; i++) { + reg_fqname = "mau[" + std::to_string(stageno) + "].rams.map_alu.meter_group[" + + std::to_string(i) + "]" + ".meter.meter_ctl"; + reg_name = "stage_" + std::to_string(stageno) + "_meter_ctl_" + std::to_string(i); + reg_value = meter_ctl[i].meter.meter_ctl; + if ((reg_value != 0) || (options.match_compiler)) { + reg_value_str = int_to_hex_string(reg_value, reg_width); + add_cfg_reg(cfg_cache, reg_fqname, reg_name, reg_value_str); + } + } +} + +template <> +void Stage::gen_mau_stage_extension(Target::Tofino::mau_regs ®s, json::map &extend) { + BUG(); // stage extension not supported on tofino +} + +void AlwaysRunTable::write_regs(Target::Tofino::mau_regs &) { BUG(); } diff --git a/backends/tofino/bf-asm/tofino/stateful.cpp b/backends/tofino/bf-asm/tofino/stateful.cpp new file mode 100644 index 00000000000..9265d2d5bb5 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/stateful.cpp @@ -0,0 +1,77 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tofino/stateful.h" + +int StatefulTable::parse_counter_mode(Target::Tofino target, const value_t &v) { + if (v != "counter") return -1; + if (v.type == tSTR) return 4; + if (v.type != tCMD || v.vec.size != 2) return -1; + static const std::map modes = {{"hit", 2}, {"miss", 1}, {"gateway", 3}}; + if (!modes.count(v[1].s)) return -1; + return modes.at(v[1].s); +} + +void StatefulTable::set_counter_mode(Target::Tofino target, int mode) { + stateful_counter_mode |= mode; +} + +template <> +void StatefulTable::write_logging_regs(Target::Tofino::mau_regs ®s) { + auto &merge = regs.rams.match.merge; + unsigned meter_group = layout.at(0).row / 4U; + auto &salu = regs.rams.map_alu.meter_group[meter_group].stateful; + for (MatchTable *m : match_tables) { + auto *call = m->get_call(this); + if (!call || call->args.at(0).type != Call::Arg::Counter) continue; + if (auto mode = call->args.at(0).count_mode()) { + merge.mau_stateful_log_counter_ctl[m->logical_id / 8U].set_subfield( + mode, 3 * (m->logical_id % 8U), 3); + for (auto &rep : merge.mau_stateful_log_ctl_ixbar_map[m->logical_id / 8U]) + rep.set_subfield(meter_group | 0x4, 3 * (m->logical_id % 8U), 3); + } + } + if (stateful_counter_mode) { + merge.mau_stateful_log_instruction_width.set_subfield(format->log2size - 3, 2 * meter_group, + 2); + merge.mau_stateful_log_vpn_offset[meter_group / 2].set_subfield(logvpn_min, + 6 * (meter_group % 2), 6); + merge.mau_stateful_log_vpn_limit[meter_group / 2].set_subfield(logvpn_max, + 6 * 
(meter_group % 2), 6); + } + + for (size_t i = 0; i < const_vals.size(); ++i) { + if (const_vals[i].value > INT_MAX || const_vals[i].value < INT_MIN) + error(const_vals[i].lineno, "constant value %" PRId64 " too large for stateful alu", + const_vals[i].value); + salu.salu_const_regfile[i] = const_vals[i].value & 0xffffffffU; + } +} + +/// Compute the proper value for the register +/// map_alu.meter_alu_group_data_delay_ctl[].meter_alu_right_group_delay +/// which controls the two halves of the ixbar->meter_alu fifo, based on a bytemask of which +/// bytes are needed in the meter_alu. On Tofino, the fifo is 64 bits wide, so each enable +/// bit controls 32 bits +int AttachedTable::meter_alu_fifo_enable_from_mask(Target::Tofino::mau_regs &, unsigned bytemask) { + int rv = 0; + if (bytemask & 0xf) rv |= 1; + if (bytemask & 0xf0) rv |= 2; + return rv; +} + +void StatefulTable::gen_tbl_cfg(Target::Tofino, json::map &tbl, json::map &stage_tbl) const {} diff --git a/backends/tofino/bf-asm/tofino/stateful.h b/backends/tofino/bf-asm/tofino/stateful.h new file mode 100644 index 00000000000..fab94ea2ee9 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/stateful.h @@ -0,0 +1,33 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_STATEFUL_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_STATEFUL_H_ + +#include "backends/tofino/bf-asm/tables.h" +#include "backends/tofino/bf-asm/target.h" + +class Target::Tofino::StatefulTable : public ::StatefulTable { + friend class ::StatefulTable; + StatefulTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::StatefulTable(line, n, gr, s, lid) {} +}; + +template <> +void StatefulTable::write_logging_regs(Target::Tofino::mau_regs ®s); + +#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_STATEFUL_H_ */ diff --git a/backends/tofino/bf-asm/tofino/template_objects.yaml b/backends/tofino/bf-asm/tofino/template_objects.yaml new file mode 100644 index 00000000000..f8dde2503a9 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/template_objects.yaml @@ -0,0 +1,109 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +# +# SPDX-License-Identifier: Apache-2.0 + +global: + - namespace=Tofino + - emit_binary + - emit_fieldname + - emit_json + - enable_disable + - input_binary + - reverse_write + - write_dma=mapram_config + - write_dma=imem_subword8 + - write_dma=imem_subword16 + - write_dma=imem_subword32 + - write_dma=galois_field_matrix +generate: + memories: + pipe_top_level: + memories.pipe_top_level.h: [ decl, name=memories.top ] + memories.pipe_top_level.cpp: [ defn, name=memories.top, + -Imemories.pipe_top_level.h, -Imemories.pipe_addrmap.h ] + pipe_addrmap: + memories.pipe_addrmap.h: [ decl, name=memories.pipe ] + memories.pipe_addrmap.cpp: [ defn, name=memories.pipe, + -Imemories.pipe_addrmap.h, -Imemories.prsr_mem_main_rspec.h ] + prsr_mem_main_rspec: + memories.prsr_mem_main_rspec.h: [ decl, name=memories.all.parser.%s ] + memories.prsr_mem_main_rspec.cpp: [ defn, name=memories.all.parser.%s, + -Imemories.prsr_mem_main_rspec.h ] + regs: + tofino: + regs.tofino.h: [ decl, name=regs.top ] + regs.tofino.cpp: [ defn, name=regs.top, + -Iregs.tofino.h, -Iregs.pipe_addrmap.h ] + pipe_addrmap: + regs.pipe_addrmap.h: [ decl, name=regs.pipe, expand_disabled_vector ] + regs.pipe_addrmap.cpp: [ defn, name=regs.pipe, expand_disabled_vector, + -Iregs.pipe_addrmap.h, -Iregs.ibp_rspec.h, -Iregs.ebp_rspec.h, + -Iregs.prsr_reg_merge_rspec.h, -Iregs.mau_addrmap.h, + -Iregs.dprsr_inp.h, -Iregs.dprsr_hdr.h ] + # pmarb_rspec + ibp_rspec: # Ingress parser registers + regs.ibp_rspec.h: [ decl, name=regs.all.parser.ingress ] + regs.ibp_rspec.cpp: [ defn, name=regs.all.parser.ingress, + -Iregs.ibp_rspec.h ] + ebp_rspec: # Egress parser registers + regs.ebp_rspec.h: [ decl, name=regs.all.parser.egress ] + regs.ebp_rspec.cpp: [ defn, name=regs.all.parser.egress, + -Iregs.ebp_rspec.h ] + prsr_reg_merge_rspec: # Shared parser registers + regs.prsr_reg_merge_rspec.h: [ decl, name=regs.all.parse_merge ] + regs.prsr_reg_merge_rspec.cpp: [ defn, name=regs.all.parse_merge, + 
-Iregs.prsr_reg_merge_rspec.h ] + mau_addrmap: + regs.mau_addrmap.h: [ decl, name=regs.match_action_stage.%02x ] + regs.mau_addrmap.cpp: [ defn, name=regs.match_action_stage.%02x, + -Iregs.mau_addrmap.h ] + # dprsr_reg_rspec + dprsr_inp: + regs.dprsr_inp.h: [ decl, name=regs.all.deparser.input_phase, global=fde_pov ] + regs.dprsr_inp.cpp: [ defn, name=regs.all.deparser.input_phase, global=fde_pov, + -Iregs.dprsr_inp.h ] + #dprsr_out_ingr: {} + #dprsr_out_egr: {} + dprsr_hdr: + regs.dprsr_hdr.h: [ decl, name=regs.all.deparser.header_phase, global=fde_phv ] + regs.dprsr_hdr.cpp: [ defn, name=regs.all.deparser.header_phase, global=fde_phv, + -Iregs.dprsr_hdr.h ] +ignore: + memories: + - mau_addrmap + # pipe_top_level + - tm_pre_mem_rspec + - party_pgr_mem_rspec + regs: + # tofino + - dvsl_addrmap + - mac_addrmap + - serdes_addrmap + # pipe_addrmap + # pmarb_rspec + # ebp_rspec + - egrNx_regs + # parb_regs + - pbus_station_regs + - party_pgr_reg_rspec + - party_glue_reg_rspec + # dprsr_reg_rspec + - mir_buf_all + - dprsr_out_ingr + - dprsr_out_egr + # dprsr_hdr + # dprsr_hi_mem + - dprsr_h_pv_table_map diff --git a/backends/tofino/bf-asm/tofino/ternary_match.cpp b/backends/tofino/bf-asm/tofino/ternary_match.cpp new file mode 100644 index 00000000000..e18bdf96723 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/ternary_match.cpp @@ -0,0 +1,55 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backends/tofino/bf-asm/tofino/ternary_match.h" + +#include "backends/tofino/bf-asm/stage.h" + +void Target::Tofino::TernaryMatchTable::pass1() { + ::TernaryMatchTable::pass1(); + // Dont allocate id (mark them as used) for empty ternary tables (keyless + // tables). Keyless tables are marked ternary with a tind. They are setup by + // the driver to always miss (since there is no match) and run the miss + // action. The miss action is associated with the logical table space and + // does not need a tcam id association. This saves tcams ids to be assigned + // to actual ternary tables. This way we can have 8 real ternary match + // tables within a stage and not count the keyless among them. + // NOTE: The tcam_id is never assigned for these tables and will be set to + // default (-1). We also disable registers associated with tcam_id for this + // table. + if (layout_size() != 0) { + alloc_id("tcam", tcam_id, stage->pass1_tcam_id, TCAM_TABLES_PER_STAGE, false, + stage->tcam_id_use); + physical_ids[tcam_id] = 1; + } + // alloc_busses(stage->tcam_match_bus_use); -- now hardwired +} + +void Target::Tofino::TernaryIndirectTable::pass1() { + ::TernaryIndirectTable::pass1(); + alloc_busses(stage->tcam_indirect_bus_use, Layout::TIND_BUS); +} + +void Target::Tofino::TernaryMatchTable::check_tcam_match_bus( + const std::vector &layout) { + for (auto &row : layout) { + if (row.bus.empty()) continue; + for (auto &tcam : row.memunits) + if (row.bus.at(Table::Layout::SEARCH_BUS) != tcam.col) + error(row.lineno, "Tcam match bus hardwired to tcam column"); + } +} diff --git a/backends/tofino/bf-asm/tofino/ternary_match.h b/backends/tofino/bf-asm/tofino/ternary_match.h new file mode 100644 index 00000000000..cfb635443b1 --- /dev/null +++ b/backends/tofino/bf-asm/tofino/ternary_match.h @@ -0,0 +1,40 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOFINO_TERNARY_MATCH_H_ +#define BACKENDS_TOFINO_BF_ASM_TOFINO_TERNARY_MATCH_H_ + +#include "backends/tofino/bf-asm/tables.h" + +class Target::Tofino::TernaryMatchTable : public ::TernaryMatchTable { + friend class ::TernaryMatchTable; + TernaryMatchTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::TernaryMatchTable(line, n, gr, s, lid) {} + + void pass1() override; + void check_tcam_match_bus(const std::vector &); +}; + +class Target::Tofino::TernaryIndirectTable : public ::TernaryIndirectTable { + friend class ::TernaryIndirectTable; + TernaryIndirectTable(int line, const char *n, gress_t gr, Stage *s, int lid) + : ::TernaryIndirectTable(line, n, gr, s, lid) {} + + void pass1() override; +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_TOFINO_TERNARY_MATCH_H_ */ diff --git a/backends/tofino/bf-asm/top_level.cpp b/backends/tofino/bf-asm/top_level.cpp new file mode 100644 index 00000000000..d7d89d5f79c --- /dev/null +++ b/backends/tofino/bf-asm/top_level.cpp @@ -0,0 +1,126 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "top_level.h" + +#include "bfas.h" +#include "binary_output.h" +#include "bson.h" +#include "version.h" + +TopLevel *TopLevel::all = nullptr; + +TopLevel::TopLevel() { + BUG_CHECK(!all); + all = this; +} + +TopLevel::~TopLevel() { all = nullptr; } + +template +TopLevelRegs::TopLevelRegs() { + declare_registers(&this->mem_top, sizeof(this->mem_top), + [this](std::ostream &out, const char *addr, const void *end) { + out << "memories.top"; + this->mem_top.emit_fieldname(out, addr, end); + }); + declare_registers(&this->mem_pipe, sizeof(this->mem_pipe), + [this](std::ostream &out, const char *addr, const void *end) { + out << "memories.pipe"; + this->mem_pipe.emit_fieldname(out, addr, end); + }); + declare_registers(&this->reg_top, sizeof(this->reg_top), + [this](std::ostream &out, const char *addr, const void *end) { + out << "registers.top"; + this->reg_top.emit_fieldname(out, addr, end); + }); + declare_registers(&this->reg_pipe, sizeof(this->reg_pipe), + [this](std::ostream &out, const char *addr, const void *end) { + out << "registers.pipe"; + this->reg_pipe.emit_fieldname(out, addr, end); + }); +} + +template +TopLevelRegs::~TopLevelRegs() { + undeclare_registers(&this->mem_top); + undeclare_registers(&this->mem_pipe); + undeclare_registers(&this->reg_top); + undeclare_registers(&this->reg_pipe); +} + +template +void TopLevelRegs::output(json::map &ctxt_json) { + for (int i = 0; i < Target::NUM_PIPES(); i++) { + if (options.binary >= PIPE0 && options.binary != PIPE0 + i) { + 
this->mem_top.pipes[i].disable(); + this->reg_top.pipes[i].disable(); + } else { + this->mem_top.pipes[i].set("memories.pipe", &this->mem_pipe); + this->reg_top.pipes[i].set("regs.pipe", &this->reg_pipe); + } + } + if (options.condense_json) { + this->mem_top.disable_if_reset_value(); + this->mem_pipe.disable_if_reset_value(); + this->reg_top.disable_if_reset_value(); + this->reg_pipe.disable_if_reset_value(); + } + if (error_count == 0) { + if (options.gen_json) { + this->mem_top.emit_json(*open_output("memories.top.cfg.json")); + this->mem_pipe.emit_json(*open_output("memories.pipe.cfg.json")); + this->reg_top.emit_json(*open_output("regs.top.cfg.json")); + this->reg_pipe.emit_json(*open_output("regs.pipe.cfg.json")); + } + if (options.binary != NO_BINARY) { + auto binfile = open_output("%s.bin", TARGET::name); + json::map header; + header["asm_version"] = BFASM::Version::getVersion(); + if (ctxt_json["compiler_version"]) + header["compiler_version"] = ctxt_json["compiler_version"]->clone(); + header["reg_version"] = TARGET::top_level_regs::_regs_top::_reg_version; + if (ctxt_json["run_id"]) header["run_id"] = ctxt_json["run_id"]->clone(); + if (ctxt_json["program_name"]) + header["program_name"] = ctxt_json["program_name"]->clone(); + header["target"] = Target::name(); + header["stages"] = Target::NUM_MAU_STAGES(); + *binfile << binout::tag('H') << json::binary(header); + if (options.binary != ONE_PIPE) { + this->mem_top.emit_binary(*binfile, 0); + this->reg_top.emit_binary(*binfile, 0); + } else { + this->mem_pipe.emit_binary(*binfile, 0); + this->reg_pipe.emit_binary(*binfile, 0); + } + + if (options.multi_parsers) { + emit_parser_registers(this, *binfile); + } + } + } +} + +template +void TopLevelRegs::set_mau_stage(int stage, const char *file, + typename TARGET::mau_regs *regs, bool egress_only) { + BUG_CHECK(!egress_only, "separate egress MAU on target that does not support it"); + this->reg_pipe.mau[stage].set(file, regs); +} + +#define 
TOP_LEVEL_REGS(REGSET) template class TopLevelRegs; +FOR_ALL_REGISTER_SETS(TOP_LEVEL_REGS) diff --git a/backends/tofino/bf-asm/top_level.h b/backends/tofino/bf-asm/top_level.h new file mode 100644 index 00000000000..9c150607a71 --- /dev/null +++ b/backends/tofino/bf-asm/top_level.h @@ -0,0 +1,61 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_TOP_LEVEL_H_ +#define BACKENDS_TOFINO_BF_ASM_TOP_LEVEL_H_ + +#include "backends/tofino/bf-asm/json.h" +#include "backends/tofino/bf-asm/target.h" + +template +class TopLevelRegs; + +class TopLevel { + protected: + TopLevel(); + + public: + static TopLevel *all; + virtual ~TopLevel(); + virtual void output(json::map &) = 0; + static void output_all(json::map &ctxtJson) { all->output(ctxtJson); } + template + static TopLevelRegs *regs(); +#define SET_MAU_STAGE(TARGET) \ + virtual void set_mau_stage(int, const char *, Target::TARGET::mau_regs *, bool) { \ + BUG_CHECK(!"register mismatch"); \ + } + FOR_ALL_REGISTER_SETS(SET_MAU_STAGE) +}; + +template +class TopLevelRegs : public TopLevel, public REGSET::top_level_regs { + public: + TopLevelRegs(); + ~TopLevelRegs(); + + void output(json::map &); + void set_mau_stage(int stage, const char *file, typename REGSET::mau_regs *regs, + bool egress_only); +}; + +template +TopLevelRegs *TopLevel::regs() { + return dynamic_cast *>(all); +} + +#endif /* 
BACKENDS_TOFINO_BF_ASM_TOP_LEVEL_H_ */ diff --git a/backends/tofino/bf-asm/ubits.cpp b/backends/tofino/bf-asm/ubits.cpp new file mode 100644 index 00000000000..2b90cceb6d9 --- /dev/null +++ b/backends/tofino/bf-asm/ubits.cpp @@ -0,0 +1,83 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ubits.h" + +#include +#include + +#include "lib/hex.h" +#include "lib/log.h" + +struct regrange { + const char *base; + size_t sz; + std::function fn; +}; + +static std::map *registers; + +static regrange *find_regrange(const void *addr_) { + const char *addr = static_cast(addr_); + if (registers) { + auto it = registers->upper_bound(addr); + if (it != registers->begin()) { + it--; + if (addr <= it->second.base + it->second.sz) return &it->second; + } + } + return nullptr; +} + +void declare_registers(const void *addr_, size_t sz, + std::function fn) { + const char *addr = static_cast(addr_); + if (!registers) registers = new std::map(); + registers->emplace(addr, regrange{addr, sz, fn}); +} + +void undeclare_registers(const void *addr_) { + const char *addr = static_cast(addr_); + registers->erase(addr); + if (registers->empty()) { + delete registers; + registers = 0; + } +} + +void print_regname(std::ostream &out, const void *addr, const void *end) { + if (auto rr = find_regrange(addr)) + rr->fn(out, static_cast(addr), end); + else + out << "???"; +} + +std::string 
string_regname(const void *addr, const void *end) { + std::stringstream tmp; + print_regname(tmp, addr, end); + return tmp.str(); +} + +void ubits_base::log(const char *op, uint64_t v) const { + if (LOGGING(1)) { + std::ostringstream tmp; + if (!find_regrange(this)) return; + LOG1(this << ' ' << op << ' ' << v + << (v != value ? tmp << " (now " << value << ")", tmp : tmp).str() << " (0x" + << hex(value) << ")"); + } +} diff --git a/backends/tofino/bf-asm/ubits.h b/backends/tofino/bf-asm/ubits.h new file mode 100644 index 00000000000..adc4248f955 --- /dev/null +++ b/backends/tofino/bf-asm/ubits.h @@ -0,0 +1,178 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_UBITS_H_ // NOLINT(build/header_guard) +#define BACKENDS_TOFINO_BF_ASM_UBITS_H_ + +#include +#include +#include + +#include +#include +#include + +#include "lib/bitvec.h" +#include "lib/log.h" + +using namespace P4; + +void declare_registers(const void *addr, size_t sz, + std::function fn); +void undeclare_registers(const void *addr); +void print_regname(std::ostream &out, const void *addr, const void *end); +std::string string_regname(const void *addr, const void *end); + +struct ubits_base; + +struct ubits_base { + uint64_t value, reset_value; + mutable bool read, write; + mutable bool disabled_; + + ubits_base() : value(0), reset_value(0), read(false), write(false), disabled_(false) {} + explicit ubits_base(uint64_t v) + : value(v), reset_value(v), read(false), write(false), disabled_(false) {} + operator uint64_t() const { + read = true; + return value; + } + bool modified() const { return write; } + void set_modified(bool v = true) { write = v; } + bool disabled() const { return disabled_; } + bool disable_if_unmodified() { return write ? false : (disabled_ = true); } + bool disable_if_zero() const { return value == 0 && !write; } + bool disable_if_reset_value() { return value == reset_value ? 
(disabled_ = true) : false; } + bool disable() const { + if (write) { + LOG1("ERROR: Disabling modified register in " << this); + return false; + } + disabled_ = true; + return disabled_; + } + void enable() const { disabled_ = false; } + void rewrite() { write = false; } + virtual uint64_t operator=(uint64_t v) = 0; + virtual const ubits_base &operator|=(uint64_t v) = 0; + virtual unsigned size() = 0; + void log(const char *op, uint64_t v) const; +}; + +inline std::ostream &operator<<(std::ostream &out, const ubits_base *u) { + print_regname(out, u, u + 1); + return out; +} + +template +struct ubits : ubits_base { + ubits() : ubits_base() {} + const ubits &check(std::true_type) { + if (value >= (uint64_t(1) << N)) { + LOG1("ERROR: out of range for " << N << " bits in " << this); + value &= (uint64_t(1) << N) - 1; + } + return *this; + } + const ubits &check(std::false_type) { return *this; } + const ubits &check() { + return check(std::integral_constant{}); + } + explicit ubits(uint64_t v) : ubits_base(v) { check(); } + ubits(const ubits &) = delete; + ubits(ubits &&) = default; + uint64_t operator=(uint64_t v) override { + if (disabled_) LOG1("ERROR: Writing disabled register value in " << this); + if (write) + LOG1((value != v ? 
"ERROR:" : "WARNING:") + << " Overwriting " << value << " with " << v << " in " << this); + value = v; + write = true; + log("=", v); + check(); + return v; + } + const ubits &operator=(const ubits &v) { + *this = v.value; + v.read = true; + return v; + } + const ubits_base &operator=(const ubits_base &v) { + *this = v.value; + v.read = true; + return v; + } + unsigned size() override { return N; } + const ubits &operator|=(uint64_t v) override { + if (disabled_) LOG1("ERROR: Writing disabled register value in " << this); + if (write && (v & value) != 0) + LOG1("WARNING: Overwriting " << value << " with " << (v | value) << " in " << this); + value |= v; + write = true; + log("|=", v); + return check(); + } + const ubits &operator|=(bitvec v) { + if (disabled_) LOG1("ERROR: Writing disabled register value in " << this); + if (v.ffs(N) > 0) + LOG1("ERROR: bitvec 0x" << v << " out of range for " << N << " bits in " << this); + uint64_t val = v.getrange(0, N); + if (write && (val & value) != 0) + LOG1("WARNING: Overwriting " << value << " with " << (val | value) << " in " << this); + value |= val; + write = true; + log("|=", val); + return check(); + } + const ubits &operator+=(uint64_t v) { + if (disabled_) LOG1("ERROR: Overwriting disabled register value in " << this); + value += v; + write = true; + log("+=", v); + return check(); + } + const ubits &operator^=(uint64_t v) { + if (disabled_) LOG1("ERROR: Overwriting disabled register value in " << this); + value ^= v; + write = true; + log("^=", v); + return check(); + } + const ubits &set_subfield(uint64_t v, unsigned bit, unsigned size) { + if (disabled_) LOG1("ERROR: Overwriting disabled register value in " << this); + uint64_t mask = (1ULL << size) - 1; + uint64_t oldv = (value >> bit) & mask; + if (bit + size > N) { + LOG1("ERROR: subfield " << bit << ".." << (bit + size - 1) << " out of range in " + << this); + } else if (write && oldv) { + LOG1((v != oldv ? 
"ERROR" : "WARNING") + << ": Overwriting subfield(" << bit << ".." << (bit + size - 1) << ") value " + << oldv << " with " << v << " in " << this); + } + if (v > mask) { + LOG1("ERROR: Subfield value " << v << " too large for " << size << " bits in " << this); + v &= mask; + } + value |= v << bit; + write = true; + log("|=", v << bit); + return check(); + } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_UBITS_H_ */ diff --git a/backends/tofino/bf-asm/vector.c b/backends/tofino/bf-asm/vector.c new file mode 100644 index 00000000000..7a4df9f36f7 --- /dev/null +++ b/backends/tofino/bf-asm/vector.c @@ -0,0 +1,116 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include "vector.h" + +struct raw_vector { + int capacity, size; + void *data; +}; + +int init_raw_vector(void *vec, size_t elsize, int mincap) +{ + struct raw_vector *v = (struct raw_vector *)vec; + v->size = 0; + v->capacity = 32 / elsize; + if (v->capacity < 4) v->capacity = 4; + if (v->capacity < mincap) v->capacity = mincap; + if (!(v->data = malloc(elsize * v->capacity))) + v->capacity = 0; + return v->data ? 
0 : -1; +} + +int erase_raw_vector(void *vec, size_t elsize, int i, unsigned cnt) +{ + struct raw_vector *v = (struct raw_vector *)vec; + if (i < 0 && i >= v->size) return -1; + if (cnt == 0) cnt = 1; + if (i + cnt >= (unsigned)v->size) { + v->size = i; + } else { + char *p = (char *)v->data + i*elsize; + memmove(p, p + elsize*cnt, elsize * (v->size - i - cnt)); + v->size -= cnt; } + return 0; +} + +int expand_raw_vector(void *vec, size_t elsize) +{ + struct raw_vector *v = (struct raw_vector *)vec; + size_t ncap = v->capacity * 2U; + void *n; + if (ncap == 0) { + ncap = 32 / elsize; + if (ncap < 4) ncap = 4; } + if (ncap > (size_t)INT_MAX && (int)(ncap = INT_MAX) == v->capacity) { + errno = ERANGE; + return -1; } + if (!(n = realloc(v->data, elsize * ncap))) return -1; + v->capacity = ncap; + v->data = n; + return 0; +} + +int insert_raw_vector(void *vec, size_t elsize, int i, unsigned cnt) +{ + struct raw_vector *v = (struct raw_vector *)vec; + if (i < 0 && i > v->size) return -1; + if (cnt == 0) cnt = 1; + if (v->size + cnt > (unsigned)INT_MAX) { + errno = ERANGE; + return -1; } + if ((int)(v->size + cnt) > v->capacity) { + int newsz = v->size + cnt; + void *n; + if (newsz < v->capacity * 2) newsz = v->capacity * 2; + if (!(n = realloc(v->data, elsize * newsz))) return -1; + v->capacity = newsz; + v->data = n; } + if (i < v->size) { + char *p = (char *)v->data + i*elsize; + memmove(p + cnt*elsize, p, elsize * (v->size - i)); } + v->size += cnt; + return 0; +} + +int reserve_raw_vector(void *vec, size_t elsize, int size, int shrink) +{ + struct raw_vector *v = (struct raw_vector *)vec; + void *n; + if (v->capacity < size || (shrink && v->capacity > size)) { + if (!(n = realloc(v->data, elsize * size))) return -1; + v->capacity = size; + if (size < v->size) + v->size = size; + v->data = n; } + return 0; +} + +int shrink_raw_vector(void *vec, size_t elsize) +{ + struct raw_vector *v = (struct raw_vector *)vec; + void *n; + if (v->size < v->capacity) { + if (!(n = 
realloc(v->data, elsize * v->size))) return -1; + v->capacity = v->size; + v->data = n; } + return 0; +} diff --git a/backends/tofino/bf-asm/vector.h b/backends/tofino/bf-asm/vector.h new file mode 100644 index 00000000000..520fa636058 --- /dev/null +++ b/backends/tofino/bf-asm/vector.h @@ -0,0 +1,229 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_VECTOR_H_ +#define BACKENDS_TOFINO_BF_ASM_VECTOR_H_ + +/* C code and macros for VECTOR objects similar to C++ std::vector */ +#include + +#define CAT(A, B) A##B +#define VECTOR(NAME) CAT(NAME, _VECTOR) +#define DECLARE_VECTOR(TYPE, ...) \ + typedef struct CAT(TYPE, _VECTOR) { \ + int capacity, size; \ + TYPE *data; \ + __VA_ARGS__ \ + } CAT(TYPE, _VECTOR); +#define DECLARE_VECTOR2(NAME, ELTYPE, ...) 
\ + typedef struct CAT(NAME, _VECTOR) { \ + int capacity, size; \ + ELTYPE *data; \ + __VA_ARGS__ \ + } CAT(NAME, _VECTOR); + +#define RAW(X) X + +/* VECTOR constructors/destrutor + * can safely use memset(&vec, 0, sizeof(vec)) for initial capacity of 0, + * so global and calloc'd VECTORs are safe to use immediately + * local and malloc's VECTORs must be initialized before use, as they may + * contain garbage */ + +/* VECTOR_init(vec, capacity) + * initialize an empty vector with optional initial capacity + * VECTOR_initcopy(vec, from) + * initialize a vector as a copy of an existing vector + * VECTOR_initN(vec, val1, ...) + * initialize a vector with N values + * RETURNS + * 0 success + * -1 failure (out of memory), vector has capacity 0 + */ +#define VECTOR_init(vec, ...) init_raw_vector(&(vec), sizeof((vec).data[0]), RAW(__VA_ARGS__ + 0)) + +#define VECTOR_initcopy(vec, from) \ + (init_raw_vector(&(vec), sizeof((vec).data[0]), (from).size) \ + ? -1 \ + : (memcpy((vec).data, (from).data, ((vec).size = (from).size) * sizeof((vec).data[0])), \ + 0)) + +#define VECTOR_init1(vec, v1) \ + (init_raw_vector(&(vec), sizeof((vec).data[0]), 1) \ + ? -1 \ + : ((vec).size = 1, (vec).data[0] = (v1), 0)) +#define VECTOR_init2(vec, v1, v2) \ + (init_raw_vector(&(vec), sizeof((vec).data[0]), 2) \ + ? -1 \ + : ((vec).size = 2, (vec).data[0] = (v1), (vec).data[1] = (v2), 0)) +#define VECTOR_init3(vec, v1, v2, v3) \ + (init_raw_vector(&(vec), sizeof((vec).data[0]), 3) \ + ? -1 \ + : ((vec).size = 3, (vec).data[0] = (v1), (vec).data[1] = (v2), (vec).data[2] = (v3), 0)) +#define VECTOR_init4(vec, v1, v2, v3, v4) \ + (init_raw_vector(&(vec), sizeof((vec).data[0]), 4) \ + ? -1 \ + : ((vec).size = 3, (vec).data[0] = (v1), (vec).data[1] = (v2), (vec).data[2] = (v3), \ + (vec).data[3] = (v4), 0)) +#define VECTOR_init5(vec, v1, v2, v3, v4, v5) \ + (init_raw_vector(&(vec), sizeof((vec).data[0]), 5) \ + ? 
-1 \ + : ((vec).size = 3, (vec).data[0] = (v1), (vec).data[1] = (v2), (vec).data[2] = (v3), \ + (vec).data[3] = (v4), (vec).data[4] = (v5), 0)) + +#define EMPTY_VECTOR_INIT \ + { 0, 0, 0 } + +/* VECTOR_fini(vec) + * destroys a vector, freeing memory + * RETURNS + * void + */ +#define VECTOR_fini(vec) free((vec).data) + +/* VECTOR methods */ + +/* VECTOR_add(vec, val) + * add a single value to the end of a vector, increasing its size (and + * capacity if necessary) + * VECTOR_addcopy(vec, ptr, n) + * add a multiple value to the end of a vector, increasing its size (and + * capacity as necessary) + * VECTOR_copy(vec, from) + * replace a vector with a copy of another vector + * RETURNS + * 0 success + * -1 failure (out of memory), vector is unchanged + */ +#define VECTOR_add(vec, val) \ + (((vec).size == (vec).capacity && expand_raw_vector(&(vec), sizeof((vec).data[0]))) \ + ? -1 \ + : ((vec).data[(vec).size++] = (val), 0)) +#define VECTOR_addcopy(vec, ptr, n) \ + (VECTOR_reserve(vec, (vec).size + (n)) \ + ? -1 \ + : (memcpy((vec).data + (vec).size, (ptr), (n) * sizeof((vec).data[0])), \ + (vec).size += (n), 0)) +#define VECTOR_copy(vec, from) \ + (VECTOR_reserve(vec, (from).size) \ + ? -1 \ + : (memcpy((vec).data, (from).data, (from).size * sizeof((vec).data[0])), \ + (vec).size = (from).size, 0)) + +#define VECTOR_begin(vec) ((vec).data) +#define VECTOR_end(vec) ((vec).data + (vec).size) +#define VECTOR_empty(vec) ((vec).size == 0) + +/* VECTOR_erase(vec, idx, cnt) + * erase cnt elements from a vector (defaults to 1). If there are fewer + * than cnt elements in the vector after idx (inclusive), all will be + * erased + * RETURNS + * 0 success + * -1 idx is out of range + */ +#define VECTOR_erase(vec, idx, ...) \ + erase_raw_vector(&(vec), sizeof((vec).data[0]), idx, RAW(__VA_ARGS__ + 0)) + +/* VECTOR_expand(vec) + * increase the capacity of a vector, if possible. 
Does not affect the size + * RETURNS + * 0 success + * -1 failure (out of memory), vector is unchanged + */ +#define VECTOR_expand(vec) expand_raw_vector(&(vec), sizeof((vec).data[0])) + +/* VECTOR_foreach(vec, apply) + * apply a function or macro to every element of a vector + * not a valid expression, so doesn't really return anything + */ +#define VECTOR_foreach(vec, apply) \ + do { \ + for (int i_ = 0; i_ < (vec).size; i_++) { \ + apply((&(vec).data[i_])); \ + } \ + } while (0) + +/* VECTOR_insert(vec, idx, cnt) + * increase the size of a vector, adding uninitialized space at idx, and + * moving later elements of the vector up. cnt defaults to 1 + * RETURNS + * 0 success + * -1 failure -- idx is out of range[ERANGE], or out of memeory[ENOMEM] + * vector is unchanged + */ +#define VECTOR_insert(vec, idx, ...) \ + insert_raw_vector(&(vec), sizeof((vec).data[0]), idx, RAW(__VA_ARGS__ + 0)) + +#define VECTOR_pop(vec) ((vec).data[--(vec).size]) +#define VECTOR_push(vec, val) VECTOR_add(vec, val) + +/* VECTOR_reserve(vec, size, shrink) + * change the capacity of a vector. If shrink is false (default), will only + * increase the capacity. + * RETURNS + * 0 success + * -1 failure (out of memory), vector is unchanged + */ +#define VECTOR_reserve(vec, size, ...) \ + reserve_raw_vector(&(vec), sizeof((vec).data[0]), size, RAW(__VA_ARGS__ + 0)) + +/* VECTOR_resize(vec, size, shrink) + * change the size of a vector. If shrink is false (default), will only + * increase the capacity. + * RETURNS + * 0 success + * -1 failure (out of memory), vector is unchanged + */ +#define VECTOR_resize(vec, sz, ...) \ + (VECTOR_reserve(vec, sz, __VA_ARGS__) ? 
-1 : ((vec).size = (sz), 0)) + +/* VECTOR_shrink_to_fit(vec) + * reduce capacity to match the size, releasing memory if possible + * RETURNS + * 0 success + * -1 failure (realloc failed to shrink?), vector is unchanged + */ +#define VECTOR_shrink_to_fit(vec) shrink_raw_vector(&(vec), sizeof((vec).data[0])) + +/* VECTOR_terminate(vec, val) + * ensure that capacity is greater than size, and store val after + * the end of the vector. + * RETURNS + * 0 success + * -1 failure (out of memory), vector is unchanged + */ +#define VECTOR_terminate(vec, val) \ + (((vec).size == (vec).capacity && expand_raw_vector(&(vec), sizeof((vec).data[0]))) \ + ? -1 \ + : ((vec).data[(vec).size] = (val), 0)) +#define VECTOR_top(vec) ((vec).data[(vec).size - 1]) + +#ifdef __cplusplus +extern "C" { +#endif +extern int erase_raw_vector(void *vec, size_t elsize, int idx, unsigned cnt); +extern int expand_raw_vector(void *vec, size_t elsize); +extern int init_raw_vector(void *vec, size_t elsize, int mincap); +extern int insert_raw_vector(void *vec, size_t elsize, int idx, unsigned cnt); +extern int reserve_raw_vector(void *vec, size_t elsize, int size, int shrink); +extern int shrink_raw_vector(void *vec, size_t elsize); +#ifdef __cplusplus +} +#endif + +#endif /* BACKENDS_TOFINO_BF_ASM_VECTOR_H_ */ diff --git a/backends/tofino/bf-asm/version.h b/backends/tofino/bf-asm/version.h new file mode 100644 index 00000000000..d4fd9442c71 --- /dev/null +++ b/backends/tofino/bf-asm/version.h @@ -0,0 +1,44 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include + +namespace BFASM { +// Singleton class representing the assembler version +class Version { + public: + static const std::string getVersion() { + static Version v; + return std::to_string(v.major) + "." + std::to_string(v.minor) + "." + + std::to_string(v.patch); + } + + private: + static constexpr int major = 1; + static constexpr int minor = 0; + static constexpr int patch = 1; + + Version() {} + + public: + // disable any other constructors + Version(Version const &) = delete; + void operator=(Version const &) = delete; +}; + +} // namespace BFASM diff --git a/backends/tofino/bf-asm/walle/README.md b/backends/tofino/bf-asm/walle/README.md new file mode 100644 index 00000000000..2e9b6a4b16a --- /dev/null +++ b/backends/tofino/bf-asm/walle/README.md @@ -0,0 +1,263 @@ +Walle - JSON-to-binary cruncher tool +==================================================== + +Walle serves as a layer of abstraction between the Tofino compiler and chip, +presenting the chip's memory hierarchy to the compiler as a set of JSON +structures that contain register/memory names and their values, while +abstracting away the actual addresses of these registers and the methods by +which they are programmed (DMA/direct PCIe writes/indirect instruction lists). + +Walle stores the exact structure of the chip's memory hierarchy in a +"chip.schema" file, which has to be generated from raw register data whenever +the chip registers change, and is then used afterwards to crunch compiler output +into a binary config file. It can also be used to generate "template" JSON that +looks like compiler output with the hardware's default values for all fields (in +most cases, 0). 
These templates are used by the compiler to enforce the correct +structure on its output data, and also in general should be regenerated whenever +the chip's registers change. + +Using Walle +---------------------------------------------------- +### Basic usage +#### Generating a schema +First, generate a chip schema. Invoke Walle with the `--generate-schema` flag +followed by the directory containing raw CSV files output by csrCompiler. If +the bfnregs repo is cloned into ~/bfnregs, this would be: + + ./walle.py --generate-schema ~/bfnregs/modules/tofino_regs/module/csv/ + +This will generate a file named `chip.schema` in the current working directory, +which is where it will look for the chip schema by default. The +`--schema SCHEMA-FILE` flag can be used to point Walle to a different schema, +or a different location to output the schema it is generating. + +#### Crunching compiler output +The most common use case for Walle is taking multiple config JSONs and +crunching them into a binary file Tofino's drivers can read. Just invoke +Walle with the names of all relevant JSON files, and optionally the name of +the file to output: + + ./walle.py cfg1.json cfg2.json cfg3.json -o chip_config.bin + +If the compiler was set up to dump all of its config output into an otherwise +empty directory, shell wildcards can be used to shorten this command. If that +dir is called 'cfgs', this would look like: + + ./walle.py cfgs/*.json -o chip_config.bin + +#### Generating templates +Walle can be used to generate blank register templates to be filled in by the +compiler. These templates are the JSON files that Walle would expect to see +given the current chip schema, but with all of the data set to the corresponding +hardware register's power-on default (in most cases, 0). + +To do so, Walle must be fed a JSON file enumerating the Semifore addressmap +objects it should generate templates for. 
This file must take this structure: + + { + "generated": { + "memories":[ + // memory addressmap names + ], + "regs":[ + // register addressmap names + ] + }, + "ignored": { + "memories":[ + // memory addressmap names + ], + "regs":[ + // register addressmap names + ] + } + } + +Names under 'memories' keys refer to addressmaps included by the top-level +'pipe_top_level.csr' file, while names under 'regs' keys refer to those included +by 'tofino.csr'. + +Address maps listed under 'generated' will cause a JSON template file to be +generated. Wherever that address map appears elsewhere in the hierarchy will be +replaced with a string reference to said JSON file. + +Address maps listed under 'ignored' will be replaced with a 0 when they appear +elsewhere in the hierarchy, and no JSON template file will be generated. + +Use the `--generate-templates` flag followed by the path to a file of the +format just discussed to generate template JSONs in a directory called +`templates` (which will be created in the working directory if it doesn't +already exist): + + ./walle.py --generate-templates templates_file + +These files can then be copied to the compiler's source tree. + +The templates themselves end in the extension '.cfg.json'. Walle will also +generate an identical hierarchy containing the bit-widths of each field, and +these files end in the extension '.size.json'. + +### Advanced usage +#### Directing the crunch process +Walle crunches by first loading all provided JSON files and verifying them +against its chip schema, and then drilling down from specified "top-level" +points in that cloud of JSON data. By default, these points are called +`memories.top` and `regs.top` and represent the memory and register hierarchies +of the chip, respectively. + +The `--top NAME` flag can be used to manually specify the top-level points to +drill down from. Multiple `--top NAME` flags can be included, and if any are +present the default top-level names are not used. 
+ +This is equivalent to the default behavior: + + ./walle.py cfgs/*.json --top memories.top --top regs.top -o chip_config.bin + +One of them can be left out to only generate, say, only register configuration: + + ./walle.py cfgs/*.json --top regs.top -o chip_config.bin + +Walle calculates addresses relative to the top-level points specified, so it is +important that these points only ever refer to actual top-level points in the +Semifore register hierarchy. If it is desired to only generate, for example, +config data for the MAU or one pipe, the top-level JSON files should be +hand-tweaked to disable other parts of the configuration binary. See the +specification of the JSON config format for more details. + +#### Directing the template generation process +Walle generates a template file for each addressmap type specified in the +`template_objects` file which sits in the same folder as the Walle script. If +Walle encounters an instance of these addressmap types during template +generation, it leaves that tree of the JSON data unexpanded and replaces it +with a string indicating it expects a template to be plugged in to that +location. + +The type names of these addressmaps can be found by viewing the Semifore HTML +output of the reg and memory hierarchies and checking 'Header File Information' +at the top of the page. The 'Type Name' field that then appears within each +address map indicates the type which should be passed to Walle for +templatization. Semifore incorrectly capitlizes the first letter of the type +name - it should be all lowercase when specified to Walle. + +Note that the JSON fed to Walle does *not* have to follow the same template +structure as specified in the `template_objects` file - this templatization +control is just for convenience and reducing the file size of the generated +blank templates. 
+ +Configuration JSON format +---------------------------------------------------- +Walle consumes JSON files that specify values to be written registers named in +the chip's Semifore specification. The structure of these JSON files directly +mirrors the structure found in the chip's Semifore specification. + +Each JSON file contains a dictionary that represents one instance of a Semifore +addressmap. Addressmap dictionaries' keys represent the Semifore names of +registers and nested addressmaps, while the values are either: + + * Dictionaries representing those objects + * Lists of dictionaries, in the case the object in question is an array + * Lists of lists (of lists of lists of lists of...) in the case the object + in question is an N-dimensional array + +Register dictionaries have field names as keys and integers as values. They +follow the same rules for lists in the event of a field array. The outer-most +dictionary also has these special Walle keys: + + * `_type` : The full type name that this file provides values for, of the + form `section.semifore_type`. For example, the parser's memories are of + type `memories.prsr_mem_rspec`, while its registers are of type + `regs.prsr_reg_rspec` + * `_name` : A name used to reference this file and its data elsewhere in + the config JSON + * `_schema_hash`: The MD5 hash of the raw Semifore output used to generate + the chip schema from which this file's structure was derived, used + to ensure the chip schema and JSON input match + * `_reg_version`: The git tag of the bfnregs repo commit used to generate + the chip schema from which this file's structure was derived. This value + isn't used by Walle itself, but is useful to manually determine which + version of the compiler or model a given config JSON was created for. 
+ +At any point in the hierarchy, a register/addressmap value may be replaced +with: + + * A string containing the name of another JSON input file, which "stamps" + that other data down at this point in the memory hierarchy + * 0, indicating no write operation should be generated for the given object + +Fields cannot be "disabled with 0s" the way registers and addressmaps can, +since the register is the level of granularity at which the drivers write data. + +Config JSON can be hand-tweaked with 0's to produce a binary blob that +only writes to specific registers and leaves everything else alone, in order +to produce "initial boot" config blobs and then "soft reboot" config blobs. + +#### Error checking +Walle will fail to generate output if: + + * A field value ever exceeds the field's bit width as specified in the chip + schema + * A template is instantiated at a point in the hierarchy that does not + match the type expected by the chip schema (eg, naming an instance of + `memories.prsr_mem_rspec` in the top-level *register* JSON) + * A file's `_schema_hash` value does not match the hash stored in the chip + schema. This check can be suppressed with the flag + `--ignore-schema-mismatch`: + + ./walle.py cfgs/*.json --ignore-schema-mismatch -o chip_config.bin + + This flag is provided for development purposes, because even a small + change at one end of the register hierarchy (like correcting a typo in a + register *description*) will change the hash without actually affecting + the structure of the chip schema, and it would be a pain to have to + regenerate all templates and copy them over into the compiler source tree + just to get things working again. + + In the long run, however, this flag should not be used and schema hashes + should be consistent. 
+ +Binary blob format +---------------------------------------------------- +Walle generates a sequence of binary write instructions for the driver which +are of the following types: + + * Direct register write - For 32 bit registers that can be addressed + directly from the PCIe bus, a simple address-data pair of the form: + + 4 bytes: "\0\0\0R" + 4 bytes: 32-bit PCIe address + 4 bytes: Data + All fields little-endian + + * Indirect register write - For registers wider than 32 bits, or to compose + many direct register writes into one write list that can be transmitted + across PCIe as a single transaction. + + TODO: not actually implemented driver-or-Walle-side yet, since the model + doesn't currently support indirect reg addressing + + * DMA block write - Automatically chosen for arrays of registers larger + than 4 elements, a base address and block of data: + + 4 bytes: "\0\0\0D" + 8 bytes: 42-bit chip address + 4 bytes: Bit-length of word + 4 bytes: Number of words + Following: Data, in 32-bit word chunks + All fields little-endian + + TODO: currently only registers in the 'memories' half of the hierarchy + will get rolled into DMA blocks, again because the model doesn't + currently support indirect reg addressing. Eventually this won't + be a problem + +The driver should execute these instructions in the order they are read. The +binary blob has no header or structure aside from these write instructions, +so multiple binary files can be concatenated together or split into parts as +needed. + +Walle can be optionally instructed to generate a direct register write to +address 0xFFFFFFFF at the very end of the file to signify to the model the end +of configuration data. 
This is enabled with the flag `--append-sentinel`: + + ./walle.py cfgs/*.json --append-sentinel -o chip_config.bin + diff --git a/backends/tofino/bf-asm/walle/chip.py b/backends/tofino/bf-asm/walle/chip.py new file mode 100644 index 00000000000..8156a5ee62c --- /dev/null +++ b/backends/tofino/bf-asm/walle/chip.py @@ -0,0 +1,183 @@ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 + +""" +TODO: document this file +""" +import struct +from copy import copy + + +class chip_object(object): + """ + TODO: docstring + """ + + def __init__(self, addr, src_key): + self.addr = addr + self.src_key = src_key + + def add_offset(self, offset): + self.addr += offset + + +class direct_reg(chip_object): + """ + A single register write operation, of the format: + + 4 bytes: "\0\0\0R" + 4 bytes: 32-bit PCIe address + 4 bytes: Data + + All fields little-endian + """ + + def __init__(self, addr, value, src_key=None): + chip_object.__init__(self, addr, src_key) + self.value = struct.pack("= 0: + byte_str += ( + "\0\0\0R" + + struct.pack("" + + def deepcopy(self): + new = copy(self) + new.values = new.values[:] + return new + + def bytes(self): + if self.width > 128: + # FIXME: this only works cleanly if width is a multiple of 128, can it be otherwise? 
+ if self.width % 128 != 0: + sys.stderr.write("ERROR: register width %d not a multiple of 128" % self.width) + sys.exit(1) + new_values = [] + for value in self.values: + for chunk in range(0, self.width // 8, 16): + new_values.append(value[chunk : chunk + 16].rjust(128 // 8, chr(0))) + self.values = new_values + self.width = 128 + + if self.is_reg: + op_type = "\0\0\0B" + else: + op_type = "\0\0\0D" + bytestr = ( + op_type + + struct.pack(" 0: + outfile.write(indent) + outfile.write("// ") + outfile.write(' ' * pfx) + line = line.lstrip() + pt = line.rfind(' ', 0, maxlen - pfx - len(indent)) + if len(line) + len(indent) + pfx > maxlen: + if pt > 0: + outfile.write(line[0:pt]) + line = line[pt + 1 :] + else: + # line is longer than maxlen, but has no spaces. So don't split it or + # subsequent lines (this is probably a wide table with columns) + maxlen = len(line) + len(indent) + pfx + outfile.write(line) + line = '' + else: + outfile.write(line) + line = '' + outfile.write("\n") + + +def indent_comment(indent, text): + if not text: + return text + if text[-2:-1] != '\n': + text = text + '\n' + return indent + text.replace('\n', indent + '\n') + + +######################################################################## +## Structures + + +class CsrException(Exception): + """ + An exception that occured while crunching malformed data according to the + given chip schema. An exception handler in walle.py will catch these + exceptions and then attempt to print a "traceback" recording where in the + chip schema the exception occured. + + This traceback is maintained by keeping a local variable called 'path' in + any scope where a CsrException may be raised. 'path' is a list of + traversal_history objects. + """ + + pass + + +class traversal_history(object): + """ + A class which records part of Walle's traversal through input JSON data. A + single traversal_history corresponds to the traversal of one JSON file. 
+ + Attributes: + @attr template_name + The value at the top level "_name" key of the file currently being + processed + @attr path + An ordered list of keys and list indices visited in the current + traversal of this file. + If we drill down into a dictionary at key "a", push "a" onto the list. + If we access elements of a list, push a tuple recording the index at + each dimension of the list and then the list name. Eg, + [(4,),"a"] to represent a[4] + [(1,2,3),"b"] to represent b[1][2][3] + """ + + def __init__(self, template_name): + self.template_name = template_name + self.path = [] + + +class binary_cache(object): + """ + A class used to store flat chip_obj lists, each corresponding to one JSON + file. The "_name" at the top of each JSON is used to index into the cache. + Requesting a JSON file from here will crunch it into binary if it hasn't + been already. + """ + + def __init__(self, schema): + self.schema = schema + self.templates = {} + self.binary_templates = {} + + def get_type(self, key): + """ + Get the addressmap name that the binary data at the given key + corresponds to, of the form "section_name.addressmap_name". + """ + return self.templates[key]["_type"] + + def get_data(self, key, path=None): + """ + Return a list of objects inheriting from chip.chip_obj, representing + the write operations that must be done in the hardware to program a + hardware object of the given JSON file's "_type" + + These lists are a deep copy of the one stored internally, so it is safe + to modify them + """ + if path == None: + path = [] + + if key not in self.binary_templates: + obj_section, obj_type = self.templates[key]["_type"].split(".") + obj_schema = self.schema[obj_section][obj_type] + # TODO: There used to be a first deepcopy here, before the one in the + # return statement. 
99% sure it was unnecessary, but if things + # seem broken revisit this + path.append(traversal_history(key)) + self.binary_templates[key] = obj_schema.generate_binary(self.templates[key], self, path) + path.pop() + + binary_data_copy = [] + for chip_obj in self.binary_templates[key]: + binary_data_copy.append(chip_obj.deepcopy()) + return binary_data_copy + + +class csr_object(object): + """ + Base class for objects in a Semifore register hierarchy + + A Semifore object array is still represented as one csr_object instance, + albeit with a "count" attribute expressing how many hardware objects this + Semifore node actually corresponds to. + + Since all objects in Semifore have names and can be arrays, all csr_objects + have name and count attributes. Since arrays can be multidimensional, + count is _always_ a tuple of array sizes, even if that tuple has only one + element. Single elements will have a count of (1,). + """ + + def __init__(self, name, count): + self.name = name + self.count = count + + def replicate(self, templatized_self): + if self.count != (1,): + last_dim_obj = templatized_self + for dim in reversed(self.count): + last_dim_obj = [copy.deepcopy(last_dim_obj) for _ in range(0, dim)] + return last_dim_obj + else: + return templatized_self + + def is_field(self): + return False + + def is_singleton(self): + return False + + def singleton_obj(self): + return self + + def contains_reference(self): + return False + + +class csr_composite_object(csr_object): + """ + Base class for composite (non-leaf) CSR objects. All such objects have one + or more children + """ + + def __init__(self, name, count): + csr_object.__init__(self, name, count) + + def children(self): + raise CsrException("Unimplemented abstract method for " + type(self)) + + def check_child_rewrite(self, child, args): + """ + Check to see if the child needs to be rewritten per something in the args, and, if + so, rewrite it and return it. 
We call this a fair amount with the same child (so + work is duplicated); if that is a problem we should memoize. + """ + if self.name not in args.rewrite: + return child + if self.name not in args.rewrite_used: + args.rewrite_used[self.name] = {} + rewrite = args.rewrite[self.name] + if child.name not in rewrite: + return child + args.rewrite_used[self.name][child.name] = True + rewrite = rewrite[child.name] + if rewrite[0] == 'delete': + return None + elif rewrite[0] == 'scan_chain': + description = '' + offset = child.offset + while not isinstance(child, reg): + if hasattr(child, 'description') and child.description: + description = description + child.description + if description[-2:-1] != '\n': + description = description + '\n' + if len(child.children()) != 1 or child.count != (1,): + raise CsrException( + "unknown rewrite '%s' for %s.%s" + % (rewrite[child.name][0], name, child.name) + ) + child = child.children()[0] + if hasattr(child, 'description') and child.description: + description = description + child.description + if description[-2:-1] != '\n': + description = description + '\n' + child = scanset_reg( + rewrite[1], tuple(rewrite[2]), offset, child.width, self, child.fields + ) + child.description = description + if len(child.fields) == 1: + child.fields = copy.copy(child.fields) + child.fields[0].name = rewrite[1] + if len(rewrite) > 3: + # import pdb; pdb.set_trace() + def find_scan_sel(obj, name, offset): + desc = '' + for ch in obj.children(): + pfx = len(ch.name) + 1 + if ch.name + "." 
== name[:pfx]: + return find_scan_sel(ch, name[pfx:], offset + ch.offset) + if ch.name == name: + offset = offset + ch.offset + desc_hdr = ch.name + ':\n' + if hasattr(ch, 'description') and ch.description: + desc = desc + desc_hdr + indent_comment(' ', ch.description) + desc_hdr = '' + if ( + len(ch.fields) == 1 + and hasattr(ch.fields[0], 'description') + and ch.fields[0].description + ): + desc = ( + desc + desc_hdr + indent_comment(' ', ch.fields[0].description) + ) + return offset, desc + return None, None + + offset, desc = find_scan_sel(self, rewrite[3], 0) + if offset is None: + raise CsrException( + "No " + rewrite[3] + " in " + self.name + " for scan selector" + ) + child.sel_offset = offset + child.description = child.description + desc + return child + else: + raise CsrException( + "unknown rewrite '%s' for %s.%s" % (rewrite[el.name][0], name, el.name) + ) + return None + + def contains_reference(self): + """ + return true if this object (directly or indirectly) contains a reference to + a top_level object + """ + if not hasattr(self, 'contains_reference_cache'): + self.contains_reference_cache = False + for a in self.children(): + if a.top_level() or a.contains_reference(): + self.contains_reference_cache = True + break + return self.contains_reference_cache + + def gen_method_declarator(self, outfile, args, rtype, classname, name, argdecls, suffix): + outfile.write("%s " % rtype) + if args.gen_decl == 'defn': + outfile.write("%s::" % classname) + outfile.write("%s(" % name) + first = True + for a in argdecls: + if not first: + outfile.write(", ") + if type(a) is tuple: + outfile.write(a[0]) + if args.gen_decl != 'defn': + outfile.write(" = " + a[1]) + else: + outfile.write(a) + first = False + outfile.write(")") + if suffix != '': + outfile.write(" %s" % suffix) + if args.gen_decl == 'decl': + outfile.write(";\n") + return True + outfile.write(" {\n") + return False + + def gen_emit_method(self, outfile, args, schema, classname, name, nameargs, 
indent): + outfile.write(indent) + argdecls = ["std::ostream &out"] + for idx, argtype in enumerate(nameargs): + argdecls.append("%sna%d" % (argtype, idx)) + if args.gen_decl == 'defn': + argdecls.append("indent_t indent") + else: + argdecls.append("indent_t indent = indent_t(1)") + if self.gen_method_declarator( + outfile, args, "void", classname, "emit_json", argdecls, "const" + ): + return + indent += " " + if args.enable_disable and not self.top_level(): + outfile.write("%sif (disabled_) {\n" % indent) + outfile.write('%s out << "0";\n' % indent) + outfile.write("%s return; }\n" % indent) + outfile.write("%sout << '{' << std::endl;\n" % indent) + first = True + if self.top_level(): + if len(nameargs) > 0: + tmplen = len(name) + len(nameargs) * 10 + 32 + outfile.write("%schar tmp[%d];\n" % (indent, tmplen)) + outfile.write('%ssnprintf(tmp, sizeof(tmp), "%s"' % (indent, name)) + for i in range(0, len(nameargs)): + outfile.write(", na%d" % i) + outfile.write(");\n") + outfile.write('%sout << indent << "\\"_name\\": \\"" << tmp << "\\"";\n' % indent) + else: + outfile.write('%sout << indent << "\\"_name\\": \\"%s\\"";\n' % (indent, name)) + outfile.write('%sout << ", \\n";\n' % indent) + outfile.write( + '%sout << indent << "\\"_reg_version\\": \\"%s\\"";\n' + % (indent, schema["_reg_version"]) + ) + outfile.write('%sout << ", \\n";\n' % indent) + outfile.write( + '%sout << indent << "\\"_schema_hash\\": \\"%s\\"";\n' + % (indent, schema["_schema_hash"]) + ) + outfile.write('%sout << ", \\n";\n' % indent) + outfile.write( + '%sout << indent << "\\"_section\\": \\"%s\\"";\n' % (indent, self.parent) + ) + outfile.write('%sout << ", \\n";\n' % indent) + outfile.write( + '%sout << indent << "\\"_type\\": \\"%s.%s\\"";\n' + % (indent, self.parent, self.name) + ) + outfile.write('%sout << ", \\n";\n' % indent) + outfile.write( + '%sout << indent << "\\"_walle_version\\": \\"%s\\"";\n' + % (indent, schema["_walle_version"]) + ) + first = False + for a in 
sorted(self.children(), key=lambda a: a.name): + a = self.check_child_rewrite(a, args) + if a is None: + continue + if not first: + outfile.write('%sout << ", \\n";\n' % indent) + outfile.write('%sout << indent << "\\"%s\\": ";\n' % (indent, a.name)) + if a.disabled() and not args.expand_disabled_vector: + outfile.write('%sout << "0";\n' % indent) + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if a.count != (1,): + for idx_num, idx in enumerate(a.count): + if args.enable_disable and args.checked_array and not a.disabled(): + outfile.write("%sif (%s" % (indent, field_name)) + for i in range(0, idx_num): + outfile.write("[i%d]" % i) + outfile.write(".disabled()) {\n") + outfile.write('%s out << "0";\n' % indent) + outfile.write("%s} else {\n" % indent) + indent += ' ' + outfile.write('%sout << "[\\n" << ++indent;\n' % indent) + outfile.write( + '%sfor (int i%d = 0; i%d < %d; i%d++) { \n' + % (indent, idx_num, idx_num, idx, idx_num) + ) + outfile.write('%s if (i%d) out << ", \\n" << indent;\n' % (indent, idx_num)) + indent += ' ' + single = a.singleton_obj() + if single != a: + outfile.write( + '%sout << "{\\n" << indent+1 << "\\"%s\\": " << %s' + % (indent, a.name, field_name) + ) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(" << '\\n';\n") + outfile.write("%sout << indent << '}';\n" % indent) + elif a.is_field() or a.top_level(): + outfile.write("%sout << %s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(";\n") + elif a.disabled(): + outfile.write('%sout << 0;\n' % indent) + else: + outfile.write("%s%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".emit_json(out, indent+1);\n") + if a.count != (1,): + for i in range(0, len(a.count)): + indent = indent[2:] + outfile.write("%s}\n" % indent) + 
outfile.write("%sout << '\\n' << --indent << ']';\n" % indent) + if args.enable_disable and args.checked_array and not a.disabled(): + indent = indent[2:] + outfile.write("%s}\n" % indent) + first = False + outfile.write("%sout << '\\n' << indent-1 << \"}\";\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_uint_conversion(self, outfile, args, classname, indent): + pass + + def gen_emit_binary_method(self, outfile, args, classname, indent): + def child_name(child): + name = child.name + if name in args.cpp_reserved: + name += '_' + return name + + def field_name(child): + name = child_name(child) + if child.count != (1,): + for i in range(0, len(child.count)): + name += "[j%d]" % i + return name + + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "void", + classname, + "emit_binary", + ["std::ostream &out", "uint64_t a"], + "const", + ): + return + indent += " " + if args.enable_disable: + outfile.write("%sif (disabled_) return;\n" % indent) + root_parent = self.parent + while type(root_parent) is not str: + root_parent = root_parent.parent + addr_decl = "auto " + for a in self.children(): + addr_var = "a" + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + if root_parent == "memories": + indirect = True + width_unit = 128 + address_unit = 16 + type_tag = 'D' + elif a.name in args.write_dma: + indirect = True + width_unit = 32 + address_unit = 1 + type_tag = 'B' + else: + indirect = False + width_unit = 32 + address_unit = 1 + type_tag = 'R' + if isinstance(a, scanset_reg): + a.output_binary(outfile, args, indent, address_unit, width_unit) + continue + if args.enable_disable: + outfile.write("%sif (!%s.disabled()) {\n" % (indent, child_name(a))) + indent += ' ' + if indirect and type(a) is reg: + outfile.write( + "%sout << binout::tag('%s') << binout::byte8" % (indent, type_tag) + + "(a + 0x%x) << binout::byte4(%d) << binout::byte4(%d);\n" + % ( + a.offset // 
address_unit, + width_unit, + product(a.count) * a.width // width_unit, + ) + ) + if a.count != (1,): + if args.enable_disable: + outfile.write("%sauto addr = a;\n" % indent) + else: + outfile.write("%s%saddr = a;\n" % (indent, addr_decl)) + addr_decl = "" + addr_var = "addr" + for idx_num, idx in enumerate(a.count): + outfile.write( + '%sfor (int j%d = 0; j%d < %d; j%d++) { \n' + % (indent, idx_num, idx_num, idx, idx_num) + ) + indent += ' ' + single = a.singleton_obj() + if not indirect and single != a: + # FIXME -- should check each element being written singly to see if its + # disabled and not write it if so? The generate_binary code does not + # do that, so we don't emit C++ code to do it either. + # Would it cause problems for register arrays that are actually wideregs + # under the hood? See 3.2.1.1 in the Tofino Switch Architecture doc. + outfile.write("%sif (!%s.disabled()) {\n" % (indent, field_name(a))) + indent += ' ' + if single.msb >= 64: + for w in ( + list(range(single.msb // 32, -1, -1)) + if args.reverse_write + else list(range(0, single.msb // 32 + 1)) + ): + outfile.write( + "%sout << binout::tag('R') << binout::byte4" % indent + + "(%s + 0x%x) << binout::byte4(%s.value.getrange(%d, 32));\n" + % (addr_var, a.offset // address_unit + 4, field_name(a), w * 32) + ) + else: + if not args.reverse_write: + outfile.write( + "%sout << binout::tag('R') << binout::byte4" % indent + + "(%s + 0x%x) << binout::byte4(%s);\n" + % (addr_var, a.offset // address_unit, field_name(a)) + ) + if single.msb >= 32: + outfile.write( + "%sout << binout::tag('R') << binout::byte4" % indent + + "(%s + 0x%x) << binout::byte4(%s >> 32);\n" + % (addr_var, a.offset // address_unit + 4, field_name(a)) + ) + if args.reverse_write: + outfile.write( + "%sout << binout::tag('R') << binout::byte4" % indent + + "(%s + 0x%x) << binout::byte4(%s);\n" + % (addr_var, a.offset // address_unit, field_name(a)) + ) + indent = indent[2:] + outfile.write("%s}\n" % indent) + else: + 
outfile.write(indent) + if a.top_level(): + outfile.write("if (%s)" % field_name(a)) + outfile.write(field_name(a)) + outfile.write("->" if a.top_level() else ".") + outfile.write( + "emit_binary(out, %s + 0x%x);\n" % (addr_var, a.offset // address_unit) + ) + if a.count != (1,): + outfile.write("%saddr += 0x%x;\n" % (indent, a.address_stride() // address_unit)) + for i in range(0, len(a.count)): + indent = indent[2:] + outfile.write("%s}\n" % indent) + if args.enable_disable: + indent = indent[2:] + outfile.write("%s}\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_input_binary_method(self, outfile, args, classname, indent): + def child_name(child): + name = child.name + if name in args.cpp_reserved: + name += '_' + return name + + def field_name(child): + name = child_name(child) + if child.count != (1,): + for i in range(0, len(child.count)): + name += "[i%d]" % i + return name + + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "void", + classname, + "input_binary", + ["uint64_t a", "char t", "uint32_t *d", "size_t l"], + "", + ): + return + indent += " " + root_parent = self.parent + while type(root_parent) is not str: + root_parent = root_parent.parent + if root_parent == "memories": + width_unit = 128 + address_unit = 16 + outfile.write("%sBUG_CHECK(t == 'D', \"'%%c' tag in memories\", t);\n" % indent) + else: + width_unit = 32 + address_unit = 1 + outfile.write( + "%sBUG_CHECK(t != 'D', \"'%%c' tag in %s\", t);\n" % (indent, root_parent) + ) + first = True + for a in sorted(self.children(), key=lambda a: -a.offset): + outfile.write( + '%s%sif (a >= 0x%x) {\n' + % (indent, '' if first else '} else ', a.offset // address_unit) + ) + indent += ' ' + t = a + a = self.check_child_rewrite(a, args) + if a is None: + outfile.write( + '%sstd::cerr << "Address in ignored reg " << ' % indent + + 'string_regname(this, this+1) << ".%s" << std::endl;\n' % t.name + ) + elif isinstance(a, scanset_reg): + 
a.input_binary(outfile, args, indent, address_unit, width_unit) + elif a.disabled(): + outfile.write( + '%sstd::cerr << "Address in disabled reg " << ' % indent + + 'string_regname(this, this+1) << ".%s" << std::endl;\n' % a.name + ) + else: + outfile.write('%sa -= 0x%x;\n' % (indent, a.offset // address_unit)) + idx_suffix = '' + if a.count != (1,): + outfile.write( + '%ssize_t idx = a / 0x%x;\n' % (indent, a.address_stride() // address_unit) + ) + for idx_num, idx in reversed(list(enumerate(a.count))): + outfile.write('%sint i%d = idx %% %d;\n' % (indent, idx_num, idx)) + if idx_num == 0: + outfile.write( + '%sBUG_CHECK(idx < %d, "Index too' % (indent, idx) + + ' large for %%s.%s[%%zd]",\n' % a.name + ) + outfile.write( + '%s ' % indent + + 'string_regname(this, this+1).c_str(), idx);\n' + ) + else: + outfile.write('%sidx /= %d;\n' % (indent, idx)) + idx_suffix = ('[i%d]' % idx_num) + idx_suffix + outfile.write( + '%sa -= 0x%x * %s' + % (indent, a.address_stride() // address_unit, '(' * (len(a.count) - 1)) + ) + for idx_num, idx in enumerate(a.count): + if idx_num != 0: + outfile.write('*%d + ' % idx) + outfile.write('i%d' % idx_num) + if idx_num != 0: + outfile.write(')') + outfile.write(';\n') + # outfile.write('%sstd::cout << string_regname(this, this+1) << ".%s' % + # (indent, a.name)) + # if a.count != (1,): + # for idx_num, idx in enumerate(a.count): + # outfile.write('[" << i%d << "]' % idx_num) + # outfile.write('" << std::endl;\n'); + access = '.' 
+ if a.top_level(): + outfile.write('%sif (!%s) {\n' % (indent, field_name(a))) + outfile.write( + '%s auto *n = new %s;\n' % (indent, a.canon_name(a.map.object_name)[0]) + ) + outfile.write('%s auto fn = string_regname(this, this+1);\n' % indent) + outfile.write('%s declare_registers(n, sizeof(*n),\n' % indent) + outfile.write( + '%s [=](std::ostream &out, const char *addr, ' % indent + + 'const void *end) {\n' + ) + outfile.write('%s out << fn << ".%s' % (indent, child_name(a))) + if a.count != (1,): + for idx_num, idx in enumerate(a.count): + outfile.write('[" << i%d << "]' % idx_num) + outfile.write('";\n') + outfile.write('%s n->emit_fieldname(out, addr, end); });\n' % indent) + outfile.write( + '%s %s.set("%s", n); }\n' % (indent, field_name(a), child_name(a)) + ) + access = '->' + single = a.singleton_obj() + if single != a: + outfile.write( + "%sBUG_CHECK(t == 'R' && l == 1, \"tag '%%c' " % indent + + 'input to singleton %s", t);\n' % field_name(a) + ) + if single.msb >= 64: + outfile.write( + '%sBUG("widereg singleton %s not implemented");' + % (indent, field_name(a)) + ) + elif single.msb >= 32: + outfile.write( + '%sBUG_CHECK((a|4) == 4, "invalid addr %%zd in ' % indent + + '%s", a);\n' % field_name(a) + ) + outfile.write('%s%s.set_subfield(*d, a*8, 32);\n' % (indent, field_name(a))) + else: + outfile.write('%s%s = *d;\n' % (indent, field_name(a))) + elif isinstance(a, reg) and a.count != (1,): + outfile.write( + '%sBUG_CHECK(a == 0 || l == 1, "%%" PRIu64 " off ' % indent + + 'start of %s", a);\n' % a.name + ) + if a.width % 32 != 0: + raise CsrException("Register %s width not a multiple of 32" % a.name) + size = a.width // 32 + outfile.write('%swhile (l > %d) {\n' % (indent, size)) + indent += ' ' + outfile.write( + '%s%s%sinput_binary(a, t, d, %d);\n' % (indent, field_name(a), access, size) + ) + outfile.write('%sd += %d; l -= %d;\n' % (indent, size, size)) + for idx_num, idx in reversed(list(enumerate(a.count))): + outfile.write('%sif (++i%d >= %d) 
{\n' % (indent, idx_num, idx)) + indent += ' ' + if idx_num != 0: + outfile.write('%si%d = 0;\n' % (indent, idx_num)) + outfile.write( + '%sBUG("Too much data for %s");%s\n' + % (indent, a.name, ' }' * (len(a.count) + 1)) + ) + indent = indent[2 * (len(a.count) + 1) :] + outfile.write( + '%s%s%sinput_binary(a, t, d, l);\n' % (indent, field_name(a), access) + ) + else: + outfile.write( + '%s%s%sinput_binary(a, t, d, l);\n' % (indent, field_name(a), access) + ) + indent = indent[2:] + first = False + outfile.write('%s}\n' % indent) + + indent = indent[2:] + outfile.write('%s}\n' % indent) + + def gen_binary_offset_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "uint64_t", + classname, + "binary_offset", + ["const void *addr", ("int *bit_offset", "0")], + "const", + ): + return + root_parent = self.parent + while type(root_parent) is not str: + root_parent = root_parent.parent + if root_parent == "memories": + width_unit = 128 + address_unit = 16 + else: + width_unit = 32 + address_unit = 1 + indent += " " + outfile.write("%suint64_t offset = 0;\n" % indent) + outfile.write("%sif (bit_offset) *bit_offset = 0;\n" % indent) + outfile.write("%sif (addr < this || addr >= this+1) " % indent) + if self.contains_reference(): + outfile.write("{\n") + indent += " " + for a in self.children(): + if a.disabled(): + continue + if not (a.top_level() or a.contains_reference()): + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if a.count != (1,): + for i, idx in enumerate(a.count): + outfile.write( + '%sfor (int i%d = 0; i%d < %d; i%d++) { \n' % (indent, i, i, idx, i) + ) + indent += ' ' + field_name += "[i%d]" % i + outfile.write( + "%sif ((offset = %s%sbinary_offset(addr, bit_offset)) != -1)\n" + % (indent, field_name, "->" if a.top_level() else ".") + ) + outfile.write("%s return offset + 0x%x" % (indent, a.offset // address_unit)) + if a.count != (1,): + for 
i, idx in enumerate(a.count): + stride = a.address_stride() // address_unit + for cnt in a.count[i + 1 :]: + stride = stride * cnt + outfile.write(" + i%d*0x%x" % (i, stride)) + outfile.write(";\n") + if a.count != (1,): + for i, idx in enumerate(a.count): + indent = indent[2:] + outfile.write("%s}\n" % indent) + + indent = indent[2:] + outfile.write("%s}\n" % indent) + else: + outfile.write("return -1;\n") + + first = True + for a in sorted(self.children(), key=lambda a: a.name, reverse=True): + if a.disabled(): + continue + if a.top_level(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + outfile.write(indent) + if first: + first = False + else: + outfile.write("} else ") + outfile.write("if (addr >= &%s) {\n" % field_name) + indent += " " + outfile.write("%soffset = 0x%x;\n" % (indent, a.offset // address_unit)) + if a.count != (1,): + for i, idx in enumerate(a.count): + outfile.write("%sif (addr < &%s[0]) return offset;\n" % (indent, field_name)) + outfile.write( + "%sauto i%d = ((char *)addr - (char *)&%s[0])/sizeof(%s[0]);\n" + % (indent, i, field_name, field_name) + ) + stride = a.address_stride() // address_unit + for cnt in a.count[i + 1 :]: + stride = stride * cnt + outfile.write("%soffset += i%d * 0x%x;\n" % (indent, i, stride)) + field_name += "[i%d]" % i + single = a.singleton_obj() + if not single.is_field() and not single.top_level(): + outfile.write( + "%soffset += %s.binary_offset(addr, bit_offset);\n" % (indent, field_name) + ) + indent = indent[2:] + if first: + outfile.write("%sreturn -1;\n" % indent) + else: + outfile.write("%s} else {\n" % indent) + outfile.write("%s return -1;\n" % indent) + outfile.write("%s}\n" % indent) + outfile.write("%sreturn offset;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_fieldname_method(self, outfile, args, classname, indent): + outfile.write(indent) + if 
self.gen_method_declarator( + outfile, + args, + "void", + classname, + "emit_fieldname", + ["std::ostream &out", "const char *addr", "const void *end"], + "const", + ): + return + indent += " " + if not self.is_singleton(): + outfile.write("%sif ((void *)addr == this && end == this+1) return;\n" % indent) + first = True + for a in sorted(self.children(), key=lambda a: a.name, reverse=True): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + outfile.write(indent) + if first: + first = False + else: + outfile.write("} else ") + outfile.write("if (addr >= (char *)&%s) {\n" % field_name) + indent += " " + outfile.write('%sout << ".%s";\n' % (indent, a.name)) + if a.count != (1,): + for i, idx in enumerate(a.count): + outfile.write( + "%sint i%d = (addr - (char *)&%s[0])/(int)sizeof(%s[0]);\n" + % (indent, i, field_name, field_name) + ) + if idx > 1: + outfile.write( + "%sif (i%d < 0 || (i%d == 0 && 1 + &%s" % (indent, i, i, field_name) + ) + outfile.write(" == end)) return;\n") + outfile.write("%sout << '[' << i%d << ']';\n" % (indent, i)) + field_name += "[i%d]" % i + single = a.singleton_obj() + if not single.is_field() and not single.top_level(): + outfile.write("%s%s.emit_fieldname(out, addr, end);\n" % (indent, field_name)) + indent = indent[2:] + if not first: + outfile.write("%s}\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_unpack_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, args, "int", classname, "unpack_json", ["json::obj *obj"], "" + ): + return + indent += " " + outfile.write("%sint rv = 0;\n" % indent) + outfile.write("%sjson::map *m = dynamic_cast(obj);\n" % indent) + outfile.write("%sif (!m) return -1;\n" % indent) + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = 
self.check_child_rewrite(a, args) + if a is None: + continue + index_num = 0 + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sif (json::vector *v%s = dynamic_cast(" + % (indent, index_num) + ) + indent += " " + if index_num > 0: + outfile.write("(*v%d)[i%d].get()" % (index_num - 1, index_num - 1)) + else: + outfile.write('(*m)["%s"].get()' % a.name) + outfile.write("))\n") + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + index_num = len(a.count) + single = a.singleton_obj() + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if single != a: + outfile.write("%sif (json::map *s = dynamic_cast(" % indent) + indent += " " + if index_num > 0: + outfile.write("(*v%d)[i%d].get()" % (index_num - 1, index_num - 1)) + else: + outfile.write('(*m)["%s"].get()' % a.name) + outfile.write("))\n") + outfile.write("%sif (json::number *n = dynamic_cast" % indent) + indent += " " + outfile.write('((*s)["%s"].get()))\n' % a.name) + outfile.write("%s%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(" = n->val;\n") + indent = indent[2:] + outfile.write("%selse rv = -1;\n" % indent) + indent = indent[2:] + outfile.write("%selse rv = -1;\n" % indent) + elif not a.is_field() and not a.top_level(): + outfile.write("%srv |= %s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".unpack_json(") + if index_num > 0: + outfile.write("(*v%d)[i%d].get()" % (index_num - 1, index_num - 1)) + else: + outfile.write('(*m)["%s"].get()' % a.name) + outfile.write(");\n") + else: + jtype = "json::number" + access = " = n->val" + if a.top_level(): + jtype = "json::string" + access = ".set(n->c_str(), nullptr)" + outfile.write("%sif (%s *n = dynamic_cast<%s *>(" % (indent, jtype, jtype)) + indent += " " + 
if index_num > 0: + outfile.write("(*v%d)[i%d].get()" % (index_num - 1, index_num - 1)) + else: + outfile.write('(*m)["%s"].get()' % a.name) + outfile.write(")) {\n") + outfile.write("%s%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write("%s;\n" % access) + if a.top_level(): + outfile.write( + "%s} else if (json::number *n = dynamic_cast(" % indent[2:] + ) + if index_num > 0: + outfile.write("(*v%d)[i%d].get()" % (index_num - 1, index_num - 1)) + else: + outfile.write('(*m)["%s"].get()' % a.name) + outfile.write(")) {\n") + outfile.write("%sif (n->val) rv = -1;\n" % indent) + indent = indent[2:] + outfile.write("%s} else rv = -1;\n" % indent) + for i in range(0, index_num): + indent = indent[4:] + outfile.write("%selse rv = -1;\n" % indent) + outfile.write("%sreturn rv;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_dump_unread_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "void", + classname, + "dump_unread", + ["std::ostream &out", "prefix *pfx"], + "const", + ): + return + indent += " " + need_lpfx = True + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not a.singleton_obj().is_field() and not a.top_level(): + if need_lpfx: + outfile.write("%sprefix lpfx(pfx, 0);\n" % indent) + need_lpfx = False + outfile.write('%slpfx.str = "%s' % (indent, a.name)) + if a.count != (1,): + for idx in a.count: + outfile.write("[%d]" % idx) + outfile.write('";\n') + outfile.write("%s%s" % (indent, field_name)) + if a.count != (1,): + for idx in a.count: + outfile.write("[0]") + outfile.write(".dump_unread(out, &lpfx);\n") + else: + outfile.write("%sif (!%s" % (indent, field_name)) + if a.count != (1,): + 
for idx in a.count: + outfile.write("[0]") + outfile.write('.read) out << pfx << ".%s' % a.name) + if a.count != (1,): + for idx in a.count: + outfile.write("[%d]" % idx) + outfile.write('" << std::endl;\n') + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_modified_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator(outfile, args, "bool", classname, "modified", [], "const"): + return + indent += " " + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%sif (%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".modified()) return true;\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write("%sif (%s.modified()) return true;\n" % (indent, field_name)) + outfile.write("%sreturn false;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_set_modified_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, args, "void", classname, "set_modified", [("bool v", "true")], "" + ): + return + indent += " " + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % 
(indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%s%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".set_modified(v);\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write("%s%s.set_modified(v);\n" % (indent, field_name)) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_disable_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator(outfile, args, "bool", classname, "disable", [], ""): + return + indent += " " + outfile.write("%sbool rv = true;\n" % indent) + outfile.write("%sif (modified()) {\n" % indent) + outfile.write('%s std::clog << "ERROR: Disabling modified record ";\n' % indent) + outfile.write("%s print_regname(std::clog, this, this+1);\n" % indent) + outfile.write("%s std::clog << std::endl; \n" % indent) + outfile.write("%s return false; }\n" % indent) + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%sif (%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".disable()) rv = true;\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write("%sif (%s.disable()) rv = true;\n" % (indent, field_name)) + outfile.write("%sif (rv) disabled_ = true;\n" % indent) + outfile.write("%sreturn rv;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_disable_if_reset_value_method(self, outfile, args, 
classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, args, "bool", classname, "disable_if_reset_value", [], "" + ): + return + indent += " " + outfile.write("%sbool rv = true;\n" % indent) + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%sif (!%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".disable_if_reset_value()) rv = false;\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write( + "%sif (!%s.disable_if_reset_value()) rv = false;\n" % (indent, field_name) + ) + outfile.write("%sif (rv) disabled_ = true;\n" % indent) + outfile.write("%sreturn rv;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_disable_if_unmodified_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, args, "bool", classname, "disable_if_unmodified", [], "" + ): + return + indent += " " + outfile.write("%sbool rv = true;\n" % indent) + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%sif (!%s" % (indent, 
field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".disable_if_unmodified()) rv = false;\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write( + "%sif (!%s.disable_if_unmodified()) rv = false;\n" % (indent, field_name) + ) + outfile.write("%sif (rv) disabled_ = true;\n" % indent) + outfile.write("%sreturn rv;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_disable_if_zero_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator(outfile, args, "bool", classname, "disable_if_zero", [], ""): + return + indent += " " + outfile.write("%sbool rv = true;\n" % indent) + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%sif (!%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".disable_if_zero()) rv = false;\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write("%sif (!%s.disable_if_zero()) rv = false;\n" % (indent, field_name)) + outfile.write("%sif (rv && modified()) {\n" % indent) + outfile.write('%s std::clog << "Disabling modified zero record ";\n' % indent) + outfile.write("%s print_regname(std::clog, this, this+1);\n" % indent) + outfile.write("%s std::clog << std::endl;\n" % indent) + outfile.write("%s rv = false; }\n" % indent) + outfile.write("%sif (rv) disabled_ = true;\n" % indent) + outfile.write("%sreturn rv;\n" % indent) + indent = indent[2:] + 
outfile.write("%s}\n" % indent) + + def gen_enable_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator(outfile, args, "void", classname, "enable", [], ""): + return + indent += " " + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + a = self.check_child_rewrite(a, args) + if a is None: + continue + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if not args.checked_array: + if a.count != (1,): + for index_num, idx in enumerate(a.count): + outfile.write( + "%sfor (int i%d = 0; i%d < %d; i%d++)\n" + % (indent, index_num, index_num, idx, index_num) + ) + indent += " " + outfile.write("%s%s" % (indent, field_name)) + if a.count != (1,): + for i in range(0, len(a.count)): + outfile.write("[i%d]" % i) + outfile.write(".enable();\n") + if a.count != (1,): + indent = indent[2 * len(a.count) :] + else: + outfile.write("%s%s.enable();\n" % (indent, field_name)) + outfile.write("%sdisabled_ = false;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def find_alias_arrays(self, args, classname): + self.alias_arrays = [] + potential_alias_arrays = {} + array_match = re.compile('^(\w+)_(\d+)$') + for el in self.children(): + el = self.check_child_rewrite(el, args) + if el is None: + continue + m = array_match.match(el.name) + if m: + base = m.group(1) + idx = int(m.group(2)) + typ = el.type_name(args, classname, "_" + el.name) + if base in potential_alias_arrays: + pot = potential_alias_arrays[base] + if typ != pot['type']: + pot['ok'] = False + if idx > pot['max']: + pot['max'] = idx + pot['mask'] |= 2**idx + else: + potential_alias_arrays[m.group(1)] = { + "ok": True, + "max": idx, + "mask": 2**idx, + "type": typ, + } + for base, pot in potential_alias_arrays.items(): + if pot['ok'] and pot['max'] > 0 and pot['mask'] == 2 ** (pot['max'] + 1) - 1: + self.alias_arrays.append((base, pot['type'], pot['max'] + 1)) + + def need_ctor(self): + 
if self.alias_arrays: + return True + for el in self.children(): + el = self.check_child_rewrite(el, args) + if el is None: + continue + s = el.singleton_obj() + if s.is_field() and s.default and s.default != 0: + if type(s.default) is tuple: + for v in s.default: + if v != 0: + return True + else: + return True + return False + + def gen_ctor(self, outfile, args, namestr, indent): + outfile.write("%s%s() : " % (indent, namestr)) + first = True + if args.enable_disable: + outfile.write("disabled_(false)") + first = False + for el in sorted(self.children(), key=lambda a: a.name): + if el.disabled(): + continue + el = self.check_child_rewrite(el, args) + if el is None: + continue + s = el.singleton_obj() + if s.is_field() and s.default and s.default != 0: + if type(s.default) is tuple: + ok = True + for v in s.default: + if v != 0: + ok = False + break + if ok: + continue + if first: + first = False + else: + outfile.write(", ") + outfile.write(el.name) + if el.name in args.cpp_reserved: + outfile.write('_') + if type(s.default) is tuple and len(s.default) > 1: + outfile.write("({ ") + for v in s.default: + outfile.write(str(v) + ", ") + outfile.write("})") + else: + outfile.write('(%d)' % s.default) + if hasattr(self, 'alias_arrays'): + for alias in self.alias_arrays: + outfile.write(",\n%s %s({" % (indent, alias[0])) + for idx in range(0, alias[2]): + if idx > 0: + outfile.write(",") + outfile.write(" &this->%s_%d" % (alias[0], idx)) + outfile.write(" })") + outfile.write(' {}\n') + + def canon_name(self, name): + namestr = '' + nameargs = [] + format = False + islong = False + for ch in name: + if format: + if ch in string.digits: + continue + elif ch == 'l': + islong = True + continue + elif ch == 'd' or ch == 'i' or ch == 'u' or ch == 'x': + nameargs.append('long ' if islong else 'int ') + elif ch == 'e' or ch == 'f' or ch == 'g': + nameargs.append('double ' if islong else 'float ') + elif ch == 's': + nameargs.append('const char *') + else: + raise 
CsrException("unknown conversion '%%%s' in name\n" % ch) + format = False + elif ch in string.ascii_letters or ch in string.digits or ch == '_': + namestr += ch + elif ch == '.': + namestr += '_' + elif ch == '%': + format = True + islong = False + else: + raise CsrException("invalid character '%s' in name\n" % ch) + return namestr, nameargs + + def type_name(self, args, parent, name): + namestr, nameargs = self.canon_name(name) + # FIXME -- should be checking for global names in args.global? + classname = parent + if classname != '': + classname += '::' + classname += namestr + rv = 'struct ' + classname + if self.count != (1,): + if args.checked_array: + for idx in self.count: + rv = "checked_array<%d, %s>" % (idx, rv) + else: + for idx in self.count: + rv = "%s[%d]" % (rv, idx) + return rv + + def gen_type(self, outfile, args, schema, parent, name, indent): + namestr, nameargs = self.canon_name(name) + classname = parent + if classname != '': + classname += '::' + classname += namestr + if args.alias_array and not hasattr(self, 'alias_arrays'): + self.find_alias_arrays(args, classname) + if args.gen_decl != 'defn': + indent += " " + outfile.write("struct %s {\n" % namestr) + if args.enable_disable: + outfile.write("%sbool disabled_;\n" % indent) + outfile.write("%sbool disabled() const { return disabled_; }\n" % indent) + if args.enable_disable or self.need_ctor(): + self.gen_ctor(outfile, args, namestr, indent) + if self.top_level(): + outfile.write( + '%sstatic constexpr const char *_reg_version = "%s";\n' + % (indent, schema['_reg_version']) + ) + outfile.write( + '%sstatic constexpr const char *_schema_hash = "%s";\n' + % (indent, schema['_schema_hash']) + ) + for el in sorted(self.children(), key=lambda a: a.name): + if el.disabled(): + continue + el = self.check_child_rewrite(el, args) + if el is None: + continue + typ = el.singleton_obj() + notclass = typ.is_field() or typ.top_level() + isglobal = el.name in args.global_types + if args.gen_decl != 'defn': 
+ if hasattr(el, 'description') and el.description: + format_comment(outfile, indent, el.description) + if typ != el and hasattr(typ, 'description') and typ.description: + format_comment(outfile, indent, typ.description) + outfile.write(indent) + if args.checked_array and notclass and el.count != (1,): + for idx in el.count: + outfile.write("checked_array<%d, " % idx) + eltypenamestr = el.name + if isglobal: + eltypenamestr = "::" + eltypenamestr + else: + eltypenamestr = "_" + eltypenamestr + if el.name == self.name: + # FIXME -- maybe should elide the element if it is the only one? + # sort of like singleton_obj but deal with arrays too + eltypenamestr = eltypenamestr + "_el" + typ.gen_type(outfile, args, schema, classname, eltypenamestr, indent) + if args.gen_decl != 'defn': + field_name = el.name + if field_name in args.cpp_reserved: + field_name += '_' + if args.checked_array and el.count != (1,): + if not notclass: + if not isglobal: + outfile.write(";\n%s" % indent) + for idx in el.count: + outfile.write("checked_array<%d, " % idx) + outfile.write(eltypenamestr) + if el.count != (1,): + for idx in el.count: + outfile.write(">") + outfile.write(" %s;\n" % field_name) + else: + outfile.write(" %s" % field_name) + if el.count != (1,): + for idx in el.count: + outfile.write("[%d]" % idx) + outfile.write(";\n") + if args.gen_decl != 'defn' and hasattr(self, 'alias_arrays'): + for alias in self.alias_arrays: + outfile.write( + "%salias_array<%d, %s> %s;\n" % (indent, alias[2], alias[1], alias[0]) + ) + if args.delete_copy and args.gen_decl != 'defn': + if not args.enable_disable and not self.need_ctor(): + outfile.write("%s%s() = default;\n" % (indent, namestr)) + outfile.write("%s%s(const %s &) = delete;\n" % (indent, namestr, namestr)) + outfile.write("%s%s(%s &&) = delete;\n" % (indent, namestr, namestr)) + if args.emit_json: + self.gen_emit_method(outfile, args, schema, classname, name, nameargs, indent) + if args.emit_binary: + 
self.gen_uint_conversion(outfile, args, classname, indent) + self.gen_emit_binary_method(outfile, args, classname, indent) + if args.input_binary: + self.gen_input_binary_method(outfile, args, classname, indent) + if args.binary_offset: + self.gen_binary_offset_method(outfile, args, classname, indent) + if args.emit_fieldname: + self.gen_fieldname_method(outfile, args, classname, indent) + if args.unpack_json: + self.gen_unpack_method(outfile, args, classname, indent) + if args.dump_unread: + self.gen_dump_unread_method(outfile, args, classname, indent) + if args.enable_disable: + self.gen_modified_method(outfile, args, classname, indent) + self.gen_set_modified_method(outfile, args, classname, indent) + self.gen_disable_method(outfile, args, classname, indent) + self.gen_disable_if_reset_value_method(outfile, args, classname, indent) + self.gen_disable_if_unmodified_method(outfile, args, classname, indent) + self.gen_disable_if_zero_method(outfile, args, classname, indent) + self.gen_enable_method(outfile, args, classname, indent) + if args.gen_decl != 'defn': + indent = indent[2:] + outfile.write("%s}" % indent) + + def gen_global_types(self, outfile, args, schema): + for a in sorted(self.children(), key=lambda a: a.name): + if a.disabled(): + continue + if not a.is_field() and not a.top_level(): + a.gen_global_types(outfile, args, schema) + if a.name in args.global_types: + if a.name in args.global_types_generated: + if args.global_types_generated[a.name] != a: + raise CsrException("Inconsistent definition of type " + a.name) + else: + args.global_types_generated[a.name] = a + a.gen_type(outfile, args, schema, "", a.name, "") + outfile.write(";\n") + + +class address_map(csr_composite_object): + """ + A Semifore addressmap. Contains registers and instances of other + addressmaps. + + In practice, the count of an address_map is always (1,) and it is the + instances of the addressmap that are actually arrays. 
+ + @attr templatization_behavior + Controls how this address map gets used during template generation: + - If None, it is expanded as a dictionary wherever it appears in the + register hierarchy + - If "top_level", it is split off into its own JSON file and + replaced wherever it appears in the register hierarchy with a + string reference to that JSON file + - If "disabled", it is replaced wherever it appears in the register + hierarchy with a 0 (indicating "don't write"). No JSON file for + the address map is generated + @attr objs + An ordered list of objects contained in the addressmap - either regs, + groups, or address_map_instances + + @attr parent + A string indicating which parent of the chip hierarchy the addressmap + falls under ("memories" or "regs) + """ + + def __init__(self, name, count, parent): + csr_composite_object.__init__(self, name, count) + + self.templatization_behavior = None + self.objs = [] + self.parent = parent + + def min_width(self, round_to_power_of_2=True): + """ + Some addressmap arrays have an explicit "stride" specifying how much + address space each element takes up. When they don't, we calculate the + stride by summing up the widths of all contained objects and rounding + to the next highest power of 2. + + Whether an addressmap has a stride or not is up to the programmer of + the original Semifore CSR and, as far as Walle is conserned, arbitrary. 
+ """ + width = 0 + for obj in self.objs: + obj_end = obj.offset * 8 + if type(obj) is reg: + obj_end += obj.width * product(obj.count) + elif type(obj) is address_map_instance or type(obj) is group: + try: + multiplier = product(obj.count) * obj.stride + except: + multiplier = 1 + obj_end += obj.min_width() * multiplier + else: + raise CsrException("Unrecognized object in address map ('" + type(obj) + "')") + if obj_end > width: + width = obj_end + + width //= 8 + + if round_to_power_of_2: + # Round stride up to the next largest power of 2 + round_width = 1 + while round_width < width: + round_width *= 2 + return round_width + else: + return width + + def generate_binary(self, data, cache, path): + if data == 0: + # No-op + return {} + elif isinstance(data, basestring): + # Refernce to template + + type_name = self.parent + "." + self.name + + if data not in cache.templates: + raise CsrException("Could not find template with name '" + data + "'") + + if cache.get_type(data) != type_name: + raise CsrException( + "Expected type of instantiated object '" + + data + + "' to be '" + + type_name + + "', found '" + + cache.get_type(data) + + "'" + ) + + return cache.get_data(data, path) + elif type(data) is dict: + # Actual data + reg_values = [] + for obj in self.objs: + if obj.name not in data: + raise CsrException("Could not find key '" + obj.name + "'") + if data[obj.name] != 0 and not isinstance(data[obj.name], basestring): + if obj.count == (1,): + if type(data[obj.name]) is not dict: + raise CsrException("Expected dictionary at key '" + obj.name + "'") + else: + # TODO: check all dimensions are the right size, maybe if a 'strict errors' flag is used + if type(data[obj.name]) is not list or len(data[obj.name]) != obj.count[0]: + array_size = "x".join(map(str, obj.count)) + raise CsrException( + "Expected " + + array_size + + " element array of dictionaries at key '" + + obj.name + + "'" + ) + + if type(obj) is reg: + if obj.count == (1,): + chip_obj = 
obj.generate_binary(data[obj.name], cache, path) + reset_value = obj.get_reset_value() + + if chip_obj is not None and reset_value is not None: + # Check if a non-zero value to put into the binary file is the same as the reset (initial) value. + # (We will continue to write zero values, for caution's sake. + # Would block writes work if leave out subsection?) + # If the non-zero value is the reset value, do not output it in the binary file. + # We are having too many problems where the driver is clearing things (like interrupt enables) + # before the binary file is loaded, and then the binary file re-enables them. + if reset_value == chip_obj.orig_value: + # print "Skipping setting %s, because it has the same value (%s) as its reset value of %s." % (obj.name, str(chip_obj), hex(reset_value)) + continue + + if chip_obj != None: + reg_values.append(chip_obj) + elif data[obj.name] != 0: + # TODO: we should be able to DMA anything into the chip, + # so this count > 4 heuristic should work well + # + # but right now the model doesn't implement chip- + # side register addresses, so we have to force + # direct register writes for the regs part of + # the schema and DMA for the mem part. lame. + # use this count heuristic once the model is fixed. + # + # if product(obj.count) > 4: + root_parent = obj.parent + while type(root_parent) is not str: + root_parent = root_parent.parent + # Force the mapram_config register programming + # on the DMA block write path to avoid a race during + # chip init where the map ram is being written and the + # ECC mode is also being configured. Since the map ram + # is written with block writes, forcing this register + # configuration on the same path removes the race. 
+ + registers_to_write_with_dma = [ + "mapram_config", + "imem_dark_subword16", + "imem_dark_subword32", + "imem_dark_subword8", + "imem_mocha_subword16", + "imem_mocha_subword32", + "imem_mocha_subword8", + "imem_subword16", + "imem_subword32", + "imem_subword8", + "galois_field_matrix", + ] + if product(obj.count) > 4 and ( + root_parent == "memories" or obj.name in registers_to_write_with_dma + ): + mem = chip.dma_block( + obj.offset, + obj.width, + src_key=obj.name, + is_reg=root_parent == "regs", + ) + + def mem_loop(sub_data, context): + path[-1].path.append(context) + for idx in range(0, obj.count[-1]): + context[-1] = idx + obj.generate_binary(sub_data[idx], cache, path, mem) + path[-1].path.pop() + + nd_array_loop(obj.count, data[obj.name], mem_loop) + reg_values.append(mem) + else: + offset = [0] + width = obj.width // 8 # TODO: warn if not power of (8 or 32 or w/e)? + + def reg_loop(sub_data, context): + path[-1].path.append(context) + for idx in range(0, obj.count[-1]): + context[-1] = idx + chip_obj = obj.generate_binary(sub_data[idx], cache, path) + if chip_obj != None: + chip_obj.add_offset(offset[0]) + reg_values.append(chip_obj) + offset[0] += width + path[-1].path.pop() + + nd_array_loop(obj.count, data[obj.name], reg_loop) + + elif type(obj) is address_map_instance or type(obj) is group: + if obj.count == (1,): + sub_chip_objs = obj.generate_binary(data[obj.name], cache, path) + for sub_chip_obj in sub_chip_objs: + sub_chip_obj.add_offset(obj.offset) + reg_values.append(sub_chip_obj) + elif data[obj.name] != 0: + offset = [0] + + def addr_map_loop(sub_data, context): + path[-1].path.append(context) + for idx in range(0, obj.count[-1]): + context[-1] = idx + sub_chip_objs = obj.generate_binary(sub_data[idx], cache, path) + for sub_chip_obj in sub_chip_objs: + sub_chip_obj.add_offset(obj.offset + offset[0]) + reg_values.append(sub_chip_obj) + offset[0] += obj.stride + path[-1].path.pop() + + nd_array_loop(obj.count, data[obj.name], addr_map_loop) + 
else: + raise CsrException("Unrecognized object in address map ('" + type(obj) + "')") + + return reg_values + else: + raise CsrException( + "Expected dictionary at addressmap node '%s' but found value of type %s" + % (self.name, type(data).__name__) + ) + + def generate_template(self, inject_size): + self_dict = {} + if self.templatization_behavior == "disabled": + return None + if self.templatization_behavior == "top_level": + self_dict["_type"] = self.parent + "." + self.name + self_dict["_name"] = "template(" + self_dict["_type"] + ")" + for obj in self.objs: + self_dict[obj.name] = obj.generate_template(inject_size) + return self.replicate(self_dict) + + def children(self): + return self.objs + + def is_singleton(self): + return len(self.objs) == 1 and self.objs[0].count == (1,) + + def disabled(self): + return self.templatization_behavior == "disabled" + + def top_level(self): + return self.templatization_behavior == "top_level" + + def generate_cpp(self, outfile, args, schema): + try: + name = args.name + except AttributeError: + name = self.name + self.gen_type(outfile, args, schema, '', name, '') + all_used = True + for obj in args.rewrite: + if obj not in args.rewrite_used: + sys.stderr.write("Rewrite object %s not found\n" % obj) + all_used = False + else: + for child in args.rewrite[obj]: + if child not in args.rewrite_used[obj]: + sys.stderr.write("Rewrite child %s.%s not found\n" % (obj, child)) + all_used = False + if not all_used: + raise CsrException("Unused rewrite clauses in templates") + + def print_as_text(self, indent): + if self.templatization_behavior != "disabled": + print("%saddress_map %s%s:" % (indent, self.name, str(self.count))) + for ch in self.objs: + ch.print_as_text(indent + " ") + + +class address_map_instance(csr_composite_object): + """ + TODO: docstring + @attr offset + offset from the start of the containing address_map (instance) + @attr map + address_map object that is an instance of + @attr stride + If @count is not (1,), 
this is the offset from each instance in the array to + the next. If @count is (1,) this should be null + """ + + def __init__(self, name, count, offset, addrmap, stride): + csr_composite_object.__init__(self, name, count) + + self.offset = offset + self.map = addrmap + self.stride = stride + + def min_width(self): + return self.map.min_width() + + def generate_binary(self, data, cache, path): + path[-1].path.append(self.name) + binary = self.map.generate_binary(data, cache, path) + path[-1].path.pop() + return binary + + def generate_template(self, inject_size): + if self.map.templatization_behavior == "top_level": + return self.replicate(self.map.name + "_object") + elif self.map.templatization_behavior == "disabled": + return self.replicate(0) + else: + return self.replicate(self.map.generate_template(inject_size)) + + def children(self): + return self.map.objs + + def is_singleton(self): + return len(self.map.objs) == 1 and self.map.objs[0].count == (1,) + + def disabled(self): + return self.map.templatization_behavior == "disabled" + + def top_level(self): + return self.map.templatization_behavior == "top_level" + + def address_stride(self): + return self.stride + + def type_name(self, args, parent, name): + self.map.type_name(args, parent, name) + + def gen_type(self, outfile, args, schema, parent, name, indent): + if self.map.templatization_behavior == "disabled": + raise CsrException("disabled address_map hit in gen_type") + elif self.map.templatization_behavior == "top_level": + if args.gen_decl != 'defn': + tname = self.map.object_name + if tname is None: + tname = self.map.parent + '.' 
+ self.map.name + outfile.write("register_reference" % self.canon_name(tname)[0]) + else: + self.map.gen_type(outfile, args, schema, parent, name, indent) + + def print_as_text(self, indent): + print( + "%saddress_map_instance %s%s: offset=0x%x%s" + % ( + indent, + self.name, + str(self.count), + self.offset, + " stride=0x%x" % self.stride if self.stride else "", + ) + ) + if self.map.templatization_behavior == "top_level": + print( + "%s address_map %s%s: (top level %s)" + % (indent, self.name, str(self.count), self.map.name) + ) + else: + self.map.print_as_text(indent + " ") + + +class group(address_map): + """ + TODO: docstring + @attr stride + If @count is not (1,) this the offset from each element to the next + If @count is (1,) this should be null + @attr offset + offset from the start of the containing addres_map + """ + + def __init__(self, name, count, offset, parent, stride): + address_map.__init__(self, name, count, parent) + self.stride = stride + self.offset = offset + + def generate_binary(self, data, cache, path): + path[-1].path.append(self.name) + binary = address_map.generate_binary(self, data, cache, path) + path[-1].path.pop() + return binary + + def min_width(self): + """ + A group array's stride, unlike addressmap instance arrays, is not + rounded up to a power of two if it has to be calculated. 
+ """ + if self.stride != None: + return self.stride + else: + return address_map.min_width(self, round_to_power_of_2=False) + + def address_stride(self): + return self.stride + + def print_as_text(self, indent): + print( + "%sgroup %s%s: offset=0x%x%s" + % ( + indent, + self.name, + str(self.count), + self.offset, + " stride=0x%x" % self.stride if self.stride else "", + ) + ) + for ch in self.objs: + ch.print_as_text(indent + " ") + + +class reg(csr_composite_object): + """ + TODO: docstring + @attr parent + Containing address_map object + @attr offset + Offset from the start of the containing address_map_instance + @attr width + width in bits + @attr fields + vector of fields in the register + """ + + def __init__(self, name, count, offset, width, parent): + csr_composite_object.__init__(self, name, count) + + self.parent = parent + self.offset = offset + self.width = width + self.fields = [] + + def __str__(self): + f = "(" + sep = "" + for x in self.fields: + f += sep + str(x) + sep = ", " + f += ")" + return "reg %s fields:%s" % (self.name, f) + + def get_reset_value(self): + rv = 0 + for f in self.fields: + rv |= f.default[0] << f.lsb + return rv + + def generate_binary(self, data, cache, path, mem=None): + if data == 0: + # No-op + return None + elif isinstance(data, basestring): + # Refernce to template + + path[-1].path.append(self.name) + + type_name = self.parent + "." 
+ self.name + + if data not in cache.templates: + raise CsrException("Could not find template with name '" + data + "'") + + if cache.get_type(data) != type_name: + raise CsrException( + "Expected type of instantiated object '" + + data + + "' to be '" + + type_name + + "', found '" + + cache.get_type(data) + + "'" + ) + + cached_data = cache.get_data(data, path) + path[-1].path.pop() + + if mem: + mem.add_word(cached_data) + else: + return cached_data + elif type(data) is dict: + path[-1].path.append(self.name) + + reg_value = [0] + # TODO: put field names in path histories + for field in self.fields: + + if field.name not in data: + raise CsrException("Could not find key '" + field.name + "'") + + width = field.msb - field.lsb + 1 + if field.count == (1,): + value = data[field.name] + if type(value) is not int: + raise CsrException( + "Expected integer value for field '%s.%s' but found value of type %s" + % (self.name, field.name, type(value).__name__) + ) + elif value < 0: + raise CsrException( + "Value for field '%s.%s' is negative (%i)" + % (self.name, field.name, value) + ) + elif value <= pow(2, width): + reg_value[0] |= value << field.lsb + else: + raise CsrException( + "Width of field '%s.%s' (%i bits) not large enough to hold value (%i)" + % (self.name, field.name, width, value) + ) + else: + offset = [0] + + def field_loop(sub_data, context): + path[-1].path.append(context) + for idx in range(0, field.count[-1]): + context[-1] = idx + value = sub_data[idx] + if type(value) is not int: + raise CsrException( + "Expected integer value for field '%s.%s%s' but found value of type %s" + % ( + self.name, + field.name, + array_str(context), + type(value).__name__, + ) + ) + elif value < 0: + raise CsrException( + "Value for field '%s.%s%s' is negative (%i)" + % (self.name, field.name, array_str(context), value) + ) + elif value <= pow(2, width): + reg_value[0] |= value << field.lsb + offset[0] + offset[0] += width + else: + raise CsrException( + "Width of field 
'%s.%s%s' (%i bits) not large enough to hold value (%i)" + % (self.name, field.name, array_str(context), width, value) + ) + path[-1].path.pop() + + # TODO: check all dimension sizes + if ( + type(data[field.name]) is not list + or len(data[field.name]) != field.count[0] + ): + array_size = "x".join(map(str, field.count)) + raise CsrException( + "Expected " + + array_size + + " element array of integers at key '" + + field.name + + "'" + ) + + nd_array_loop(field.count, data[field.name], field_loop) + + path[-1].path.pop() + + if mem: + mem.add_word(reg_value[0]) + elif self.width <= 32: + return chip.direct_reg(self.offset, reg_value[0], src_key=self.name) + else: + return chip.indirect_reg(self.offset, reg_value[0], self.width, src_key=self.name) + else: + raise CsrException( + "Expected dictionary at register node '%s' but found value of type %s" + % (self.name, type(data).__name__) + ) + + def generate_template(self, inject_size): + self_dict = {} + for field in self.fields: + self_dict[field.name] = field.generate_template(inject_size) + return self.replicate(self_dict) + + def children(self): + return self.fields + + def is_singleton(self): + return len(self.fields) == 1 and self.fields[0].count == (1,) + + def singleton_obj(self): + if self.is_singleton() and self.fields[0].name == self.name: + return self.fields[0] + return self + + def address_stride(self): + return self.width // 8 + + def disabled(self): + return False + + def top_level(self): + return False + + def gen_word_expressions(self, args, prefix): + """ + generate expressions to calculate the value of each word of the register + """ + + class context: + shift = 0 + words = [] + + context.words = [None] * ((self.width + 31) // 32) + for a in self.fields: + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + if prefix: + if self.name == a.name and len(self.fields) == 1: + field_name = prefix + else: + field_name = prefix + "." 
+ field_name + context.shift = a.lsb + + def emit_field_slice(field, word, shift): + if context.words[word] is None: + context.words[word] = '' + else: + context.words[word] += " + " + if shift != 0: + context.words[word] += "(" + context.words[word] += field + if shift > 0: + context.words[word] += " << %d)" % shift + elif shift < 0: + context.words[word] += " >> %d)" % -shift + + def emit_ubits_field(index_list): + word = context.shift // 32 + shift = context.shift % 32 + name = field_name + array_str(index_list) + emit_field_slice(name, word, shift) + if shift + a.msb - a.lsb >= 32: + emit_field_slice(name, word + 1, shift - 32) + if shift + a.msb - a.lsb >= 64: + emit_field_slice(name, word + 2, shift - 64) + context.shift = context.shift + a.msb - a.lsb + 1 + + def emit_widereg_field(index_list): + word = context.shift // 32 + shift = context.shift % 32 + name = field_name + array_str(index_list) + ".value.getrange(" + emit_field_slice(name + "0, %d)" % (32 - shift), word, shift) + shift = 32 - shift + while shift < a.msb - a.lsb + 1: + word += 1 + emit_field_slice(name + "%d, 32)" % shift, word, 0) + shift += 32 + + if a.count != (1,): + if a.msb - a.lsb + 1 > 64: + count_array_loop(a.count, emit_widereg_field) + else: + count_array_loop(a.count, emit_ubits_field) + else: + if a.msb - a.lsb + 1 > 64: + emit_widereg_field(None) + else: + emit_ubits_field(None) + return context.words + + def gen_uint_conversion(self, outfile, args, classname, indent): + if self.width > 32: + return + outfile.write(indent) + if self.gen_method_declarator( + outfile, args, "", classname, "operator uint32_t", [], "const" + ): + return + outfile.write("%s return " % indent) + outfile.write("%s;\n" % self.gen_word_expressions(args, None)[0]) + outfile.write("%s}\n" % indent) + + def gen_emit_binary_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "void", + classname, + "emit_binary", + ["std::ostream &out", 
"uint64_t a"], + "const", + ): + return + indent += " " + if self.count != (1,): + pass + indirect = (self.parent.parent == "memories") or (self.name in args.write_dma) + if not indirect: + outfile.write("%sif (!disabled_) {\n" % indent) + indent += " " + pairs = enumerate(self.gen_word_expressions(args, None)) + if not indirect and args.reverse_write: + # DANGER -- certain registers must be written in reverse order (higher + # address then lower), so we reverse the order of register writes here. + # block writes must be in order (lowest to highest) as they are a block + pairs = reversed(list(pairs)) + for idx, val in pairs: + if val is None: + val = '0' + outfile.write("%sout << " % indent) + if not indirect: + outfile.write("binout::tag('R') << binout::byte4(a + %d)\n" % (idx * 4)) + outfile.write("%s << " % indent) + outfile.write("binout::byte4(%s);\n" % val) + if not indirect: + indent = indent[2:] + outfile.write("%s}\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def gen_input_binary_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "void", + classname, + "input_binary", + ["uint64_t a", "char t", "uint32_t *d", "size_t l"], + "", + ): + return + indent += ' ' + words = (self.width + 31) // 32 + indirect = self.parent.parent == "memories" or (self.name in args.write_dma) or words == 1 + # words == 1 is not really indirect, but we don't need to figure out which word is + # being written, so we can use the simpler code + zero_default = True + for a in self.fields: + if isinstance(a.default, tuple): + if reduce(ior, a.default, 0) != 0: + zero_default = False + break + elif a.default is None or a.default != 0: + zero_default = False + break + if indirect: + outfile.write( + '%sBUG_CHECK(l == %d, "expecting %d words, got %%zd in %s", l);\n' + % (indent, words, words, self.name) + ) + if zero_default: + outfile.write('%sif ((d[0]' % indent) + for i in range(1, 
words): + outfile.write('|d[%d]' % i) + outfile.write(') == 0) return;\n') + else: + outfile.write( + '%sBUG_CHECK(t == \'R\' && l == 1, "expecting direct in %s");\n' + % (indent, self.name) + ) + if zero_default: + outfile.write('%sif (d[0] == 0) return;\n' % indent) + outfile.write('%sa /= 4;\n' % indent) + for a in self.fields: + field_name = a.name + if field_name in args.cpp_reserved: + field_name += '_' + lsb = a.lsb + size = a.msb - a.lsb + 1 + + def input_ubits_field(index_list): + nonlocal lsb + outfile.write(indent) + if indirect: + word = lsb // 32 + aop = '=' + else: + outfile.write('if (a == %d) ' % (lsb // 32)) + word = 0 + aop = '|=' + outfile.write('%s%s %s ' % (field_name, array_str(index_list), aop)) + if lsb % 32 + size < 32: + outfile.write('(d[%d] >> %d) & 0x%x;\n' % (word, lsb % 32, (1 << size) - 1)) + elif lsb % 32 + size == 32: + outfile.write('d[%d] >> %d;\n' % (word, lsb % 32)) + else: + outfile.write('(d[%d] >> %d)' % (word, lsb % 32)) + if indirect: + outfile.write(' | ') + else: + outfile.write(';\n') + msb = lsb + size - 1 + for i in range(lsb // 32 + 1, msb // 32): + if indirect: + outfile.write('((uint64_t)d[%d] << %d) | ' % (i, i * 32 - lsb)) + else: + outfile.write( + '%sif (a == %d) %s%s |= (uint64_t)d[0] << %d;\n' + % (indent, i, field_name, array_str(index_list), i * 32 - lsb) + ) + if indirect: + outfile.write( + '(((uint64_t)d[%d] & 0x%x) << %d);\n' + % (msb // 32, (1 << (msb % 32 + 1)) - 1, msb // 32 * 32 - lsb) + ) + else: + outfile.write( + '%sif (a == %d) %s%s |= ((uint64_t)d[0] & 0x%x) << %d;\n' + % ( + indent, + msb // 32, + field_name, + array_str(index_list), + (1 << (msb % 32 + 1)) - 1, + msb // 32 * 32 - lsb, + ) + ) + lsb += size + + def input_widereg_field(index_list): + nonlocal lsb + outfile.write('%sBUG("widereg input not implemented");\n' % indent) + lsb += size + + if a.count != (1,): + if a.msb - a.lsb + 1 > 64: + count_array_loop(a.count, input_widereg_field) + else: + count_array_loop(a.count, 
input_ubits_field) + else: + if a.msb - a.lsb + 1 > 64: + input_widereg_field(None) + else: + input_ubits_field(None) + + indent = indent[2:] + outfile.write('%s}\n' % indent) + + def gen_binary_offset_method(self, outfile, args, classname, indent): + outfile.write(indent) + if self.gen_method_declarator( + outfile, + args, + "uint64_t", + classname, + "binary_offset", + ["const void *addr", ("int *bit_offset", "0")], + "const", + ): + return + indent += " " + outfile.write("%sif (bit_offset) {" % indent) + indent += " " + outfile.write("%s/* TDB */\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + outfile.write("%sreturn 0;\n" % indent) + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def print_as_text(self, indent): + print( + "%sreg %s%s: offset=0x%x width=%d" + % (indent, self.name, str(self.count), self.offset, self.width) + ) + for ch in self.fields: + ch.print_as_text(indent + " ") + + +class scanset_reg(reg): + """ + A register that needs to be written multiple times (same address) to hold an array + @attr parent + Containing address_map object + @attr offset + Offset from the start of the containing address_map_instance + @attr sel_offset + offset from the start of the containing address_map_instance for the selector reg + @attr width + width in bits + @attr fields + vector of fields in the register + """ + + def __init__(self, name, count, offset, width, parent, fields): + reg.__init__(self, name, count, offset, width, parent) + if isinstance(fields, list): + self.fields = fields + else: + self.fields = [fields] + + def __str__(self): + f = "(" + sep = "" + for x in self.fields: + f += sep + str(x) + sep = ", " + f += ")" + return "scanset %s%s fields:%s" % (self.name, str(self.count), f) + + def output_binary(self, outfile, args, indent, address_unit, width_unit): + # import pdb; pdb.set_trace() + name = self.name + if name in args.cpp_reserved: + name += '_' + if self.count == (1,): + raise CsrException("invalid count in 
scanset_reg") + if args.enable_disable: + outfile.write("%sif (!%s.disabled()) {\n" % (indent, name)) + indent += ' ' + if not hasattr(self, 'sel_offset'): + if args.enable_disable: + outfile.write("%sif (!%s.disabled()) {\n" % (indent, name)) + indent += ' ' + outfile.write( + "%sout << binout::tag('S') << binout::byte8(0)" % indent + + " << binout::byte4(0)\n%s " % indent + + " << binout::byte8(a + 0x%x)" % (self.offset // address_unit) + + " << binout::byte4(32) << binout::byte4(%d);\n" + % (product(self.count) * self.width // width_unit) + ) + for idx_num, idx in enumerate(self.count): + outfile.write( + '%sfor (int j%d = 0; j%d < %d; j%d++) { \n' + % (indent, idx_num, idx_num, idx, idx_num) + ) + name = name + "[j%d]" % idx_num + if hasattr(self, 'sel_offset') and idx_num == 0: + if args.enable_disable: + outfile.write("%sif (!%s.disabled()) {\n" % (indent, name)) + indent += ' ' + outfile.write( + "%sout << binout::tag('S') << binout::byte8" % indent + + "(a + 0x%x)" % (self.sel_offset // address_unit) + + " << binout::byte4(j0)\n%s " % indent + + " << binout::byte8(a + 0x%x)" % (self.offset // address_unit) + + " << binout::byte4(32) << binout::byte4(%d);\n" + % (product(self.count[1:]) * self.width // width_unit) + ) + indent += ' ' + pairs = enumerate(self.gen_word_expressions(args, name)) + for idx, val in pairs: + if val is None: + val = '0' + outfile.write("%sout << binout::byte4(%s);\n" % (indent, val)) + for i in range(0, len(self.count) + (2 if args.enable_disable else 0)): + indent = indent[2:] + outfile.write("%s}\n" % indent) + + def input_binary(self, outfile, args, indent, address_unit, width_unit): + raise CsrException("scanset_reg.input_binary not implemented") + + +class field(csr_object): + """ + TODO: docstring + @attr default + default (reset-init) value + @attr parent + containing register object + @attr msb, lsb + Range of bits in containing register for this field. 
If @count is not (1,), this + is just the first element; the second will be at lsb = msb+1 etc + """ + + def __init__(self, name, count, msb, lsb, default, parent): + csr_object.__init__(self, name, count) + + self.default = default + self.parent = parent + self.msb = msb + self.lsb = lsb + + def __str__(self): + return "%s[%d:%d]" % (self.name, self.msb, self.lsb) + + def generate_template(self, inject_size): + if inject_size: + return self.replicate(self.msb - self.lsb + 1) + else: + if self.count == (1,): + return self.default[0] + else: + return self.default + + def is_field(self): + return True + + def disabled(self): + return False + + def top_level(self): + return False + + def type_name(self, args, parent, name): + size = self.msb - self.lsb + 1 + if size > 64: + rv = "widereg<%d>" % size + else: + rv = "ubits<%d>" % size + if self.count != (1,): + if args.checked_array: + for idx in self.count: + rv = "checked_array<%d, %s>" % (idx, rv) + else: + for idx in self.count: + rv = "%s[%d]" % (rv, idx) + return rv + + def gen_type(self, outfile, args, schema, parent, name, indent): + size = self.msb - self.lsb + 1 + if args.gen_decl != 'defn': + if size > 64: + outfile.write("widereg<%d>" % size) + else: + outfile.write("ubits<%d>" % size) + + def print_as_text(self, indent): + print( + "%sfield %s%s: [%d:%d]%s" + % ( + indent, + self.name, + str(self.count), + self.msb, + self.lsb, + " default=" + str(self.default) if self.default else "", + ) + ) + + +######################################################################## +## Utility functions + + +def parse_resets(reset_str): + """ + Turn a reset value from a Semifore CSV string into a tuple of ints + + Semifore CSV formats most reset values as hex integers of the from 0x___ + Arrays of fields, however, result in comma-separated lists: + [0x__, 0x__, ...] + + If the array is 1D this function will still output the size as a 1-element + tuple, just for consistency of iterability. 
+ """ + reset_strs = reset_str.replace("[", "").replace("]", "").split(",") + resets = [int(x, 0) for x in reset_strs] + return tuple(resets) + + +def parse_array_size(size_str): + """ + Turn an array size from a Semifore CSV string into a tuple of ints + + Semifore CSV formats the size of an array as an int in square brackets. + Multidimensional arrays are just a lot of these concatenated together: + [i] + [i][j][k] + ... + + If the array is 1D this function will still output the size as a 1-element + tuple, just for consistency of iterability. + """ + size_strs = size_str.replace("]", "").split("[")[1:] + sizes = list(map(int, size_strs)) + if len(sizes) > 0: + return tuple(sizes) + else: + return (1,) + + +def parse_csrcompiler_csv(filename, section_name): + """ + Given a Semifore CSV file, parse it into a bunch of csr_object instances. + Since the chip hierarchy is contained across multiple CSV files, each one + has a unique "section name". + + @param filename The filename of the CSV file to parse + @param section_name A string meaningfully describing the contents of + the CSV (eg, "memories" and "regs") + @return A list of all addressmaps parsed out of the file. 
+ """ + + csv_field_types = { + "configuration", + "userdefined configuration", + "constant", + "counter", + "status", + "hierarchicalInterrupt", + "interrupt", + } + + csv_addressmap_types = { + "addressmap", + "userdefined addressmap", + } + + csv_register_types = { + "register", + "wide register", + "userdefined register", + "userdefined wide register", + } + + csv_group_types = { + "group", + "userdefined group", + } + + addr_maps = {} + active_addr_map = None + active_group = [] + active_reg = None + active_reg_default = 0 + + with open(filename, "rt", encoding='utf-8', errors='ignore') as csv_file: + csv_reader = csv.DictReader(csv_file) + row_num = 0 + for row in csv_reader: + array_size = parse_array_size(row["Array"]) + + active_object = None + if len(active_group) > 0: + active_container = active_group[-1] + else: + active_container = active_addr_map + + if row["Type"] in csv_addressmap_types: + addr_maps[row["Identifier"]] = address_map( + row["Identifier"], array_size, section_name + ) + active_addr_map = addr_maps[row["Identifier"]] + elif row["Type"] in csv_register_types: + reg_width = int(row["Register Size"].replace(" bits", ""), 0) + active_container.objs.append( + reg( + row["Identifier"], + array_size, + int(row["Offset"], 0), + reg_width, + active_addr_map, + ) + ) + active_reg = active_container.objs[-1] + active_object = active_reg + if row["Reset Value"] == "": + active_reg_default = 0 + else: + active_reg_default = int(row["Reset Value"], 0) + elif row["Type"] in csv_field_types: + if len(array_size) > 1: + raise CsrException( + "Multi-dimensional field arrays not currently supported (in CSV file '" + + filename + + "' line " + + str(row_num) + + ")" + ) + range_tokens = row["Position"].replace("[", "").replace("]", "").split(":") + msb = int(range_tokens[0]) + if len(range_tokens) == 1: + lsb = msb + else: + lsb = int(range_tokens[1]) + if row["Reset Value"] == "": + default = [] + elem_width = msb - lsb + 1 + elem_mask = 2**elem_width - 1 
+ for elem_idx in range(array_size[0]): + default_offset = lsb + elem_width * elem_idx + default_val = (active_reg_default >> default_offset) & elem_mask + default.append(default_val) + default = tuple(default) + else: + default = parse_resets(row["Reset Value"]) + if array_size != (1,): + if len(default) == 1: + default = default * array_size[0] + elif len(default) != array_size[0]: + raise CsrException( + "Field reset value list is not the same length as the field array itself (in CSV file '" + + filename + + "' line " + + str(row_num) + + ")" + ) + + active_reg.fields.append( + field(row["Identifier"], array_size, msb, lsb, default, active_reg) + ) + active_object = active_reg.fields[-1] + elif row["Type"] == "addressmap instance": + try: + stride = int(row["Stride"].replace(" bytes", ""), 0) + except: + stride = addr_maps[row["Type Name"]].min_width() + + active_container.objs.append( + address_map_instance( + row["Identifier"], + array_size, + int(row["Offset"], 0), + addr_maps[row["Type Name"]], + None if array_size == (1,) else stride, + ) + ) + elif row["Type"] in csv_group_types: + try: + stride = int(row["Stride"].replace(" bytes", ""), 0) + except: + stride = None + + active_container.objs.append( + group( + row["Identifier"], + array_size, + int(row["Offset"], 0), + section_name, + None if array_size == (1,) else stride, + ) + ) + active_group.append(active_container.objs[-1]) + active_object = active_group[-1] + elif row["Type"] == "endgroup": + popped_group = active_group.pop() + if popped_group.stride == None: + popped_group.stride = popped_group.min_width() + elif row["Type"] == "userdefined memory": + # ignore for now? + pass + elif row["Type"] == "reserved": + # ignore for now? + pass + elif row["Type"] == "unknown": + # ignore for now? 
+ pass + else: + raise CsrException( + "Unrecognized type '" + + row["Type"] + + "' in CSV file '" + + filename + + "' line " + + str(row_num) + ) + + if "Description" in row and row["Description"] and active_object: + active_object.description = row["Description"] + + row_num += 1 + + return addr_maps + + +def build_schema(dir, walle_version): + """ + Build a chip schema based on the top-level CSV files from Semifore + + The schema is a dictionary of dictionaries. The top-level keys are the + "sections" of the chip's interface and metadata: + - memories: Memories and large register arrays. Things like the parser + TCAM are found here. Taken from the _mem Semifore + hierarchy, in byte-granularity chip addresses + - regs: Registers, like statistics counters and MAU crossbars. + Taken from the Semifore hierarchy, in 32-bit PCIe + addresses + - _schema_hash: An MD5 hash of the CSV file contents used to generate + the rest of the schema + + The non-metadata entries contain a dictionary of all of that hierarchy's + addressmaps, mapping from addressmap name to addrses_map objects. + + @param dir A string pointing to the directory containing (a copy of) the + bfnregs repo subdir "modules/_reg" generated by Semifore + using csr_config.css + @return A new schema object + """ + new_schema = {} + schema_hash = 0 + hasher = hashlib.md5() + + version_file = os.path.join(dir, "..", "..", "VERSION") + csv_files = os.path.join(dir, "module", "csv") + if not os.path.isdir(csv_files): + csv_files = dir + + if not os.path.isfile(version_file) or not os.path.isdir(csv_files): + raise Exception( + "Directory '" + + os.path.abspath(dir) + + "' could not be opened, " + + "does not exist, or does not appear to be a valid bfnregs " + + "chip module." 
+ ) + + for filename in os.listdir(csv_files): + if filename.endswith(".csv"): + key = os.path.splitext(filename)[0] + if key == "pipe_top_level": + key = "memories" + elif key == "tofino": + key = "regs" + elif key.endswith("_mem"): + key = "memories" + elif key.endswith("_reg"): + key = "regs" + filename = os.path.join(csv_files, filename) + + new_schema[key] = parse_csrcompiler_csv(filename, key) + + with open(filename, "rb") as csv_file: + hasher.update(csv_file.read()) + + if len(new_schema) == 0: + raise Exception("No csv files found under '" + os.path.abspath(csv_files) + "'") + + with open(version_file, "r") as version_file_handle: + reg_version = version_file_handle.read() + hasher.update(reg_version.encode('utf-8')) + + new_schema["_reg_version"] = reg_version + new_schema["_walle_version"] = walle_version + new_schema["_schema_hash"] = hasher.hexdigest() + + return new_schema + + +# Unit tests +if __name__ == "__main__": + y = reg("bar", (1,), 0, 32, None) + y.fields.append(field("y1", (1,), 7, 0, y)) + y.fields.append(field("y2", (1,), 15, 8, y)) + y.fields.append(field("y3", (1,), 23, 16, y)) + y.fields.append(field("y4", (1,), 31, 24, y)) + data = { + "y1": 0x30, + "y2": 0x32, + "y3": 0x34, + "y4": 0x36, + } + z = y.generate_binary(data, None, [traversal_history("root")]) + if z.value != "0246": + print("ERROR: Expected 32-bit object to have string value '6420'") + print(" 32 bit value was " + z.value) + + y = reg("baz", (1,), 0, 40, None) + y.fields.append(field("y1", (1,), 7, 0, y)) + y.fields.append(field("y2", (1,), 15, 8, y)) + y.fields.append(field("y3", (1,), 23, 16, y)) + y.fields.append(field("y4", (1,), 31, 24, y)) + y.fields.append(field("y5", (1,), 39, 32, y)) + data = { + "y1": 0x30, + "y2": 0x32, + "y3": 0x34, + "y4": 0x36, + "y5": 0x38, + } + z = y.generate_binary(data, None, [traversal_history("root")]) + if z.value != "02468": + print("Expected 40-bit field to have string value '86420'") + print(" 40 bit value was " + z.value) diff 
--git a/backends/tofino/bf-asm/walle/walle.py b/backends/tofino/bf-asm/walle/walle.py new file mode 100755 index 00000000000..ec2f3ebef53 --- /dev/null +++ b/backends/tofino/bf-asm/walle/walle.py @@ -0,0 +1,830 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 +""" +Walle - JSON-to-binary cruncher tool +See README.md for usage instructions + +The code is organized into three main modules: + - walle: Command-line interface and glue code + - csr: Code dealing with compiler-facing JSON files and raw Semifore CSV + files + - chip: Code dealing with driver-facing binary config files + +The main program flow is, on a first run of Walle: + - The csr module is used to parse Semifore CSV files into the classes + that inherit from csr_object, each of which being a Python representation + of a Semifore object. + - These objects are pickled into a file called "chip.schema" so the raw CSV + does not have to be used again or distributed with the toolchain. + +Thereafter, Walle operates on JSON files that mirror the structure of the +Semifore hierarchy and assign integer values to register fields from the +schema. The details of this format are specified in the README.md fileself. + - To generate blank JSON, the csr module will recursively call the + generate_template() methods of the csr_objects in the schema. 
+ - To crunch JSON into binary, the csr module will recursively call + generate_binary() methods of the csr_objects in the schema, passing + along the relevent tree of JSON data. These methods create a flat list of + objects that represent driver write operations, all of which are classes + from the chip module that inherit from chip_obj. + - The flat list of chip objects is looped over, calling each one's bytes() + method which returns the actual binary string to be passed to the driver. + These bytes are concatenated onto the binary file being output. + The address of this write may be manipulated, since Semifore addresses + are auto-generated and may need to be operated on before they appear as + the chip expects (for instance, chip memories are word-addressed while + Semifore addresses are byte-addressed) + +It's important to note that all addresses calculated within one JSON file are +relative, so to produce correct chip addresses the binary __must__ be +calculated starting at a top-level addressmap in the Semifore hierarchy. When +expanding a JSON config to contain data instead of references to other JSON +files, Walle will alter the addresses calculated for the _included_ JSON +to be relative to the addresses in the _including_ JSON. +""" +import argparse +import copy +import json +import os +import pickle +import subprocess +import sys + +import chip +import csr +import yaml + +__version__ = '0.4.13' + +######################################################################## +## Utility functions + + +class CsrUnpickler(pickle.Unpickler): + """ + This module is a hacky fix for a bug that sometimes shows up when using a + chip.schema file generated across different systems. 
+ + Specifically: + On system A, walle is a globally installed package via setup.py and can + be accessed from the terminal with the command 'walle' + On system B, walle is NOT globally installed and instead accessed locally + by directly pointing to walle.py + + The chip.schema files generated on A and B will encode Python classes + from two different module paths. A will have classes from "walle.csr" while + B will have classes from "csr". If we move a schema from one system to + another, the module lookup will fail. + + We fix this by looking for references to classes from the 'csr' module and + directly retrieving them from the csr module that this top-level script has + already imported. + """ + + def find_class(self, module, name): + if module == "csr" or module[-4:] == ".csr": + return getattr(csr, name) + else: + return pickle.Unpickler.find_class(self, module, name) + + +def annotate_names(obj, threshold, path=""): + if type(obj) is list: + if threshold > 0 and len(obj) > threshold: + for idx, elem in enumerate(obj): + if type(elem) is dict: + elem["_absolute_name"] = path + "_" + str(idx) + annotate_names(elem, threshold, path) + else: + for elem in obj: + annotate_names(elem, threshold, path) + elif type(obj) is dict: + for key, elem in list(obj.items()): + annotate_names(elem, threshold, (path + "_" if len(path) > 0 else "") + key) + + +def print_schema_info(schema_file, schema): + hierarchies = [] + sys.stdout.write("file: " + schema_file + "\n") + for key in schema: + if key[0] == "_": + sys.stdout.write(key[1:] + ": " + str(schema[key]) + "\n") + else: + hierarchies.append(key) + sys.stdout.write("hierarchies: " + ", ".join(hierarchies) + "\n") + + +def parse_template_args(args, params): + """ + Extend argparse.Namespace with additional arguments for cpp generation + from templates.yaml that may not exist as command line arguments + FIXME -- should be a way to do this with ArgParse? 
+ """ + + def bool_arg(args, attr, val): + if not type(val) is bool: + raise Exception("Attribute " + attr + " requires bool argument, got " + str(val)) + setattr(args, attr, val) + + def str_arg(args, attr, val): + if not type(val) is str: + raise Exception("Attribute " + attr + " requires string argument, got " + str(val)) + setattr(args, attr, val) + + def add_list_arg(args, attr, val): + getattr(args, attr).append(val) + + def add_set_arg(args, attr, val): + getattr(args, attr).add(val) + + def set_decl(args, attr, val): + args.gen_decl = 'decl' + + def set_defn(args, attr, val): + args.gen_decl = 'defn' + + def no_arg(args, attr, val): + pass + + options = { + 'alias_array': (True, bool_arg), + 'binary_offset': (False, bool_arg), + 'checked_array': (True, bool_arg), + 'decl': (None, set_decl), + 'delete_copy': (False, bool_arg), + 'defn': (None, set_defn), + 'dump_unread': (False, bool_arg), + 'emit_binary': (False, bool_arg), + 'emit_fieldname': (False, bool_arg), + 'emit_json': (False, bool_arg), + 'enable_disable': (False, bool_arg), + 'expand_disabled_vector': (False, bool_arg), + 'gen_decl': ('both', str_arg), + 'global_types': (set(), add_set_arg), + 'global': (None, lambda args, attr, val: add_set_arg(args, 'global_types', val)), + 'include': ([], add_list_arg), + 'input_binary': (False, bool_arg), + 'name': (None, str_arg), + 'namespace': (False, str_arg), + 'reverse_write': (False, bool_arg), + 'rewrite': ({}, no_arg), + 'rewrite_used': ({}, no_arg), + 'unpack_json': (False, bool_arg), + 'widereg': (False, bool_arg), + 'write_dma': (set(), add_set_arg), + } + + if not hasattr(args, 'cpp_reserved'): + args.cpp_reserved = set( + [ + "and", + "asm", + "auto", + "break", + "case", + "catch", + "char", + "class", + "const", + "continue", + "default", + "delete", + "do", + "double", + "else", + "enum", + "extern", + "float", + "for", + "friend", + "goto", + "if", + "inline", + "int", + "long", + "new", + "not", + "or", + "operator", + "private", + 
"protected", + "public", + "register", + "return", + "short", + "signed", + "sizeof", + "static", + "struct", + "switch", + "template", + "this", + "throw", + "try", + "typedef", + "union", + "unsigned", + "virtual", + "void", + "volatile", + "while", + "xor", + ] + ) + for opt in options: + if options[opt][0] is not None: + if hasattr(args, opt): + setattr(args, opt, copy.copy(getattr(args, opt))) + else: + setattr(args, opt, copy.copy(options[opt][0])) + for p in params: + s = p.split('=', 1) + if p in options: + options[p][1](args, p, True) + elif p[0] == '-' and p[1:] in options: + options[p[1:]][1](args, p[1:], False) + elif s[0] in options: + options[s[0]][1](args, s[0], s[1]) + elif p[:2] == "-I": + args.include.append(p[2:]) + else: + sys.stderr.write("Unknown parameter %s\n" % str(p)) + + if args.enable_disable: + args.cpp_reserved = args.cpp_reserved.copy() + args.cpp_reserved.update( + [ + "disable", + "disabled", + "disable_if_unmodified", + "disable_if_zero", + "enable", + "modified", + "set_modified", + ] + ) + + +def read_template_file(template_file, args, schema): + with open(template_file, "rb") as template_objects_file: + templatization_cfg = yaml.load(template_objects_file, Loader=yaml.SafeLoader) + top_level_objs = templatization_cfg["generate"] + disabled_objs = templatization_cfg["ignore"] + if "global" in templatization_cfg: + parse_template_args(args, templatization_cfg["global"]) + for section_name, section in list(schema.items()): + if section_name not in top_level_objs: + if section_name[0] != "_": + sys.stderr.write("no template cfg for " + section_name + ", ignoring\n") + continue + for obj in top_level_objs[section_name]: + section[obj].templatization_behavior = "top_level" + section[obj].object_name = None + if top_level_objs[section_name][obj] is None: + continue + for fname, params in list(top_level_objs[section_name][obj].items()): + for p in params: + if p[:5] == 'name=': + section[obj].object_name = p[5:] + break + for obj in 
disabled_objs[section_name]: + if section[obj].templatization_behavior != None: + raise Exception(obj + " cannot be both templatized and ignored") + section[obj].templatization_behavior = "disabled" + return top_level_objs + + +def generate_templates(args, schema): + if args.o == None: + args.o = "templates" + if not os.path.exists(args.o): + os.makedirs(args.o) + + top_level_objs = read_template_file(args.generate_templates, args, schema) + for section_name, section in list(schema.items()): + if section_name not in top_level_objs: + continue + for top_level_obj in top_level_objs[section_name]: + template = section[top_level_obj].generate_template(False) + sizes = section[top_level_obj].generate_template(True) + + if args.template_indices != None: + annotate_names(template, args.template_indices) + annotate_names(sizes, args.template_indices) + + # Copy in schema metadata + schema_metadata = [key for key in list(schema.keys()) if key[0] == "_"] + for metadata in schema_metadata: + template[metadata] = schema[metadata] + sizes[metadata] = schema[metadata] + template["_section"] = section_name + sizes["_section"] = section_name + + cfg_name = section_name + "." + top_level_obj + ".cfg.json" + with open(os.path.join(args.o, cfg_name), "wb") as outfile: + json.dump(template, outfile, indent=4, sort_keys=True) + size_name = section_name + "." + top_level_obj + ".size.json" + with open(os.path.join(args.o, size_name), "wb") as outfile: + json.dump(sizes, outfile, indent=4, sort_keys=True) + + +def arbitrary_ASCII_text_to_52digit_decimal_hash(input): + import hashlib + + # 52 characters left in 63 after the prefix "IDENTIFIER_", + # and math.log2(10**52) => 172.74026093414284, + # and math.log2(10**52)/8 => 21.592532616767855, + # so going to request 22 bytes of hash digest. + # the next line of commented-out code is _fantastic_ in/on Python 3.8.10, + # but fails in/on Python 3.5.2 [as present in/on the Jarvis image on my old BXDSW VM as of Sept. 
7 2022 1:40am NY time] + ### hash_digest_as_bytes = hashlib.shake_256( bytes(input, "ASCII") ).digest(22) + hash_digest_as_bytes = hashlib.sha224(bytes(input, "ASCII")).digest() + # sha224 => 28 bytes of digest, the closest match that is >= 22 bytes and available in/on Python 3.5.2 + + hash_digest_as_int = int.from_bytes(hash_digest_as_bytes, "big") + return ("%052d" % hash_digest_as_int)[:52] + + +def arbitrary_text_to_valid_C_identifier( + input_iterable_of_characters, dry_run_to_get_hash_input=False +): + """Takes a single input, which must be an iterable of characters for correct behavior to be + promised. When given valid input, returns a string that is a valid C and C++ identifier, + regardless of what characters are used in the input. + + _Intentionally_ *not* considering [ASCII] underscores as OK to copy untranslated as-is, + since _both_ leading underscores _and_ 2-or-more underscores in a row are considered as + ''reserved'' by the ISO C++ standard [and probably also by the ISO C standard]. + + _Only_ ASCII alphanumerics are ''OK as is''. + + Quoting : + + "The C standard requires only that the first 63 be significant" + + In other words, the first 63 characters are definitely going to be "paid attention to", + and the rest may be handled as "comments". I think we are probably safe with shifting + our upper bound to 200 or 999 characters. + + Using a decimal hash to almost-guarantee uniqueness in the first 63 characters.""" + + if (not input_iterable_of_characters) or (len(input_iterable_of_characters) < 1): + raise ValueError("This function requires an input of positive length.") + + INCLUSIVE_MAX_OUTPUT_LENGTH = 255 # D. R. Y. 
+ + temp_ASCIIonly_string = "" + for char in input_iterable_of_characters: + if ( + len(temp_ASCIIonly_string) > 999 + ): # there`s not much good in letting it go on for an arbitrarily-long time + break + if ( + '/' == char + ): # {part 1 of 3} of a kludge so that we can include path separators in the hash input + temp_ASCIIonly_string += char + if char.isalnum() and (ord(char) >= 32) and (ord(char) <= 126): + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # "char.isascii()" does _not_ always work + temp_ASCIIonly_string += char + elif not (temp_ASCIIonly_string.endswith('_') or temp_ASCIIonly_string.endswith('/')): + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # {part 2 of 3} of a kludge so that we can include path separators in the hash input + temp_ASCIIonly_string += '_' + + if dry_run_to_get_hash_input: + return temp_ASCIIonly_string + + result = "IDENTIFIER_" + arbitrary_ASCII_text_to_52digit_decimal_hash(temp_ASCIIonly_string) + if not temp_ASCIIonly_string.startswith('_'): + result += '_' + result += temp_ASCIIonly_string + + result = result[:INCLUSIVE_MAX_OUTPUT_LENGTH] + result = result.replace( + '/', '_' + ) # {part 3 of 3} of a kludge so that we can include path separators in the hash input + return result + + +def pathname_to_valid_C_identifier(file_pathname, dry_run_to_get_hash_input=False): + """This makes the assumption that the input is a string + [or at least "string-like object"] + with the data in a format along the lines of "/a/b/c/d/e/file" + """ + assert len(file_pathname) > 0 + + first_char_upper_case = lambda x: "" if (len(x) < 1) else x[0].upper() + x[1:].lower() + + # somewhat hackish... does anybody want to propose an "elegant" alternative for the next 3 lines? 
+ if file_pathname.endswith(".cpp"): + file_pathname = file_pathname[:-4] + if file_pathname.endswith(".hpp"): + file_pathname = file_pathname[:-4] + if file_pathname.endswith(".h"): + file_pathname = file_pathname[:-2] + + split = file_pathname.split('/') # POSIXism warning re '/' + file = first_char_upper_case(split[-1]) + last_4_dirs_if_possible = [ + first_char_upper_case(x) for x in split[-5:-1] + ] # worst-case scenario, this is an empty list + + return arbitrary_text_to_valid_C_identifier( + '/'.join(last_4_dirs_if_possible + [file]), dry_run_to_get_hash_input + ) + + +def generate_cPlusPlus_file(outfile, top_level, args, schema, file_basename): + outfile.write( + "/* Autogenerated from %s and %s -- DO NOT EDIT */\n" % (args.schema, args.generate_cpp) + ) + + fake_pathname = args.o + '/' + top_level.name + '/' + file_basename + + synthetic_identifier = pathname_to_valid_C_identifier(fake_pathname) + outfile.write("/* --- vvv --- DEBUG --- vvv ---\n") + outfile.write("DEBUG: args.o = ''%s''\n" % args.o) + outfile.write("DEBUG: file_basename = ''%s''\n" % file_basename) + outfile.write("\n") + outfile.write("DEBUG: args.schema = ''%s''\n" % args.schema) + outfile.write("DEBUG: top_level.name = ''%s''\n" % top_level.name) + outfile.write("DEBUG: top_level.parent = ''%s''\n" % top_level.parent) + outfile.write("\n") + outfile.write("DEBUG: fake_pathname = ''%s''\n" % fake_pathname) + outfile.write("\n") + outfile.write( + "DEBUG: input to hash algo.: ''%s''\n" + % pathname_to_valid_C_identifier(fake_pathname, dry_run_to_get_hash_input=True) + ) + outfile.write(" --- ^^^ --- DEBUG --- ^^^ --- */\n") + del fake_pathname + + if args.gen_decl == 'decl': + outfile.write('#ifndef %s\n' % synthetic_identifier) + outfile.write('#define %s 1\n\n' % synthetic_identifier) + + for incl in args.include: + outfile.write('#include "%s"\n' % incl) + if args.emit_json or args.emit_fieldname or args.dump_unread: + outfile.write('#include "lib/indent.h"\n') + if args.unpack_json: 
+ outfile.write('#include "backends/tofino/bf-asm/json.h"\n') + if args.alias_array: + outfile.write('#include "backends/tofino/bf-asm/alias_array.h"\n') + if args.checked_array: + outfile.write('#include "backends/tofino/bf-asm/checked_array.h"\n') + if args.emit_binary: + outfile.write('#include "backends/tofino/bf-asm/binary_output.h"\n') + outfile.write('#include "backends/tofino/bf-asm/ubits.h"\n') + outfile.write('#include "backends/tofino/bf-asm/register_reference.h"\n') + if args.widereg: + outfile.write('#include "backends/tofino/bf-asm/widereg.h"\n') + outfile.write('\n') + outfile.write("using namespace P4;") + if len(args.global_types) > 0: + args.global_types_generated = {} + top_level.gen_global_types(outfile, args, schema) + if args.namespace: + outfile.write('namespace %s {\n\n' % args.namespace) + top_level.generate_cpp(outfile, args, schema) + outfile.write(";\n") + if args.namespace: + outfile.write('\n} // end namespace %s\n\n' % args.namespace) + if args.gen_decl == 'decl': + outfile.write('\n#endif /* end of "ifndef %s" */\n' % synthetic_identifier) + + +def extend_args(args, params): + """ + parse additional template arguments into a copy of 'args' + """ + args = copy.copy(args) + parse_template_args(args, params) + return args + + +def generate_cpp(args, schema): + if args.o == None: + args.o = "gen" + if not os.path.exists(args.o): + os.makedirs(args.o) + + top_level_objs = read_template_file(args.generate_cpp, args, schema) + global_args = args + for section_name, section in list(schema.items()): + if section_name not in top_level_objs: + continue + for top_level_obj, files in list(top_level_objs[section_name].items()): + if files is None: + continue + args = global_args + if 'args' in files: + args = extend_args(args, files['args']) + if 'rewrite' in files: + args = copy.copy(args) + args.rewrite = files['rewrite'] + for generate_file, params in list(files.items()): + if generate_file == 'args': + continue + if generate_file == 'rewrite': 
+ continue + if ( + (("DEBUG" in globals().keys()) and globals()["DEBUG"]) + or ("DEBUG" in locals().keys()) + and locals()["DEBUG"] + ): + print("===vvv=== DEBUG ===vvv===") + print("globals:", globals()) + print("locals:", locals()) + print("===^^^=== DEBUG ===^^^===") + generate_cPlusPlus_file( + open(os.path.join(args.o, generate_file), "w"), + section[top_level_obj], + extend_args(args, params), + schema, + generate_file, + ) + + +def print_schema_text(args, schema): + def do_print(indent, obj): + for key, val in list(obj.items()): + if type(val) is str: + print("%s%s: %s" % (indent, key, val)) + elif type(val) is dict: + print("%s%s:" % (indent, key)) + do_print(indent + " ", val) + elif val.templatization_behavior == "top_level": + print("%s%s:" % (indent, key)) + val.print_as_text(indent + " ") + + read_template_file(args.print_schema, args, schema) + do_print("", schema) + + +def build_binary_cache(args, schema): + cache = csr.binary_cache(schema) + try: + for config_filename in args.configs: + with open(config_filename, "rb") as configfile: + try: + template = json.load(configfile) + except: + sys.stderr.write( + "ERROR: Input file '" + + config_filename + + "' could not be decoded as JSON.\n" + ) + sys.exit(1) + + if type(template) is not dict or "_name" not in template or "_type" not in template: + sys.stderr.write( + "ERROR: Input file '" + + config_filename + + "' does not appear to be valid Walle configuration JSON.\n" + ) + sys.exit(1) + + if ( + "_schema_hash" not in template + or template["_schema_hash"] != schema["_schema_hash"] + ): + sys.stderr.write( + "ERROR: Input file '" + + config_filename + + "' does not match the current chip schema.\n" + ) + if not args.ignore_schema_mismatch: + sys.exit(1) + + cache.templates[template["_name"]] = template + except IOError as e: + sys.stderr.write( + "ERROR: Could not open '%s' for reading: %s (errno %i).\n" + % (config_filename, e[1], e[0]) + ) + sys.exit(e[0]) + + return cache + + +def 
dump_binary(args, binary_cache, out_file): + addr_func = { + # Memories are ram-word addressed, not byte addressed + "memories": lambda addr: addr >> 4, + # TODO: use actual func once model+indirect writes are fixed + "regs": lambda addr: addr, + # # Regs are give in 32-bit PCIe address space and need to be + # # converted to 42-bit chip address space + # "regs": lambda addr: ((addr&0x0FF80000)<<14)|(addr&0x0007FFFF) + } + + for template in args.top: + try: + path = [] + data = binary_cache.get_data(template, path=path) + data_type = binary_cache.get_type(template) + except csr.CsrException as e: + # TODO: decompose: + sys.stderr.write("ERROR: " + str(e) + "\n") + tb = [] + for frame in path: + tb.append("{" + frame.template_name + "}") + arr_subscript = None + for node in frame.path: + if type(node) is str: + tb.append(node) + if arr_subscript != None: + tb[-1] += arr_subscript + arr_subscript = None + elif type(node) is list: + arr_subscript = csr.array_str(node) + else: + tb.append(str(node)) + sys.stderr.write("Traceback: " + ".".join(tb) + "\n") + sys.exit(1) + + template_section = data_type.split(".")[0] + for chip_obj in data: + chip_obj.addr = addr_func[template_section](chip_obj.addr) + out_file.write(chip_obj.bytes()) + + if args.append_sentinel: + out_file.write(chip.direct_reg(0xFFFFFFFF, 0).bytes()) + + +def walle_process(parser, args=None): + if len(args.top) == 0: + args.top = ["memories.top", "regs.top"] + + if args.generate_schema != None: + schema = csr.build_schema(args.generate_schema, __version__) + with open(args.schema, "wb") as outfile: + pickle.dump(schema, outfile, protocol=2) + print("Schema generated from:\n") + cmd = 'echo | git -C %s log -1' % args.generate_schema + output = subprocess.check_output(cmd, shell=True) + print(output.decode('utf-8')) + outfile.write(b'\n\n' + output) + + if args.generate_templates != None: + if not os.path.isfile(args.schema): + sys.stderr.write( + "ERROR: Schema file '" + + os.path.abspath(args.schema) + 
+ "' could not be opened or does not exist.\n" + ) + sys.exit(1) + generate_templates(args, schema) + else: + + if not os.path.isfile(args.schema): + sys.stderr.write( + "ERROR: Schema file '" + + os.path.abspath(args.schema) + + "' could not be opened or does not exist.\n" + ) + sys.exit(1) + + with open(args.schema, "rb") as infile: + schema = CsrUnpickler(infile).load() + + if args.schema_info: + print_schema_info(os.path.abspath(args.schema), schema) + elif args.dump_schema: + print(yaml.dump(schema)) + elif args.print_schema: + print_schema_text(args, schema) + elif args.generate_templates != None: + generate_templates(args, schema) + elif args.generate_cpp != None: + generate_cpp(args, schema) + else: + if len(args.configs) == 0: + parser.print_help() + else: + if args.o == None: + args.o = "a.out" + cache = build_binary_cache(args, schema) + with open(args.o, "wb") as binfile: + dump_binary(args, cache, binfile) + + sys.stdout.write("Binary '" + args.o + "' generated successfully.\n") + + +def main(): + """ + The main entry point for the script + """ + + parser = argparse.ArgumentParser() + parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) + parser.add_argument( + "--schema", + '-s', + metavar='SCHEMA-FILE', + help="The chip schema to use", + type=str, + default="chip.schema", + ) + parser.add_argument( + "--schema-info", + action='store_true', + help="Print metadata stored in the selected chip schema and exit", + ) + parser.add_argument("--target", "-t", help="The chip target", type=str, default="tofino") + parser.add_argument("--dump-schema", action='store_true', help="Dump chip schema as yaml") + parser.add_argument( + "--print-schema", + metavar='TOP-LEVEL-OBJS-FILE', + type=str, + help="Dump chip schema as (readable?) 
text", + ) + parser.add_argument( + "--generate-schema", + metavar='BFNREGS-TARGET-DIR', + type=str, + help="Generate a chip schema from the bfnregs target regs directory", + default=None, + ) + parser.add_argument( + "--ignore-schema-mismatch", + action='store_true', + help="Attempt to crunch input files, even if they do not match the current chip schema", + ) + parser.add_argument( + "--generate-templates", + metavar='TOP-LEVEL-OBJS-FILE', + type=str, + help="Generate an 'all-0s' template for each addressmap listed in the given top-level objects file", + ) + parser.add_argument( + "--generate-cpp", + metavar='TOP-LEVEL-OBJS-FILE', + type=str, + help="Generate C++ code for each addressmap listed in the given top-level objects file", + ) + parser.add_argument( + "--template-indices", + metavar='THRESHOLD', + help="Include human-readable index keys for register arrays greater than the specified threshold size", + type=int, + default=None, + ) + parser.add_argument( + "--append-sentinel", + action='store_true', + help="Append a direct register write to address 0xFFFFFFFF to the end of the binary output", + ) + parser.add_argument( + '--top', + metavar='IDENTIFIER', + type=str, + action='append', + default=[], + help='Identifier of a template to generate binary config data for', + ) + parser.add_argument( + '-o', + metavar='FILE', + type=str, + default=None, + help='Name of file to write binary config data into (or directory to write templates into)', + ) + parser.add_argument( + 'configs', + metavar='CONFIG-FILE', + type=str, + nargs='*', + help='A JSON configuration file to process', + ) + + args = parser.parse_args() + if getattr(sys, 'frozen', False): + # running as a bundle: look for the schema in the bundled directory + args.schema = os.path.join(sys._MEIPASS, 'lib', args.target, 'chip.schema') + walle_process(parser, args) + + +######################################################################## +## Frontend logic + +if __name__ == "__main__": + main() diff 
--git a/backends/tofino/bf-asm/widereg.cpp b/backends/tofino/bf-asm/widereg.cpp new file mode 100644 index 00000000000..f6b223423c0 --- /dev/null +++ b/backends/tofino/bf-asm/widereg.cpp @@ -0,0 +1,29 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "widereg.h" + +#include +#include + +#include "lib/log.h" + +void widereg_base::log(const char *op, bitvec v) const { + std::ostringstream tmp; + LOG1(this << ' ' << op << ' ' << v + << (v != value ? tmp << " (now " << value << ")", tmp : tmp).str()); +} diff --git a/backends/tofino/bf-asm/widereg.h b/backends/tofino/bf-asm/widereg.h new file mode 100644 index 00000000000..f6f8e37a05e --- /dev/null +++ b/backends/tofino/bf-asm/widereg.h @@ -0,0 +1,170 @@ +/** + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef BACKENDS_TOFINO_BF_ASM_WIDEREG_H_ +#define BACKENDS_TOFINO_BF_ASM_WIDEREG_H_ + +#include + +#include +#include +#include + +#include "lib/bitvec.h" +#include "lib/log.h" + +using namespace P4; + +void print_regname(std::ostream &out, const void *addr, const void *end); + +struct widereg_base; + +struct widereg_base { + bitvec value, reset_value; + mutable bool read, write; + mutable bool disabled_; + + widereg_base() : read(false), write(false), disabled_(false) {} + explicit widereg_base(bitvec v) + : value(v), reset_value(v), read(false), write(false), disabled_(false) {} + explicit widereg_base(uintptr_t v) + : value(v), reset_value(v), read(false), write(false), disabled_(false) {} +#if __WORDSIZE == 64 + // For 32-bit systems intptr_t is defined as int + explicit widereg_base(intptr_t v) + : value(v), reset_value(v), read(false), write(false), disabled_(false) {} +#endif + explicit widereg_base(int v) + : value(v), reset_value(v), read(false), write(false), disabled_(false) {} + operator bitvec() const { + read = true; + return value; + } + bool modified() const { return write; } + void set_modified(bool v = true) { write = v; } + bool disabled() const { return disabled_; } + bool disable_if_unmodified() { return write ? false : (disabled_ = true); } + bool disable_if_zero() const { return value.empty() && !write; } + bool disable_if_reset_value() { return value == reset_value ? 
(disabled_ = true) : false; } + bool disable() const { + if (write) { + LOG1("ERROR: Disabling modified register in " << this); + return false; + } + disabled_ = true; + return disabled_; + } + void enable() const { disabled_ = false; } + void rewrite() { write = false; } + virtual bitvec operator=(bitvec v) = 0; + virtual unsigned size() = 0; + void log(const char *op, bitvec v) const; +}; + +inline static unsigned int to_unsigned(const bitvec &v) { + std::stringstream ss; + ss << v; + std::string str(ss.str()); + unsigned int rv = std::strtoul(str.c_str(), 0, 16); + return rv; +} + +inline std::ostream &operator<<(std::ostream &out, const widereg_base *u) { + print_regname(out, u, u + 1); + return out; +} +inline std::ostream &operator<<(std::ostream &out, const widereg_base &u) { + return out << to_unsigned(u.value); +} + +template +struct widereg : widereg_base { + widereg() : widereg_base() {} + const widereg &check() { + if (value.max().index() >= N) { + LOG1("ERROR: out of range for " << N << " bits in " << this); + value.clrrange(N, value.max().index() - N + 1); + } + return *this; + } + explicit widereg(bitvec v) : widereg_base(v) { check(); } + explicit widereg(uintptr_t v) : widereg_base(v) { check(); } +#if __WORDSIZE == 64 + // For 32-bit systems intptr_t is defined as int + explicit widereg(intptr_t v) : widereg_base(v) { check(); } +#endif + explicit widereg(int v) : widereg_base(v) { check(); } + widereg(const widereg &) = delete; + widereg(widereg &&) = default; + bitvec operator=(bitvec v) { + if (disabled_) LOG1("ERROR: Writing disabled register value in " << this); + if (write) LOG1("WARNING: Overwriting " << value << " with " << v << " in " << this); + value = v; + write = true; + log("=", v); + check(); + return v; + } + uintptr_t operator=(uintptr_t v) { + *this = bitvec(v); + return v; + } + intptr_t operator=(intptr_t v) { + *this = bitvec(v); + return v; + } + const widereg &operator=(const widereg &v) { + *this = v.value; + v.read = true; 
+ return v; + } + const widereg_base &operator=(const widereg_base &v) { + *this = v.value; + v.read = true; + return v; + } + unsigned size() { return N; } + const widereg &operator|=(bitvec v) { + if (disabled_) LOG1("ERROR: Writing disabled register value in " << this); + if (write) + LOG1("WARNING: Overwriting " << value << " with " << (v | value) << " in " << this); + value |= v; + write = true; + log("|=", v); + return check(); + } + const widereg &set_subfield(uintptr_t v, unsigned bit, unsigned size) { + if (disabled_) LOG1("ERROR: Writing disabled register value in " << this); + if (bit + size > N) { + LOG1("ERROR: subfield " << bit << ".." << (bit + size - 1) << " out of range in " + << this); + } else if (auto o = value.getrange(bit, size)) { + if (write) + LOG1((o != v ? "ERROR" : "WARNING") + << ": Overwriting subfield(" << bit << ".." << (bit + size - 1) << ") value " + << o << " with " << v << " in " << this); + } + if (v >= (1U << size)) + LOG1("ERROR: Subfield value " << v << " too large for " << size << " bits in " << this); + value.putrange(bit, size, v); + write = true; + log("|=", bitvec(v) << bit); + return check(); + } +}; + +#endif /* BACKENDS_TOFINO_BF_ASM_WIDEREG_H_ */ diff --git a/backends/tofino/bf-p4c/CMakeLists.txt b/backends/tofino/bf-p4c/CMakeLists.txt index 0a5410a6d3e..412b5f3818b 100644 --- a/backends/tofino/bf-p4c/CMakeLists.txt +++ b/backends/tofino/bf-p4c/CMakeLists.txt @@ -558,9 +558,6 @@ target_compile_definitions(bfp4c INTERFACE "-DCONFIG_PREFIX=\"${CMAKE_INSTALL_PREFIX}\"" INTERFACE "-DCONFIG_PKGDATADIR=\"${CMAKE_INSTALL_PREFIX}/${P4C_ARTIFACTS_OUTPUT_DIRECTORY}\"" ) -target_compile_definitions(bfp4c - INTERFACE "-DHAVE_JBAY=1" -) if (ENABLE_BAREFOOT_INTERNAL) target_compile_definitions(bfp4c INTERFACE "-DBAREFOOT_INTERNAL=1" diff --git a/backends/tofino/bf-p4c/arch/psa/psa.cpp b/backends/tofino/bf-p4c/arch/psa/psa.cpp index dd430a6a501..c93f63e561c 100644 --- a/backends/tofino/bf-p4c/arch/psa/psa.cpp +++ 
b/backends/tofino/bf-p4c/arch/psa/psa.cpp @@ -900,13 +900,10 @@ class LoadTargetArchitecture : public Inspector { if (Device::currentDevice() == Device::TOFINO) { filenames.push_back("tofino1_specs.p4"); filenames.push_back("tofino1_base.p4"); - } -#if HAVE_JBAY - else { + } else { filenames.push_back("tofino2_specs.p4"); filenames.push_back("tofino2_base.p4"); } -#endif // HAVE_JBAY filenames.push_back("tofino/stratum.p4"); filenames.push_back("tofino/p4_14_prim.p4"); diff --git a/backends/tofino/bf-p4c/arch/v1model.cpp b/backends/tofino/bf-p4c/arch/v1model.cpp index b02b32930f4..9811778738b 100644 --- a/backends/tofino/bf-p4c/arch/v1model.cpp +++ b/backends/tofino/bf-p4c/arch/v1model.cpp @@ -170,7 +170,6 @@ class LoadTargetArchitecture : public Inspector { structure->addMetadata(EGRESS, MetadataField{"standard_metadata"_cs, "egress_rid"_cs, 16}, MetadataField{"eg_intr_md"_cs, "egress_rid"_cs, 16}); -#ifdef HAVE_JBAY structure->addMetadata(INGRESS, MetadataField{"ig_intr_md_for_mb"_cs, "mirror_io_select"_cs, 1}, MetadataField{"ig_intr_md_for_dprsr"_cs, "mirror_io_select"_cs, 1}); @@ -225,7 +224,6 @@ class LoadTargetArchitecture : public Inspector { structure->addMetadata( EGRESS, MetadataField{"eg_intr_md_for_mb"_cs, "mirror_coalesce_length"_cs, 8}, MetadataField{"eg_intr_md_for_dprsr"_cs, "mirror_coalesce_length"_cs, 8}); -#endif } void analyzeTofinoModel() { @@ -313,14 +311,10 @@ class LoadTargetArchitecture : public Inspector { if (Device::currentDevice() == Device::TOFINO) { filenames.push_back("tofino1_specs.p4"); filenames.push_back("tofino1_base.p4"); - } -#if HAVE_JBAY - else if (Device::currentDevice() == Device::JBAY) { + } else if (Device::currentDevice() == Device::JBAY) { filenames.push_back("tofino2_specs.p4"); filenames.push_back("tofino2_base.p4"); - } -#endif // HAVE_JBAY - else + } else BUG("Unsupported device id %s", Device::currentDevice()); filenames.push_back("tofino/stratum.p4"); filenames.push_back("tofino/p4_14_prim.p4"); diff --git 
a/backends/tofino/bf-p4c/driver/barefoot.py b/backends/tofino/bf-p4c/driver/barefoot.py index 23233325fa5..a0af756b1f1 100755 --- a/backends/tofino/bf-p4c/driver/barefoot.py +++ b/backends/tofino/bf-p4c/driver/barefoot.py @@ -179,12 +179,6 @@ def add_command_line_options(self): default=False, help="Add source outputs to the archive.", ) - self._argGroup.add_argument( - "--enable-bf-asm", - action="store_true", - default=False, - help="Use the assembler to generate a binary.", - ) self._argGroup.add_argument( "--bf-rt-schema", action="store", @@ -487,19 +481,18 @@ def process_command_line_options(self, opts): """! Main parsing or command line options @param opts Object holding set arguments """ - # Add assembler options if they are available. - if opts.enable_bf_asm or os.getenv("ENABLE_BF_ASM"): - if os.environ['P4C_BUILD_TYPE'] == "DEVELOPER": - bfas = find_file('bf-asm', 'bfas') - else: - bfas = find_file(os.environ['P4C_BIN_DIR'], 'bfas') - - bfrt_schema = find_file(os.environ['P4C_BIN_DIR'], 'bfrt_schema.py') - p4c_gen_conf = find_file(os.environ['P4C_BIN_DIR'], 'p4c-gen-conf') - self.add_command('assembler', bfas) - self.add_command('bf-rt-verifier', bfrt_schema) - self.add_command('p4c-gen-conf', p4c_gen_conf) - self._commandsEnabled.append('assembler') + # Add assembler options. 
+ if os.environ['P4C_BUILD_TYPE'] == "DEVELOPER": + bfas = find_file('.', 'bfas') + else: + bfas = find_file(os.environ['P4C_BIN_DIR'], 'bfas') + + bfrt_schema = find_file(os.environ['P4C_BIN_DIR'], 'bfrt_schema.py') + p4c_gen_conf = find_file(os.environ['P4C_BIN_DIR'], 'p4c-gen-conf') + self.add_command('assembler', bfas) + self.add_command('bf-rt-verifier', bfrt_schema) + self.add_command('p4c-gen-conf', p4c_gen_conf) + self._commandsEnabled.append('assembler') BackendDriver.process_command_line_options(self, opts) diff --git a/backends/tofino/bf-p4c/driver/p4c.tofino.cfg b/backends/tofino/bf-p4c/driver/p4c.tofino.cfg index 09fbd5bfc24..05e840437b8 100644 --- a/backends/tofino/bf-p4c/driver/p4c.tofino.cfg +++ b/backends/tofino/bf-p4c/driver/p4c.tofino.cfg @@ -27,8 +27,7 @@ class TofinoBackend(bfn.BarefootBackend): self.config_compiler("__TARGET_TOFINO__=1") def process_command_line_options(self, opts): - if opts.enable_bf_asm or os.getenv("ENABLE_BF_ASM"): - self.config_assembler("tofino") + self.config_assembler("tofino") bfn.BarefootBackend.process_command_line_options(self, opts) diff --git a/backends/tofino/bf-p4c/driver/p4c.tofino2.cfg b/backends/tofino/bf-p4c/driver/p4c.tofino2.cfg index 1a8001e26fe..f452bdea69e 100644 --- a/backends/tofino/bf-p4c/driver/p4c.tofino2.cfg +++ b/backends/tofino/bf-p4c/driver/p4c.tofino2.cfg @@ -37,8 +37,7 @@ class Tofino2Backend(bfn.BarefootBackend): self.config_compiler("__TOFINO2_VARIANT__={}".format(Tofino2Variants[target])) def process_command_line_options(self, opts): - if opts.enable_bf_asm or os.getenv("ENABLE_BF_ASM"): - self.config_assembler(self._target) + self.config_assembler(self._target) bfn.BarefootBackend.process_command_line_options(self, opts) for t in Tofino2Variants.keys(): diff --git a/backends/tofino/bf-p4c/mau/action_analysis.cpp b/backends/tofino/bf-p4c/mau/action_analysis.cpp index 3ec4dc731b3..3697c4846b9 100644 --- a/backends/tofino/bf-p4c/mau/action_analysis.cpp +++ 
b/backends/tofino/bf-p4c/mau/action_analysis.cpp @@ -2417,9 +2417,7 @@ void ActionAnalysis::check_constant_to_actiondata(ContainerAction &cont_action, // 16 and 20, the range for instruction constants is different between architectures. // For Tofino it is -8..7 but for JBay it is -4..7 int const_src_min = CONST_SRC_MAX; -#ifdef HAVE_JBAY if (Device::currentDevice() == Device::JBAY) const_src_min = JBAY_CONST_SRC_MIN; -#endif /* HAVE_JBAY */ if (cont_action.convert_instr_to_bitmasked_set || cont_action.convert_instr_to_byte_rotate_merge) { diff --git a/backends/tofino/bf-p4c/mau/asm_output.cpp b/backends/tofino/bf-p4c/mau/asm_output.cpp index 5e19bef1cdd..040a9f3ad40 100644 --- a/backends/tofino/bf-p4c/mau/asm_output.cpp +++ b/backends/tofino/bf-p4c/mau/asm_output.cpp @@ -1053,9 +1053,7 @@ void MauAsmOutput::emit_table_format(std::ostream &out, indent_t indent, fmt_state fmt; out << indent << "format: {"; int group = (ternary || gateway) ? -1 : 0; -#ifdef HAVE_JBAY if (Device::currentDevice() == Device::JBAY && gateway) group = 0; -#endif for (auto match_group : use.match_groups) { int type; diff --git a/backends/tofino/bf-p4c/mau/gateway.cpp b/backends/tofino/bf-p4c/mau/gateway.cpp index 727e570d8cb..559cba36b27 100644 --- a/backends/tofino/bf-p4c/mau/gateway.cpp +++ b/backends/tofino/bf-p4c/mau/gateway.cpp @@ -43,7 +43,6 @@ const Device::GatewaySpec &TofinoDevice::getGatewaySpec() const { }; return spec; } -#if HAVE_JBAY const Device::GatewaySpec &JBayDevice::getGatewaySpec() const { static const Device::GatewaySpec spec = { /* .PhvBytes = */ 4, @@ -59,7 +58,6 @@ const Device::GatewaySpec &JBayDevice::getGatewaySpec() const { }; return spec; } -#endif class CanonGatewayExpr::NeedNegate : public Inspector { bool rv = false; diff --git a/backends/tofino/bf-p4c/mau/stateful_alu.cpp b/backends/tofino/bf-p4c/mau/stateful_alu.cpp index bb419a004e6..e2a6ae97804 100644 --- a/backends/tofino/bf-p4c/mau/stateful_alu.cpp +++ b/backends/tofino/bf-p4c/mau/stateful_alu.cpp @@ 
-46,7 +46,6 @@ const Device::StatefulAluSpec &TofinoDevice::getStatefulAluSpec() const { return spec; } -#if HAVE_JBAY const Device::StatefulAluSpec &JBayDevice::getStatefulAluSpec() const { static const Device::StatefulAluSpec spec = { /* .CmpMask = */ true, @@ -64,7 +63,6 @@ const Device::StatefulAluSpec &JBayDevice::getStatefulAluSpec() const { /* .MaxRegfileRows = */ 4}; return spec; } -#endif /** * @brief This class detects a following pattern: @@ -2283,13 +2281,11 @@ std::map, std::vectorleft_to_place(); diff --git a/backends/tofino/bf-p4c/mau/walk_power_graph.cpp b/backends/tofino/bf-p4c/mau/walk_power_graph.cpp index 7a727149c5d..5a495c12dfe 100644 --- a/backends/tofino/bf-p4c/mau/walk_power_graph.cpp +++ b/backends/tofino/bf-p4c/mau/walk_power_graph.cpp @@ -444,10 +444,8 @@ double WalkPowerGraph::estimate_power() { always_powered_on_.clear(); if (Device::currentDevice() == Device::TOFINO) { return estimate_power_tofino(); -#if HAVE_JBAY } else if (Device::currentDevice() == Device::JBAY) { return estimate_power_non_tofino(); -#endif /* HAVE_JBAY */ } else { BUG("estimate_power -- invalid device %d", Device::currentDevice()); } diff --git a/backends/tofino/bf-p4c/midend.cpp b/backends/tofino/bf-p4c/midend.cpp index 85467f960be..56552c9a28a 100644 --- a/backends/tofino/bf-p4c/midend.cpp +++ b/backends/tofino/bf-p4c/midend.cpp @@ -328,14 +328,12 @@ bool skipFlexibleHeader(const Visitor::Context *, const IR::Type_StructLike *e) */ class CompileTimeOperations : public P4::CompileTimeOperations { bool preorder(const IR::Declaration_Instance *di) { -#ifdef HAVE_JBAY // JBay supports (limited) div/mod in RegisterAction if (Device::currentDevice() == Device::JBAY) { if (auto st = di->type->to()) { if (st->baseType->path->name.name.endsWith("Action")) return false; } } -#endif return true; } }; diff --git a/backends/tofino/bf-p4c/midend/parser_enforce_depth_req.cpp b/backends/tofino/bf-p4c/midend/parser_enforce_depth_req.cpp index 5e9a7922c27..203bd3edd66 100644 --- 
a/backends/tofino/bf-p4c/midend/parser_enforce_depth_req.cpp +++ b/backends/tofino/bf-p4c/midend/parser_enforce_depth_req.cpp @@ -507,9 +507,7 @@ class AddParserPad : public Modifier { // Tofino1-like architectures std::set tofArch = { "tna"_cs, -#if HAVE_JBAY "t2na"_cs, -#endif /* HAVE_JBAY */ }; /** diff --git a/backends/tofino/bf-p4c/phv/phv_fields.cpp b/backends/tofino/bf-p4c/phv/phv_fields.cpp index 9d2f42115e3..45a616126fe 100644 --- a/backends/tofino/bf-p4c/phv/phv_fields.cpp +++ b/backends/tofino/bf-p4c/phv/phv_fields.cpp @@ -1568,9 +1568,9 @@ struct ComputeFieldAlignments : public Inspector { // For non-set instructions accessing an AttachedOutput if ((instr->operands.size() == 3) && (instr->name != "set")) { - int op_id = 0; + bool first = true; for (auto op_f : instr->operands) { - if (!op_id) { + if (first) { // Keep destination field that may need alignment setting dst_f = phv.field(op_f); } else { @@ -1592,7 +1592,7 @@ struct ComputeFieldAlignments : public Inspector { } } } - op_id++; + first = false; } } return false; diff --git a/backends/tofino/cmake/spdlog.cmake b/backends/tofino/cmake/spdlog.cmake index c4d8fdce85e..361da20dd29 100644 --- a/backends/tofino/cmake/spdlog.cmake +++ b/backends/tofino/cmake/spdlog.cmake @@ -1,21 +1,33 @@ message(STATUS "Fetching spdlog") -include(FetchContent) - # Preserve previous FETCHCONTENT_QUIET setting set(FETCHCONTENT_QUIET_PREV ${FETCHCONTENT_QUIET}) set(FETCHCONTENT_QUIET OFF) +set(SPDLOG_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/spdlog) + +# Check if the source directory exists. +if(EXISTS ${SPDLOG_SOURCE_DIR}/CMakeLists.txt) + # If it exists but wasn't built before, manually add it. + set(FETCHCONTENT_SOURCE_DIR_SPDLOG ${SPDLOG_SOURCE_DIR}) + # Avoid fetching again. 
+ set(FETCHCONTENT_UPDATES_DISCONNECTED_SPDLOG ON) +endif() + FetchContent_Declare( spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git GIT_TAG v1.8.3 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/spdlog + SOURCE_DIR ${SPDLOG_SOURCE_DIR} USES_TERMINAL_DOWNLOAD TRUE GIT_PROGRESS TRUE ) -FetchContent_MakeAvailable(spdlog) +FetchContent_GetProperties(spdlog) +if(NOT spdlog_POPULATED) + FetchContent_Populate(spdlog) + add_subdirectory(${SPDLOG_SOURCE_DIR} ${CMAKE_BINARY_DIR}/spdlog) +endif() # Restore FETCHCONTENT_QUIET setting set(FETCHCONTENT_QUIET ${FETCHCONTENT_QUIET_PREV}) diff --git a/backends/tofino/compiler_interfaces/schemas/mau_schema.py b/backends/tofino/compiler_interfaces/schemas/mau_schema.py index 72a64ef051d..39b0a53dfa8 100644 --- a/backends/tofino/compiler_interfaces/schemas/mau_schema.py +++ b/backends/tofino/compiler_interfaces/schemas/mau_schema.py @@ -233,7 +233,6 @@ class StageMatchMemoryDetails(StageMemoryDetailsWithEntryWidthAndIdeal): class MatchTables(jsl.Document): - class StageDetails(jsl.Document): title = "StageDetails" description = "Information about packing and resource usage on a per-stage basis." diff --git a/backends/tofino/compiler_interfaces/schemas/power_schema.py b/backends/tofino/compiler_interfaces/schemas/power_schema.py index 04e5cddc7e9..386c6da7242 100644 --- a/backends/tofino/compiler_interfaces/schemas/power_schema.py +++ b/backends/tofino/compiler_interfaces/schemas/power_schema.py @@ -111,7 +111,6 @@ class Features(jsl.Document): class MatchTables(jsl.Document): - class StageDetails(jsl.Document): title = "StageDetails" description = "Information about table power usage on a per-stage basis." 
diff --git a/backends/tofino/compiler_interfaces/tools/create_mau_characterize.py b/backends/tofino/compiler_interfaces/tools/create_mau_characterize.py index e2d117eaf3e..ed276579d82 100755 --- a/backends/tofino/compiler_interfaces/tools/create_mau_characterize.py +++ b/backends/tofino/compiler_interfaces/tools/create_mau_characterize.py @@ -766,9 +766,12 @@ def log_match_and_action_formats(all_match_and_action_formats): for table_name, stage in keys: if table_name not in tbl_to_info: tbl_to_info[table_name] = OrderedDict() - match_format_json, actual_match_entries, action_formats_json, actual_action_entries = ( - all_match_and_action_formats[(table_name, stage)] - ) + ( + match_format_json, + actual_match_entries, + action_formats_json, + actual_action_entries, + ) = all_match_and_action_formats[(table_name, stage)] tbl_to_info[table_name][stage] = ( match_format_json, actual_match_entries, @@ -789,9 +792,12 @@ def log_match_and_action_formats(all_match_and_action_formats): all_match_formats = [] all_action_formats = [] for stage in tbl_to_info[table_name]: - match_format_json, actual_match_entries, action_formats_json, actual_action_entries = ( - tbl_to_info[table_name][stage] - ) + ( + match_format_json, + actual_match_entries, + action_formats_json, + actual_action_entries, + ) = tbl_to_info[table_name][stage] all_match_formats.append((stage, match_format_json, actual_match_entries)) all_action_formats.append((stage, action_formats_json, actual_action_entries)) @@ -844,9 +850,12 @@ def produce_mau_characterize(source, output): log.info("%s\n" % box) # Populate table summary information - table_info, sram_summary, all_overhead_structures, all_match_and_action_formats = ( - _parse_mau_json(context) - ) + ( + table_info, + sram_summary, + all_overhead_structures, + all_match_and_action_formats, + ) = _parse_mau_json(context) # Output summary table in log file diff --git a/backends/tofino/compiler_interfaces/tools/create_mau_json.py 
b/backends/tofino/compiler_interfaces/tools/create_mau_json.py index d121e48f011..6874e88f50d 100755 --- a/backends/tofino/compiler_interfaces/tools/create_mau_json.py +++ b/backends/tofino/compiler_interfaces/tools/create_mau_json.py @@ -579,10 +579,11 @@ def get_match_memory(match_stage_table, match_table, context, entries_so_far): mem_elem["entry_bit_width_requested"] = ideal_entry_bits mem_elem["entry_bit_width_allocated"] = allocated_match_bits - mem_elem["ideal_entries_per_table_word"], mem_elem["ideal_table_word_bit_width"] = ( - get_ideal_match_entries( - match_stage_table, match_table, mem_elem["imm_bit_width_in_overhead_requested"] - ) + ( + mem_elem["ideal_entries_per_table_word"], + mem_elem["ideal_table_word_bit_width"], + ) = get_ideal_match_entries( + match_stage_table, match_table, mem_elem["imm_bit_width_in_overhead_requested"] ) memories.append(mem_elem) diff --git a/ir/json_loader.h b/ir/json_loader.h index 335a256630a..34fa571185c 100644 --- a/ir/json_loader.h +++ b/ir/json_loader.h @@ -92,11 +92,17 @@ class JSONLoader { const IR::Node *get_node() { if (!json || !json->is()) return nullptr; // invalid json exception? int id; - load("Node_ID", id); + auto success = load("Node_ID", id); + if (!success) { + return nullptr; + } if (id >= 0) { if (node_refs.find(id) == node_refs.end()) { cstring type; - load("Node_Type", type); + auto success = load("Node_Type", type); + if (!success) { + return nullptr; + } if (auto fn = get(IR::unpacker_table, type)) { node_refs[id] = fn(*this); // Creating JsonObject from source_info read from jsonFile