Mlp example #172

Draft · wants to merge 32 commits into main

Commits (32)
2e28da6
Updated Docker to main for local build
Apr 24, 2024
08c5fda
Sync scripts for updated docker setups (separating CPU and GPU contai…
Apr 24, 2024
8537c2c
removed docker as submodule
Apr 24, 2024
c2e47dc
Pull from Makefile to avoid repeated sync on submodule
Apr 24, 2024
b3081ba
Added instructions to log for better readability on CI log
Apr 24, 2024
9fd3d67
Initially separate dataflow sv out of top
Apr 24, 2024
cf6f26c
Split two files initially
Apr 24, 2024
3bfe064
Refactored dataflow level
Apr 24, 2024
caad0b1
Added missing dependencies and updated file paths
Apr 24, 2024
5b8990f
Refactored memory map emit
Apr 24, 2024
0509014
Added rounding at the output of relu
Apr 24, 2024
7aaba88
Merge branch 'main' into mlp_example
Apr 24, 2024
181b537
Fixed relu var name typos
Apr 25, 2024
f93c976
Updated linear layer with the right parameter names
Apr 25, 2024
0a8b4dd
Remove temporary code for parallelism
Apr 25, 2024
35afa85
Added missing component interface
Apr 25, 2024
8ef71cf
removed extra comma in parameter map
Apr 25, 2024
9210756
Updated parallelism parameter formats
Apr 26, 2024
3c272a7
Updated bram width calculation with the latest parallelism parameters
Apr 26, 2024
29faf71
Fix quantization metadata for fixed-point quantization (#173)
ChengZhang-98 Apr 26, 2024
50816af
reverted changes made by the quantization PR
Apr 26, 2024
c1d1fd7
reduced test case
Apr 26, 2024
8535c57
Fetched previous verilog analysis pass for testing
Apr 28, 2024
590f20e
Fetched the latest version of draft
Apr 28, 2024
8076bc1
Pass syntax error in Python
Apr 28, 2024
5d26837
refactored test pass format
Apr 28, 2024
cfb256e
format quantize PR https://github.com/DeepWok/mase/pull/173
Apr 28, 2024
d825cbe
Added bit truncation in bram param to avoid verilator warnings
Apr 28, 2024
3a23cb2
Fixed bitwidth error (this is temporary for the version of casting in…
Apr 28, 2024
80c671f
Fixed minor parallelism parameter shapes
Apr 28, 2024
8b5309c
Get working flow for hardware testing - but need to check hardware re…
Apr 28, 2024
047f27b
Mlp quantization error (#178)
Apr 29, 2024
22 changes: 11 additions & 11 deletions machop/chop/models/manual/bert_quantized/quant_config_bert.py
@@ -6,7 +6,7 @@
import toml
from chop.tools.config_load import convert_str_na_to_none

-from ..quant_utils import parse_node_config
+from ..quant_utils import parse_node_q_config


logger = logging.getLogger(__name__)
@@ -68,20 +68,20 @@ def create_a_layer_config(
# fmt: off
qc = {
"attention": {
"query": deepcopy(parse_node_config(layer_qc.get("attention", {}).get("query", linear_qc), "linear")),
"key": deepcopy(parse_node_config(layer_qc.get("attention", {}).get("key", linear_qc), "linear")),
"value": deepcopy(parse_node_config(layer_qc.get("attention", {}).get("value", linear_qc), "linear")),
"matmul_0": deepcopy(parse_node_config(layer_qc.get("attention", {}).get("matmul_0", matmul_qc), "matmul")),
"matmul_1": deepcopy(parse_node_config(layer_qc.get("attention", {}).get("matmul_1", matmul_qc), "matmul")),
"query": deepcopy(parse_node_q_config(layer_qc.get("attention", {}).get("query", linear_qc), "linear")),
"key": deepcopy(parse_node_q_config(layer_qc.get("attention", {}).get("key", linear_qc), "linear")),
"value": deepcopy(parse_node_q_config(layer_qc.get("attention", {}).get("value", linear_qc), "linear")),
"matmul_0": deepcopy(parse_node_q_config(layer_qc.get("attention", {}).get("matmul_0", matmul_qc), "matmul")),
"matmul_1": deepcopy(parse_node_q_config(layer_qc.get("attention", {}).get("matmul_1", matmul_qc), "matmul")),
"output": {
"dense": deepcopy(parse_node_config(layer_qc.get("attention", {}).get("output", {}).get("dense", linear_qc), "linear")),
"dense": deepcopy(parse_node_q_config(layer_qc.get("attention", {}).get("output", {}).get("dense", linear_qc), "linear")),
},
},
"intermediate": {
"dense": deepcopy(parse_node_config(layer_qc.get("intermediate", {}).get("dense", linear_qc), "linear")),
"dense": deepcopy(parse_node_q_config(layer_qc.get("intermediate", {}).get("dense", linear_qc), "linear")),
},
"output": {
"dense": deepcopy(parse_node_config(layer_qc.get("output", {}).get("dense", linear_qc), "linear")),
"dense": deepcopy(parse_node_q_config(layer_qc.get("output", {}).get("dense", linear_qc), "linear")),
},
}
# fmt: on
@@ -94,10 +94,10 @@ def _parse_and_complete_config(
) -> dict:
assert "default" in config, "Must provide a default config"
default_qc: dict = config["default"]
-    linear_qc: dict = parse_node_config(
+    linear_qc: dict = parse_node_q_config(
config.get("linear", default_qc), mase_op="linear"
)
-    matmul_qc: dict = parse_node_config(
+    matmul_qc: dict = parse_node_q_config(
config.get("matmul", default_qc), mase_op="matmul"
)
general_layer_qc: dict = config.get("model_layer", None)
28 changes: 14 additions & 14 deletions machop/chop/models/manual/llama_quantized/quant_config_llama.py
@@ -6,7 +6,7 @@
import toml
from chop.tools.config_load import convert_str_na_to_none

-from ..quant_utils import parse_node_config
+from ..quant_utils import parse_node_q_config


"""
@@ -48,18 +48,18 @@ def create_a_layer_config(
# fmt: off
qc = {
"self_attn": {
"q_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("q_proj", linear_qc), "linear")),
"k_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("k_proj", linear_qc), "linear")),
"v_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("v_proj", linear_qc), "linear")),
"o_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("o_proj", linear_qc), "linear")),
"rotary_positional_encoding": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("rotary_positional_encoding", rotary_positional_encoding_qc), "rotary_positional_encoding")),
"matmul_0": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("matmul_0", matmul_qc), "matmul")),
"matmul_1": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("matmul_1", matmul_qc), "matmul")),
"q_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("q_proj", linear_qc), "linear")),
"k_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("k_proj", linear_qc), "linear")),
"v_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("v_proj", linear_qc), "linear")),
"o_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("o_proj", linear_qc), "linear")),
"rotary_positional_encoding": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("rotary_positional_encoding", rotary_positional_encoding_qc), "rotary_positional_encoding")),
"matmul_0": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("matmul_0", matmul_qc), "matmul")),
"matmul_1": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("matmul_1", matmul_qc), "matmul")),
},
"mlp": {
"gate_proj": deepcopy(parse_node_config(layer_qc.get("mlp", {}).get("gate_proj", linear_qc), "linear")),
"down_proj": deepcopy(parse_node_config(layer_qc.get("mlp", {}).get("down_proj", linear_qc), "linear")),
"up_proj": deepcopy(parse_node_config(layer_qc.get("mlp", {}).get("up_proj", linear_qc), "linear"))
"gate_proj": deepcopy(parse_node_q_config(layer_qc.get("mlp", {}).get("gate_proj", linear_qc), "linear")),
"down_proj": deepcopy(parse_node_q_config(layer_qc.get("mlp", {}).get("down_proj", linear_qc), "linear")),
"up_proj": deepcopy(parse_node_q_config(layer_qc.get("mlp", {}).get("up_proj", linear_qc), "linear"))
},
}
# fmt: on
@@ -69,14 +69,14 @@ def create_a_layer_config(
def _parse_and_complete_config(config: dict, num_hidden_layers: int) -> dict:
assert "default" in config, "Must provide default config for by_name_parser"
default_qc: dict = config["default"]
-    linear_qc: dict = parse_node_config(
+    linear_qc: dict = parse_node_q_config(
config.get("linear", default_qc), mase_op="linear"
)
-    rotary_positional_encoding_qc: dict = parse_node_config(
+    rotary_positional_encoding_qc: dict = parse_node_q_config(
config.get("rotary_positional_encoding", default_qc),
mase_op="rotary_positional_encoding",
)
-    matmul_qc: dict = parse_node_config(
+    matmul_qc: dict = parse_node_q_config(
config.get("matmul", default_qc), mase_op="matmul"
)
general_layer_qc: dict = config.get("model_layer", None)
24 changes: 12 additions & 12 deletions machop/chop/models/manual/opt_quantized/quant_config_opt.py
@@ -5,9 +5,9 @@
import toml

from ....tools.config_load import convert_str_na_to_none
-from ....passes.graph import parse_node_config
+from ....passes.graph import parse_node_q_config

-from chop.passes.graph.transforms.quantize.quant_parsers import parse_quant_config
+from chop.passes.graph.transforms.quantize.quant_parsers import parse_node_q_config

"""
An example of quant_config for opt
@@ -43,15 +43,15 @@ def create_a_layer_config(
# fmt: off
qc = {
"self_attn": {
"q_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("q_proj", linear_qc), "linear")),
"k_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("k_proj", linear_qc), "linear")),
"v_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("v_proj", linear_qc), "linear")),
"out_proj": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("out_proj", linear_qc), "linear")),
"bmm_0": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("bmm_0", bmm_qc), "matmul")),
"bmm_1": deepcopy(parse_node_config(layer_qc.get("self_attn", {}).get("bmm_1", bmm_qc), "matmul")),
"q_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("q_proj", linear_qc), "linear")),
"k_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("k_proj", linear_qc), "linear")),
"v_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("v_proj", linear_qc), "linear")),
"out_proj": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("out_proj", linear_qc), "linear")),
"bmm_0": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("bmm_0", bmm_qc), "matmul")),
"bmm_1": deepcopy(parse_node_q_config(layer_qc.get("self_attn", {}).get("bmm_1", bmm_qc), "matmul")),
},
"fc1": deepcopy(parse_node_config(layer_qc.get("fc1", linear_qc), "linear")),
"fc2": deepcopy(parse_node_config(layer_qc.get("fc2", linear_qc), "linear")),
"fc1": deepcopy(parse_node_q_config(layer_qc.get("fc1", linear_qc), "linear")),
"fc2": deepcopy(parse_node_q_config(layer_qc.get("fc2", linear_qc), "linear")),
}
# fmt: on
return qc
@@ -60,10 +60,10 @@ def _parse_and_complete_config(config: dict, num_hidden_layers: int) -> dict:
def _parse_and_complete_config(config: dict, num_hidden_layers: int) -> dict:
assert "default" in config, "Must provide default config for by_name_parser"
default_qc: dict = config["default"]
-    linear_qc: dict = parse_node_config(
+    linear_qc: dict = parse_node_q_config(
config.get("linear", default_qc), mase_op="linear"
)
-    bmm_qc: dict = parse_node_config(config.get("bmm", default_qc), mase_op="matmul")
+    bmm_qc: dict = parse_node_q_config(config.get("bmm", default_qc), mase_op="matmul")
general_layer_qc: dict = config.get("model_layer", None)

# parsed config
4 changes: 2 additions & 2 deletions machop/chop/models/manual/quant_utils.py
@@ -1,6 +1,6 @@
from typing import Callable

-from chop.passes.graph import parse_node_config
+from chop.passes.graph import parse_node_q_config
from chop.passes.graph import quantized_func_map
from chop.passes.graph import quantized_module_map

@@ -16,4 +16,4 @@ def get_quantized_func(mase_op: str, quant_config: dict) -> Callable:


def parse_op_quant_config(mase_op: str, config: dict) -> dict:
-    return parse_node_config(config=config, mase_op=mase_op)
+    return parse_node_q_config(config=config, mase_op=mase_op)
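The rename is mechanical: every call site above swaps parse_node_config for parse_node_q_config with an unchanged signature of (config, mase_op). A minimal sketch of the resulting call pattern; the config fields here are illustrative placeholders, not taken from this PR:

from chop.passes.graph.transforms.quantize.quant_parsers import parse_node_q_config

# Illustrative fixed-point entry; real configs come from the TOML files
# referenced in these modules.
linear_cfg = {"name": "integer", "data_in_width": 8, "data_in_frac_width": 4}

# Same signature as the old parse_node_config: (config, mase_op).
parsed = parse_node_q_config(config=linear_cfg, mase_op="linear")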
1 change: 1 addition & 0 deletions machop/chop/passes/__init__.py
@@ -15,6 +15,7 @@
verify_common_metadata_analysis_pass,
run_cosim_analysis_pass,
get_synthesis_results,
+    test_verilog_analysis_pass,
)
from .graph.transforms import (
prune_transform_pass,
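The newly exported test_verilog_analysis_pass presumably follows the same calling convention as the other passes in this module (take a MaseGraph plus pass_args, return the graph and an info dict, as report_node_meta_param_analysis_pass does below). A hedged usage sketch, with the pass_args contents assumed:

from chop.passes import test_verilog_analysis_pass

# mg is a MaseGraph that has already been through the emit-verilog flow.
mg, _ = test_verilog_analysis_pass(mg, pass_args={})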
2 changes: 1 addition & 1 deletion machop/chop/passes/graph/__init__.py
@@ -37,7 +37,7 @@
)

from .transforms.quantize import quantized_func_map, quantized_module_map
-from .transforms.quantize.quant_parsers import parse_node_config
+from .transforms.quantize.quant_parsers import parse_node_q_config

ANALYSIS_PASSES = [
"init_metadata",
@@ -45,6 +45,20 @@ def add_component_source(node):
node.meta["mase"]["hardware"]["dependence_files"] = []

node.meta["mase"]["hardware"]["device_id"] = -1
+    # Init data parallelism to 1 and use DSE pass for exploration
+    node.meta["mase"]["hardware"]["parallelism"] = {}
+    args = node.meta["mase"]["common"]["args"]
+    for arg, arg_info in args.items():
+        if isinstance(arg_info, dict):
+            node.meta["mase"]["hardware"]["parallelism"][arg] = [
+                1 for _ in range(len(arg_info["shape"]))
+            ]
+
+    results = node.meta["mase"]["common"]["results"]
+    for result, result_info in results.items():
+        node.meta["mase"]["hardware"]["parallelism"][result] = [
+            1 for _ in range(len(result_info["shape"]))
+        ]

# Current only support on-chip parameters
args = node.meta["mase"]["common"]["args"]
@@ -81,17 +95,17 @@ def add_verilog_param(node):
else 1
)
# If node data parallelism is set, take from hardware metadata
if node.meta["mase"]["hardware"]["parallelism"] is not None:
vp[_cap(arg + f"_parallelism_dim_{dim}")] = node.meta["mase"][
"hardware"
]["parallelism"][len(arg_info["shape"]) - 1 - dim]
# Otherwise, assign to tensor size by default
else:
vp[_cap(arg + f"_parallelism_dim_{dim}")] = (
arg_info["shape"][len(arg_info["shape"]) - 1 - dim]
if dim < len(arg_info["shape"])
else 1
)
assert node.meta["mase"]["hardware"]["parallelism"][arg] is not None
vp[_cap(arg + f"_parallelism_dim_{dim}")] = node.meta["mase"][
"hardware"
]["parallelism"][arg][len(arg_info["shape"]) - 1 - dim]
# # Otherwise, assign to tensor size by default
# else:
# vp[_cap(arg + f"_parallelism_dim_{dim}")] = (
# arg_info["shape"][len(arg_info["shape"]) - 1 - dim]
# if dim < len(arg_info["shape"])
# else 1
# )
elif type(arg_info) == bool:
vp[_cap(arg)] = 1 if arg_info else 0
else:
@@ -107,16 +121,16 @@ def add_verilog_param(node):
if dim < len(result_info["shape"])
else 1
)
if node.meta["mase"]["hardware"]["parallelism"] is not None:
vp[_cap(result + f"_parallelism_dim_{dim}")] = node.meta["mase"][
"hardware"
]["parallelism"][len(result_info["shape"]) - 1 - dim]
else:
vp[_cap(result + f"_parallelism_dim_{dim}")] = (
result_info["shape"][len(result_info["shape"]) - 1 - dim]
if dim < len(result_info["shape"])
else 1
)
assert node.meta["mase"]["hardware"]["parallelism"] is not None
vp[_cap(result + f"_parallelism_dim_{dim}")] = node.meta["mase"][
"hardware"
]["parallelism"][result][len(result_info["shape"]) - 1 - dim]
# else:
# vp[_cap(result + f"_parallelism_dim_{dim}")] = (
# result_info["shape"][len(result_info["shape"]) - 1 - dim]
# if dim < len(result_info["shape"])
# else 1
# )
else:
vp[_cap(result)] = result_info

@@ -369,11 +383,6 @@ def add_hardware_metadata_analysis_pass(graph, pass_args=None):
for node in graph.nodes:
add_component_source(node)

-    # Temporary: fix parallelism to small value to enable verilator simulation
-    for node in graph.nodes:
-        # Batch parallelism set to 1, data parallelism to 4
-        node.meta["mase"]["hardware"]["parallelism"] = [1, 4]

# Add hardware parameters
for node in graph.nodes:
add_verilog_param(node)
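Taken together, the hunks above move parallelism from a single per-node list to a per-argument/per-result dict, initialized to all ones and indexed from the innermost dimension outwards when Verilog parameters are emitted. A small illustration of the indexing; the names and shapes are hypothetical:

# Suppose a node argument data_in_0 has shape [4, 16] and a DSE pass later
# sets its parallelism to [1, 4] (the defaults above would be [1, 1]).
shape = [4, 16]
parallelism = [1, 4]

# add_verilog_param maps dim 0 to the LAST shape entry, mirroring
# parallelism[len(shape) - 1 - dim] in the code above.
for dim in range(len(shape)):
    print(f"DATA_IN_0_PARALLELISM_DIM_{dim} =", parallelism[len(shape) - 1 - dim])
# DATA_IN_0_PARALLELISM_DIM_0 = 4
# DATA_IN_0_PARALLELISM_DIM_1 = 1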
6 changes: 5 additions & 1 deletion machop/chop/passes/graph/analysis/report/report_node.py
@@ -5,6 +5,8 @@
import copy


+import torch

logger = logging.getLogger(__name__)


@@ -118,7 +120,7 @@ def report_node_hardware_type_analysis_pass(graph, pass_args: dict = {}):
return graph, {}


-def report_node_meta_param_analysis_pass(graph, pass_args: dict = None):
+def report_node_meta_param_analysis_pass(graph, pass_args: dict = {}):
"""
Perform meta parameter analysis on the nodes in the graph and generate a report.

@@ -131,6 +133,7 @@ def report_node_meta_param_analysis_pass(graph, pass_args: dict = {}):
:return: The analyzed graph and an empty dictionary.
:rtype: tuple(MaseGraph, dict)
"""
+    torch.set_printoptions(threshold=20)
which_param = pass_args.get("which", ("all",))
assert isinstance(which_param, (list, tuple))
for param in which_param:
@@ -184,4 +187,5 @@
with open(Path(save_path), "w") as f:
f.write(table_txt)
logger.info(f"Node meta param table is saved to {save_path}")
+    torch.set_printoptions(threshold=1000)
return graph, {}
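A hedged usage sketch of the updated pass: the "which" key is read in the code above, while "save_path" is an assumption inferred from the save logic at the end of the function:

from chop.passes.graph.analysis.report.report_node import (
    report_node_meta_param_analysis_pass,
)

# pass_args now defaults to {} rather than None, so calling with no extra
# arguments is safe.
graph, _ = report_node_meta_param_analysis_pass(
    graph, pass_args={"which": ("hardware",), "save_path": "node_meta.txt"}
)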
80 changes: 80 additions & 0 deletions machop/chop/passes/graph/analysis/verilog/cocotb.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python3

import os, logging

from mase_cocotb.random_test import check_results
from mase_cocotb.runner import mase_runner

import cocotb
from cocotb.triggers import Timer
from cocotb.triggers import FallingEdge
from cocotb.clock import Clock

logger = logging.getLogger(__name__)
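
# NOTE: debug_state is called in the test below but not defined in this
# diff; a minimal stand-in consistent with its call sites (an assumption,
# not part of the PR) would be:
def debug_state(dut, state):
    # Log the handshake signal values at a named point in the clock cycle.
    logger.debug(
        "%s: in_valid=%s in_ready=%s out_valid=%s out_ready=%s",
        state,
        dut.data_in_0_valid.value,
        dut.data_in_0_ready.value,
        dut.data_out_0_valid.value,
        dut.data_out_0_ready.value,
    )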


# DUT test specifications
class VerificationCase:
Review comment (Collaborator): verification classes should inherit from mase_cocotb.testbench.Testbench

def __init__(self, iterations=1, samples=10):
self.samples = samples
self.iterations = iterations


@cocotb.test()
async def test_top(dut):
Review comment (Collaborator): This functionality is already implemented in chop.passes.graph.transforms.verilog.emit_tb

"""Test top-level model hardware design"""
samples = 1000
test_case = VerificationCase(samples=samples)

# Reset cycle
await Timer(20, units="ns")
dut.rst.value = 1
await Timer(100, units="ns")
dut.rst.value = 0

# Create a 10ns-period clock on port clk
clock = Clock(dut.clk, 10, units="ns")
# Start the clock
cocotb.start_soon(clock.start())
await Timer(500, units="ns")

# Synchronize with the clock
dut.data_in_0_valid.value = 0
dut.data_out_0_ready.value = 1
debug_state(dut, "Pre-clk")
await FallingEdge(dut.clk)
Review comment (Collaborator): Driving handshakes manually by assigning signals and awaiting clock edges is too verbose and error-prone. We should use mase_cocotb.interfaces.streaming.StreamDriver instead

debug_state(dut, "Post-clk")
debug_state(dut, "Pre-clk")
await FallingEdge(dut.clk)
debug_state(dut, "Post-clk")

done = False
# Set a timeout to avoid deadlock
for i in range(samples * 100):
await FallingEdge(dut.clk)
debug_state(dut, "Post-clk")
dut.data_in_0_valid.value = test_case.data_in.pre_compute()
await Timer(1, units="ns")
dut.data_out_0_ready.value = test_case.outputs.pre_compute(
dut.data_out_0_valid.value
)
await Timer(1, units="ns")
debug_state(dut, "Post-clk")

dut.data_in_0_valid.value, dut.data_in_0.value = test_case.data_in.compute(
dut.data_in_0_ready.value
)
await Timer(1, units="ns")
dut.data_out_0_ready.value = test_case.outputs.compute(
dut.data_out_0_valid.value, dut.data_out_0.value
)
debug_state(dut, "Pre-clk")

if test_case.data_in.is_empty() and test_case.outputs.is_full():
done = True
break
    assert (
        done
    ), "Deadlock detected or the simulation reached the maximum cycle limit (fix it by adjusting the loop trip count)"

check_results(test_case.outputs.data, test_case.ref)
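
To make the reviewers' suggestions concrete: a rough sketch of the same test restructured around mase_cocotb.testbench.Testbench and the streaming interfaces. The constructor signatures follow other mase testbenches, but reset, load_driver, and load_monitor are assumed helpers; check the actual mase_cocotb API before relying on them.

import cocotb
from cocotb.triggers import Timer
from mase_cocotb.testbench import Testbench
from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor


class TopTB(Testbench):
    def __init__(self, dut):
        super().__init__(dut, dut.clk, dut.rst)
        # The driver and monitor own the valid/ready handshake, so the test
        # body no longer toggles signals around clock edges by hand.
        self.data_in_driver = StreamDriver(
            dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready
        )
        self.data_out_monitor = StreamMonitor(
            dut.clk, dut.data_out_0, dut.data_out_0_valid, dut.data_out_0_ready
        )


@cocotb.test()
async def test_top(dut):
    input_beats = [[1, 2, 3, 4]]     # placeholder stimulus
    expected_beats = [[0, 0, 0, 0]]  # placeholder expected outputs

    tb = TopTB(dut)
    await tb.reset()                                   # assumed Testbench helper
    tb.data_in_driver.load_driver(input_beats)         # assumed API
    tb.data_out_monitor.load_monitor(expected_beats)   # assumed API
    await Timer(1, units="us")  # crude wait; real code would sync on the monitor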