Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OPEN: Add wmem #8

Merged
merged 7 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- support 32bit scale
- cmake support
- const qualifier to `<acc>_dev_t` function arguments
- support for N-EUREKA's dedicated weight memory

### Changed

Expand Down
29 changes: 15 additions & 14 deletions ne16/hal/ne16_task.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,32 +102,33 @@ void ne16_task_set_weight_offset(ne16_task_t *task,
task->data.cfg.weight_offset_factor = weight_offset;
}

/** ne16_pad_ptr
/** ne16_pad_addr
*
 * Calculate the address of the start of the data as if
 * it were the start of the padded data.
 * Necessary for the input address when the input is padded.
*/
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
const uint8_t padding_top, const uint8_t padding_left) {
uint32_t ne16_pad_addr(uint32_t ptr, const uint32_t width,
uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left) {
return ptr - (padding_top * width + padding_left) * width_stride;
}

void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
void ne16_task_set_addr_conv(ne16_task_t *task, uint32_t input_addr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr) {
task->data.infeat_ptr =
ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
uint32_t output_addr, uint32_t weights_addr) {
task->data.infeat_addr =
ne16_pad_addr(input_addr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_addr = output_addr;
task->data.weights_addr = weights_addr;
}

void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.scale_ptr = scale_ptr;
task->data.scale_shift_ptr = shift_ptr;
task->data.scale_bias_ptr = bias_ptr;
void ne16_task_set_addr_norm_quant(ne16_task_t *task, uint32_t scale_addr,
uint32_t shift_addr, uint32_t bias_addr) {
task->data.scale_addr = scale_addr;
task->data.scale_shift_addr = shift_addr;
task->data.scale_bias_addr = bias_addr;
}

void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
Expand Down
26 changes: 13 additions & 13 deletions ne16/hal/ne16_task.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ typedef struct ne16_cfg_t {
} ne16_cfg_t;

typedef struct ne16_task_data_t {
uint32_t weights_ptr;
uint32_t infeat_ptr;
uint32_t outfeat_ptr;
uint32_t scale_ptr;
uint32_t scale_shift_ptr;
uint32_t scale_bias_ptr;
uint32_t weights_addr;
uint32_t infeat_addr;
uint32_t outfeat_addr;
uint32_t scale_addr;
uint32_t scale_shift_addr;
uint32_t scale_bias_addr;
ne16_cfg_t cfg;
} ne16_task_data_t;

Expand All @@ -130,15 +130,15 @@ void ne16_task_set_weight_offset(ne16_task_t *task,
uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t i_width, uint32_t n_height,
uint32_t n_width);
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
uint32_t ne16_pad_addr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void ne16_task_set_addr_conv(ne16_task_t *task, uint32_t input_addr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr);
void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
uint32_t output_addr, uint32_t weights_addr);
void ne16_task_set_addr_norm_quant(ne16_task_t *task, uint32_t scale_addr,
uint32_t shift_addr, uint32_t bias_addr);
/** ne16_task_set_strides
*
* All the strides variables are strides between elements alongside that
Expand Down
1 change: 1 addition & 0 deletions neureka/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ Github repo [link](https://github.com/siracusa-soc/ne).
- [ ] Weight type
- [x] int8
- [ ] int2-7
- [x] Dedicated weight memory
39 changes: 22 additions & 17 deletions neureka/hal/neureka_task.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,33 +114,38 @@ void neureka_task_set_weight_source(neureka_task_t *task,
task->data.cfg.conf0 |= weight_source;
}

/** neureka_pad_ptr
/** neureka_pad_addr
*
 * Calculate the address of the start of the data as if
 * it were the start of the padded data.
 * Necessary for the input address when the input is padded.
*/
uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left) {
uint32_t neureka_pad_addr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride,
const uint8_t padding_top,
const uint8_t padding_left) {
return ptr - (padding_top * width + padding_left) * width_stride;
}

void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
void neureka_task_set_addr_conv(neureka_task_t *task, uint32_t input_addr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr) {
task->data.infeat_ptr =
neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
}

void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.scale_ptr = scale_ptr;
task->data.scale_shift_ptr = shift_ptr;
task->data.scale_bias_ptr = bias_ptr;
uint32_t output_addr, uint32_t weights_addr) {
task->data.infeat_addr = neureka_pad_addr(input_addr, w_in, w_in_stride,
padding_top, padding_left);
task->data.outfeat_addr = output_addr;
if ((task->data.cfg.conf0 & NEUREKA_MASK_FLAG_WEIGHT_SOURCE) ==
NEUREKA_FLAG_WEIGHT_SOURCE_WMEM) {
weights_addr -= 0x10400000;
}
task->data.weights_addr = weights_addr;
}

void neureka_task_set_addr_norm_quant(neureka_task_t *task, uint32_t scale_addr,
uint32_t shift_addr, uint32_t bias_addr) {
task->data.scale_addr = scale_addr;
task->data.scale_shift_addr = shift_addr;
task->data.scale_bias_addr = bias_addr;
}

void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
Expand Down
27 changes: 14 additions & 13 deletions neureka/hal/neureka_task.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@ typedef struct neureka_cfg_t {
} neureka_cfg_t;

typedef struct neureka_task_data_t {
uint32_t weights_ptr;
uint32_t infeat_ptr;
uint32_t outfeat_ptr;
uint32_t scale_ptr;
uint32_t scale_shift_ptr;
uint32_t scale_bias_ptr;
uint32_t weights_addr;
uint32_t infeat_addr;
uint32_t outfeat_addr;
uint32_t scale_addr;
uint32_t scale_shift_addr;
uint32_t scale_bias_addr;
neureka_cfg_t cfg;
} neureka_task_data_t;

Expand Down Expand Up @@ -139,15 +139,16 @@ void neureka_task_set_weight_source(neureka_task_t *task,
uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t i_width, uint32_t n_height,
uint32_t n_width);
uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
uint32_t neureka_pad_addr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride,
const uint8_t padding_top,
const uint8_t padding_left);
void neureka_task_set_addr_conv(neureka_task_t *task, uint32_t input_addr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr);
void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
uint32_t output_addr, uint32_t weights_addr);
void neureka_task_set_addr_norm_quant(neureka_task_t *task, uint32_t scale_addr,
uint32_t shift_addr, uint32_t bias_addr);
/** neureka_task_set_strides
*
* All the strides variables are strides between elements alongside that
Expand Down
18 changes: 9 additions & 9 deletions src/pulp_nnx_ne16.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,11 @@ void ne16_nnx_resolve_wait(const ne16_dev_t *dev, ne16_task_t *task) {
}
}

static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i,
uint32_t size_j, uint32_t size_k,
uint32_t stride_j, uint32_t stride_k,
uint32_t overlap_i, uint32_t overlap_j,
uint32_t offset_i, uint32_t offset_j) {
static inline uint32_t _get_tile_addr(uint32_t ptr, int i, int j, int size_i,
uint32_t size_j, uint32_t size_k,
uint32_t stride_j, uint32_t stride_k,
uint32_t overlap_i, uint32_t overlap_j,
uint32_t offset_i, uint32_t offset_j) {
return ptr + (i * (size_i - overlap_i) - offset_i) * stride_j +
(j * (size_j - overlap_j) - offset_j) * stride_k;
}
Expand All @@ -97,18 +97,18 @@ void ne16_nnx_dispatch_stride2x2(const ne16_dev_t *dev, ne16_task_t *task,
const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0;
const uint32_t output_width_offset = w_out % stride == 1 ? 1 : 0;

const uint32_t input_base = task->data.infeat_ptr;
const uint32_t output_base = task->data.outfeat_ptr;
const uint32_t input_base = task->data.infeat_addr;
const uint32_t output_base = task->data.outfeat_addr;
const uint32_t tile_padding = task->data.cfg.padding;

for (uint32_t i = 0; i < n_h; i++) {
for (uint32_t j = 0; j < n_w; j++) {
task->data.infeat_ptr = _get_tile_ptr(
task->data.infeat_addr = _get_tile_addr(
input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in,
task->data.cfg.input_stride.d1, task->data.cfg.input_stride.d0,
h_ker - stride, w_ker - stride, i == 0 ? 0 : input_height_offset,
j == 0 ? 0 : input_width_offset);
task->data.outfeat_ptr = _get_tile_ptr(
task->data.outfeat_addr = _get_tile_addr(
output_base, i, j, 2, 2, k_out, task->data.cfg.output_stride.d2 << 1,
task->data.cfg.output_stride.d1 << 1, 0, 0,
i == 0 ? 0 : output_height_offset, j == 0 ? 0 : output_width_offset);
Expand Down
15 changes: 11 additions & 4 deletions test/HeaderWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def includes(self):
return "#include <pmsis.h>\n\n"

def define(self, name, expr):
if expr is None:
return f"#define {name.upper()}\n"

if isinstance(expr, str):
expr = f'"{expr}"'
elif isinstance(expr, bool):
Expand Down Expand Up @@ -152,10 +155,12 @@ def generate_source(self, name, body):
with open(filepath, "w") as file:
file.write(body)

def generate_vector_source(self, name, size, _type, init=None, golden=None):
def generate_vector_source(
self, name, size, _type, init=None, golden=None, section="PI_L1"
):
render = ""
render += f'#include "{name}.h"\n\n'
render += self.render_vector(name, "PI_L1 " + _type, size, init=init)
render += self.render_vector(name, f"{section} {_type}", size, init=init)

if golden is not None:
render += self.render_vector(
Expand All @@ -165,8 +170,10 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None):

self.generate_source(name, render)

def generate_vector_files(self, name, size, _type, init=None, golden=None):
self.generate_vector_source(name, size, _type, init, golden)
def generate_vector_files(
self, name, size, _type, init=None, golden=None, section="PI_L1"
):
self.generate_vector_source(name, size, _type, init, golden, section)
self.generate_vector_header(name, size, _type, init, golden)

def render_dims(self, name, dims):
Expand Down
8 changes: 7 additions & 1 deletion test/Ne16TestConf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from pydantic import field_validator, model_validator

from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
from NnxTestClasses import NnxTestConf
from NnxTestClasses import NnxTestConf, WmemLiteral
from TestClasses import IntegerType, KernelShape, Stride, implies


Expand Down Expand Up @@ -109,3 +109,9 @@ def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf:
f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
)
return self

@field_validator("wmem")
@classmethod
def check_valid_wmem(cls, v: WmemLiteral) -> WmemLiteral:
assert v == "tcdm", f"Unsupported wmem {v}. Supported tcdm."
return v
11 changes: 10 additions & 1 deletion test/NeurekaTestConf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from pydantic import field_validator, model_validator

from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
from NnxTestClasses import NnxTestConf
from NnxTestClasses import NnxTestConf, WmemLiteral
from TestClasses import IntegerType, KernelShape, Stride, implies


Expand Down Expand Up @@ -99,3 +99,12 @@ def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf:
f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
)
return self

@field_validator("wmem")
@classmethod
def check_valid_wmem(cls, v: WmemLiteral) -> WmemLiteral:
_supported_wmem = ["tcdm", "sram"]
assert (
v in _supported_wmem
), f"Unsupported wmem {v}. Supported {_supported_wmem}."
return v
32 changes: 24 additions & 8 deletions test/NnxTestClasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from __future__ import annotations

import os
from typing import Callable, Optional, Set, Tuple, Type, Union
from typing import Callable, Literal, Optional, Set, Tuple, Type, Union

import numpy as np
import numpy.typing as npt
Expand All @@ -30,6 +30,8 @@
from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
from TestClasses import IntegerType, KernelShape, Padding, Stride, implies

WmemLiteral = Literal["tcdm", "sram"]


class NnxTestConf(BaseModel):
in_height: PositiveInt
Expand All @@ -48,6 +50,7 @@ class NnxTestConf(BaseModel):
has_norm_quant: bool
has_bias: bool
has_relu: bool
wmem: WmemLiteral = "tcdm"

@model_validator(mode="after") # type: ignore
def check_valid_depthwise_channels(self) -> NnxTestConf:
Expand Down Expand Up @@ -346,8 +349,16 @@ def generate(self, test_name: str, test: NnxTest):
weight_type._bits,
test.conf.depthwise,
)
if test.conf.wmem == "sram":
section = '__attribute__((section(".weightmem_sram")))'
else:
section = "PI_L1"
self.header_writer.generate_vector_files(
"weight", _type="uint8_t", size=weight_init.size, init=weight_init
"weight",
_type="uint8_t",
size=weight_init.size,
init=weight_init,
section=section,
)

# Render scale
Expand Down Expand Up @@ -398,14 +409,18 @@ def generate(self, test_name: str, test: NnxTest):
"offset": weight_offset,
},
"scale": {
"bits": test.conf.scale_type._bits
if test.conf.scale_type is not None
else 0
"bits": (
test.conf.scale_type._bits
if test.conf.scale_type is not None
else 0
)
},
"bias": {
"bits": test.conf.bias_type._bits
if test.conf.bias_type is not None
else 0
"bits": (
test.conf.bias_type._bits
if test.conf.bias_type is not None
else 0
)
},
"padding": {
"top": test.conf.padding.top,
Expand All @@ -420,5 +435,6 @@ def generate(self, test_name: str, test: NnxTest):
"has_norm_quant": test.conf.has_norm_quant,
"has_bias": test.conf.has_bias,
"has_relu": test.conf.has_relu,
f"wmem_{test.conf.wmem}": None,
},
)
Loading
Loading