Skip to content

Commit

Permalink
Merge pull request #1018 from mmrahorovic/hotfix/vvu_estimations
Browse files Browse the repository at this point in the history
VVU estimation function fixes
  • Loading branch information
auphelia authored Mar 29, 2024
2 parents 10fa01e + ae97e38 commit 82faae7
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 79 deletions.
79 changes: 79 additions & 0 deletions src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import math
import numpy as np
import os
from qonnx.core.datatype import DataType
Expand All @@ -47,6 +48,84 @@ def get_nodeattr_types(self):
my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs

def lut_estimation(self):
    """Estimate the LUT cost of this VVU layer.

    Model taken from:
      FINN-R: An End-to-End Deep-Learning Framework for Fast
      Exploration of Quantized Neural Networks,
      M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
      Y. Umuroglu, M. Leeser and K. Vissers, Sep. 2018.
    """
    # TODO add in/out FIFO contributions
    pe = self.get_nodeattr("PE")
    simd = self.get_nodeattr("SIMD")
    wbits = self.get_weight_datatype().bitwidth()
    idt = self.get_input_datatype()
    abits = idt.bitwidth()
    # empirical constants fitted in the FINN-R paper
    c0, c1 = 300, 1.1
    c2 = 0
    mem_mode = self.get_nodeattr("mem_mode")
    ram_style = self.get_nodeattr("ram_style")
    # weights held in LUTRAM contribute an extra term proportional to depth
    decoupled_lutram = mem_mode == "internal_decoupled" and ram_style == "distributed"
    small_embedded = mem_mode == "internal_embedded" and self.calc_wmem() <= 128
    if decoupled_lutram or small_embedded:
        c2 = (pe * simd * wbits) * math.ceil(self.calc_wmem() / 64)

    # multiplier LUTs: zero when mapped to DSP slices
    if self.get_nodeattr("resType") == "dsp":
        mult_luts = 0
    else:
        mult_luts = simd * (2 * math.ceil((wbits + abits) / 6) - 1) * (wbits + abits)
    # adder tree reducing SIMD products
    addertree_luts = (wbits + abits) * (2 * simd - 1)
    # accumulator: if accDataType defaulted to INT32 this would be a large
    # overestimate, so cap it with the minimal width bound derived in
    # https://arxiv.org/abs/2301.13376
    acc_dt = self.get_accumulator_datatype()
    k_h, k_w = self.get_nodeattr("Kernel")
    alpha = math.log(k_h * k_w, 2) + wbits + abits - 1 - int(idt.signed())
    acc_bits = min(
        acc_dt.bitwidth(),
        np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
    )
    acc_luts = acc_bits
    # thresholding memory and comparators (only with an activation)
    thr_luts = 0
    comp_luts = 0
    # TODO - add 'ram_style_threshold' node attribute
    if self.get_nodeattr("noActivation") == 0:
        obits = self.get_output_datatype().bitwidth()
        thr_luts = (2**obits - 1) * acc_bits * self.calc_tmem() / 64
        comp_luts = (2**obits - 1) * acc_bits

    per_pe = mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts
    return int(c0 + c1 * (pe * per_pe) + c2)

def dsp_estimation(self):
    """Estimate the number of DSP slices used by this layer.

    Only nonzero when the multipliers are mapped to DSPs
    (``resType == "dsp"``); one chain of DSPs per PE.
    """
    if self.get_nodeattr("resType") != "dsp":
        # LUT-based multipliers consume no DSPs
        return 0
    pe = self.get_nodeattr("PE")
    wbits = self.get_weight_datatype().bitwidth()
    abits = self.get_input_datatype().bitwidth()
    # TODO: more accurate modelling
    return int(pe * np.ceil((wbits + abits) / 48))

def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
mem_mode = self.get_nodeattr("mem_mode")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,9 @@ def lut_estimation(self):
return 0

def dsp_estimation(self):
    """Estimate the DSP count of the RTL VVU.

    Each of the PE lanes needs ceil(SIMD / 3) DSPs — presumably three
    SIMD lanes packed per DSP58 column (TODO confirm against the RTL).
    """
    P = self.get_nodeattr("PE")
    Q = self.get_nodeattr("SIMD")
    # Defect fixed: a stale duplicate `return int(np.ceil(Q / 3))`
    # (missing the PE factor) preceded this line, making the corrected
    # PE-scaled return unreachable. Only the per-PE scaled count remains.
    return int(P * np.ceil(Q / 3))

def instantiate_ip(self, cmd):
# instantiate the RTL IP
Expand Down
78 changes: 0 additions & 78 deletions src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,84 +386,6 @@ def uram_efficiency_estimation(self):
uram_est_capacity = uram_est * 72 * 4096
return wbits / uram_est_capacity

def lut_estimation(self):
    """Calculates resource estimations for LUTs based on:
    - FINN-R: An End-to-End Deep-Learning Framework for Fast
    Exploration of Quantized Neural Networks
    - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
    Y. Umuroglu, M. Leeser and K. Vissers
    - 12. Sep 2018
    """
    # TODO add in/out FIFO contributions
    P = self.get_nodeattr("PE")
    Q = self.get_nodeattr("SIMD")
    wdt = self.get_weight_datatype()
    W = wdt.bitwidth()
    # determine tdt with input and weight data types
    idt = self.get_input_datatype()
    A = idt.bitwidth()
    # parameters from experiments in paper mentioned above
    c0 = 300
    c1 = 1.1
    c2 = 0
    mmode = self.get_nodeattr("mem_mode")
    mstyle = self.get_nodeattr("ram_style")
    # extra LUT term when weights live in LUTRAM (decoupled/distributed)
    # or when a small embedded weight memory is inlined into logic
    if (mmode == "internal_decoupled" and mstyle == "distributed") or (
        mmode == "internal_embedded" and self.calc_wmem() <= 128
    ):
        c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)

    # multiplication
    res_type = self.get_nodeattr("resType")
    if res_type == "dsp":
        # multipliers mapped to DSP slices cost no LUTs
        mult_luts = 0
    else:
        mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
    # adder tree
    addertree_luts = (W + A) * (2 * Q - 1)
    # accumulator
    acc_datatype = self.get_accumulator_datatype()
    # NOTE(review): this first assignment is dead — overwritten by min() below
    acc_bits = acc_datatype.bitwidth()
    k_h, k_w = self.get_nodeattr("Kernel")
    # if accDataType is not set, then it will default to INT32, which would
    # be a large overestimate in most (if not all) cases. In this scenario,
    # we would use the minimum accumulator as determined by the data types
    # bound, derived in https://arxiv.org/abs/2301.13376
    alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
    acc_bits = min(
        acc_datatype.bitwidth(),
        np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
    )
    acc_luts = acc_bits
    # thresholds and threshold comparators
    thr_luts = 0
    comp_luts = 0
    noact = self.get_nodeattr("noActivation")
    # TODO - add 'ram_style_threshold' node attribute
    if noact == 0:
        odt = self.get_output_datatype()
        B = odt.bitwidth()
        # threshold storage (packed 64 bits wide) and per-level comparators
        thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
        comp_luts = (2**B - 1) * acc_bits

    # total: base cost c0, per-PE datapath cost scaled by c1, weight term c2
    return int(
        c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
    )

def dsp_estimation(self):
    """Estimate the number of DSP slices consumed by this layer.

    Nonzero only when multipliers are mapped to DSPs; one DSP chain
    per PE, sized by the combined weight + activation bit width.
    """
    # multiplication
    P = self.get_nodeattr("PE")
    res_type = self.get_nodeattr("resType")
    wdt = self.get_weight_datatype()
    W = wdt.bitwidth()
    idt = self.get_input_datatype()
    A = idt.bitwidth()
    if res_type == "dsp":
        mult_dsp = P * np.ceil((W + A) / 48)  # TODO: more accurate modelling
    else:
        # LUT-mapped multipliers use no DSPs
        mult_dsp = 0
    return int(mult_dsp)

def get_exp_cycles(self):
pe = self.get_nodeattr("PE")
simd = self.get_nodeattr("SIMD")
Expand Down

0 comments on commit 82faae7

Please sign in to comment.