From 3116391bf66660f806b45e212b9949c528b4e270 Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Fri, 17 Mar 2023 12:00:42 +0100
Subject: [PATCH] Release 0.7.0 (#80)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create release 0.7.0:

Align CVFPU to RVV requirements (ARA branch merged)
Fix f2i cast edge cases
Fix RDN bug in floating-point multiplications
Fix shift amount width in fma and fma_multi

---------

Co-authored-by: Stefan Mach <smach@iis.ee.ethz.ch>
Co-authored-by: Frank K. Gürkaynak <kgf@ee.ethz.ch>
Co-authored-by: Akilesh Kannan <aklsh@tuta.io>
Co-authored-by: Noah Huetter <noahhuetter@gmail.com>
Co-authored-by: Stefan Mach <stefan.mach@axelera.ai>
Co-authored-by: Mike Thompson <mike@openhwgroup.org>
Co-authored-by: Flavien Solt <flsolt@ethz.ch>
Co-authored-by: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Co-authored-by: Shafiullah <shafi.ullah@10xengineers.ai>
---
 Bender.yml                          |  3 ++
 CITATION.cff                        | 33 +++++++++++++++++
 README.md                           | 29 ++++++++++++++-
 docs/CHANGELOG.md                   | 10 +++++-
 docs/CODEOWNERS                     |  2 +-
 docs/README.md                      | 19 ++++++----
 ips_list.yml                        |  3 ++
 src/fpnew_cast_multi.sv             | 44 ++++++++++++++++++++---
 src/fpnew_classifier.sv             |  2 ++
 src/fpnew_divsqrt_multi.sv          | 56 +++++++++++++++++++++--------
 src/fpnew_fma.sv                    | 27 +++++++++++---
 src/fpnew_fma_multi.sv              | 27 +++++++++++---
 src/fpnew_noncomp.sv                | 11 ++++++
 src/fpnew_opgroup_block.sv          | 18 ++++++++--
 src/fpnew_opgroup_fmt_slice.sv      | 30 ++++++++++++----
 src/fpnew_opgroup_multifmt_slice.sv | 29 +++++++++++++--
 src/fpnew_pkg.sv                    |  3 ++
 src/fpnew_rounding.sv               |  4 +++
 src/fpnew_top.sv                    | 15 +++++++-
 src_files.yml                       |  3 ++
 20 files changed, 316 insertions(+), 52 deletions(-)
 create mode 100644 CITATION.cff

diff --git a/Bender.yml b/Bender.yml
index 9a44eb48..7d3ed561 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -1,3 +1,6 @@
+# Copyright 2019 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
 package:
   name: FPnew
   authors: ["Stefan Mach <smach@iis.ee.ethz.ch>"]
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 00000000..7dc7f47e
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,33 @@
+cff-version: 1.2.0
+message: "If you use FPnew, please cite it as below."
+authors:
+- family-names: "Mach"
+  given-names: "Stefan"
+  orcid: "https://orcid.org/0000-0002-3476-8857"
+title: "FPnew - New Floating-Point Unit with Transprecision Capabilities"
+version: 0.6.6
+url: "https://github.com/pulp-platform/fpnew"
+preferred-citation:
+  type: article
+  authors:
+  - family-names: "Mach"
+    given-names: "Stefan"
+    orcid: "https://orcid.org/0000-0002-3476-8857"
+  - family-names: "Schuiki"
+    given-names: "Fabian"
+    orcid: "https://orcid.org/0000-0002-9923-5031"
+  - family-names: "Zaruba"
+    given-names: "Florian"
+    orcid: "https://orcid.org/0000-0002-8194-6521"
+  - family-names: "Benini"
+    given-names: "Luca"
+    orcid: "https://orcid.org/0000-0001-8068-3806"
+  doi: "10.1109/TVLSI.2020.3044752"
+  journal: "IEEE Transactions on Very Large Scale Integration (VLSI) Systems"
+  month: 12
+  start: 774
+  end: 787
+  title: "FPnew: An Open-Source Multiformat Floating-Point Unit Architecture for Energy-Proportional Transprecision Computing"
+  issue: 4
+  volume: 29
+  year: 2020
diff --git a/README.md b/README.md
index 7bcb9ee0..a377c7df 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
 
 Parametric floating-point unit with support for standard RISC-V formats and operations as well as transprecision formats, written in SystemVerilog.
 
-Maintainer: Stefan Mach <smach@iis.ee.ethz.ch>
+Maintainer: Luca Bertaccini <lbertaccini@iis.ee.ethz.ch><br>
+Principal Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
 ## Features
 
@@ -138,6 +139,32 @@ Furthermore, this repository tries to adhere to [SemVer](https://semver.org/), a
 
 FPnew is released under the *SolderPad Hardware License*, which is a permissive license based on Apache 2.0. Please refer to the [license file](LICENSE) for further information.
 
+
+## Publication
+
+If you use FPnew in your work, you can cite us:
+
+<details>
+<summary>FPnew Publication</summary>
+<p>
+
+```
+@article{mach2020fpnew,
+  title={FPnew: An open-source multiformat floating-point unit architecture for energy-proportional transprecision computing},
+  author={Mach, Stefan and Schuiki, Fabian and Zaruba, Florian and Benini, Luca},
+  journal={IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
+  volume={29},
+  number={4},
+  pages={774--787},
+  year={2020},
+  publisher={IEEE}
+}
+```
+
+</p>
+</details>
+
+
 ## Acknowledgement
 
 This project has received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 732631.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 6c8c5786..3a3e1f83 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -11,14 +11,22 @@ Versions of the IP in the same major relase are "pin-compatible" with each other
 ## [Unreleased]
 
 ### Added
+- Citation file `CITATION.cff`
+- Add support for RISC-V compliant classify in vectorial mode when the vector element width is at least 10 bits
+- Add `mask` input signal to mask exceptions from inactive SIMD elements
+- Add support for rounding toward odd (RISC-V V 1.0 compliant)
+
 ### Changed
+- Code ownership to @lucabertaccini
+
 ### Fixed
+- Fix de-synchronization among vectorial lanes during variable-latency operations (`fdiv`, `fsqrt`)
 
 
 ## [0.6.6] - 2021-04-19
 
 ### Changed
-- [common_cells] Bump common cells version
+- [common_cells] Bump common cells version [(#44)](https://github.com/pulp-platform/fpnew/issues/44)
 
 ## [0.6.5] - 2020-11-06
 
diff --git a/docs/CODEOWNERS b/docs/CODEOWNERS
index 7f376285..6b8f7762 100644
--- a/docs/CODEOWNERS
+++ b/docs/CODEOWNERS
@@ -1,2 +1,2 @@
 # Global owners
-*	@stmach
+*	@lucabertaccini
diff --git a/docs/README.md b/docs/README.md
index 54322ddc..d0c0a91c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -24,13 +24,13 @@ FPnew is a parametric floating-point unit which supports standard RISC-V operati
 The top-level module of the FPU is `fpnew_top` and its interface is further described in this section.
 FPnew uses a synchronous interface using handshaking to transfer data into and out of the FPU.
 
-All array types are packed due to poor support of unpacked arrays in some EDA tools.  
+All array types are packed due to poor support of unpacked arrays in some EDA tools.
 SystemVerilog `interface`s are not used due to poor support in some EDA tools.
 
 
 ### Parameters
 
-The configuration parameters use data types defined in `fpnew_pkg` which are structs containing multi-dimensional arrays of custom enumeration types.  
+The configuration parameters use data types defined in `fpnew_pkg` which are structs containing multi-dimensional arrays of custom enumeration types.
 For more in-depth explanations on how to configure the unit and the layout of the types used, please refer to the [Configuration Section](#configuration).
 
 |  Parameter Name  |                                                         Description                                                          |
@@ -38,11 +38,12 @@ For more in-depth explanations on how to configure the unit and the layout of th
 | `Features`       | Specifies the features of the FPU, such as the set of supported formats and operations.                                      |
 | `Implementation` | Allows to control how the above features are implemented, such as the number of pipeline stages and architecture of subunits |
 | `TagType`        | The SystemVerilog data type of the operation tag                                                                             |
-
+| `TrueSIMDClass`  | If enabled, the result of a classify operation in vectorial mode will be RISC-V compliant if each output has at least 10 bits|
+| `EnableSIMDMask` | Enable the RISC-V floating-point status flags masking of inactive vectorial lanes. When disabled, `simd_mask_i` is inactive  |
 
 ### Ports
 
-Many ports use custom types and enumerations from `fpnew_pkg` to improve code structure internally (see [Data Types](#data-types)).  
+Many ports use custom types and enumerations from `fpnew_pkg` to improve code structure internally (see [Data Types](#data-types)).
 As the width of some input/output signals is defined by the configuration, it is denoted `W` in the following table.
 
 |    Port Name     | Direction |         Type         |                          Description                           |
@@ -58,6 +59,7 @@ As the width of some input/output signals is defined by the configuration, it is
 | `int_fmt_i`      | in        | `int_format_e`       | Integer format                                                 |
 | `vectorial_op_i` | in        | `logic`              | Vectorial operation select                                     |
 | `tag_i`          | in        | `TagType`            | Operation tag input                                            |
+| `simd_mask_i`    | in        | `MaskType`           | Vector mask input for the status flags                         |
 | `in_valid_i`     | in        | `logic`              | Input data valid (see [Handshake](#handshake-interface))       |
 | `in_ready_o`     | out       | `logic`              | Input interface ready (see [Handshake](#handshake-interface))  |
 | `flush_i`        | in        | `logic`              | Synchronous pipeline reset                                     |
@@ -84,6 +86,7 @@ Enumeration of type `logic [2:0]` holding available rounding modes, encoded for
 | `RDN`      | `3'b010` | Toward negative infinity                             |
 | `RUP`      | `3'b011` | Toward positive infinity                             |
 | `RMM`      | `3'b100` | To nearest, tie away from zero                       |
+| `ROD`      | `3'b101` | To odd                                               |
 | `DYN`      | `3'b111` | *RISC-V Dynamic RM, invalid if passed to operations* |
 
 ##### `operation_e` - FP Operation
@@ -197,6 +200,10 @@ Tags are an optional feature of FPnew and can be controlled by setting the `TagT
 In order to disable the use of tags, set `TagType` to `logic` (the default value), and bind the `tag_i` port to a static value.
 Furthermore ensure that your synthesis tool removes static registers.
 
+### Mask for the status flags
+
+This input is meant to be used in vectorial mode. The mask for the status flags is an input vector with `NumLanes` bits, where each bit masks the status flags of one FPU vectorial lane. This prevents status flags raised by inactive lanes from polluting the final output status flags.
+If `simd_mask_i[n] == 1'b0`, the `n`th FPU lane will be masked for this operation and its resulting status flags will not be propagated to the final output status flag.
 
 ## Configuration
 
@@ -324,7 +331,7 @@ Currently, the follwoing unit types are available for the FPU operation groups:
   '{default: MERGED},   // DIVSQRT
   '{default: PARALLEL}, // NONCOMP
   '{default: MERGED}}   // CONV`
-``` 
+```
 (all formats within operation group use same type)
 
 
@@ -348,7 +355,7 @@ The configuration  `pipe_config_t` is an enumeration of type `logic [1:0]` holdi
 ### Adding Custom Formats
 
 In order to add custom FP or integer formats to the FPU, it is necessary to make small changes to `fpnew_pkg`.
-New formats can easily be added by extending the default list of available formats, and/or by changing or removing the defaults. 
+New formats can easily be added by extending the default list of available formats, and/or by changing or removing the defaults.
 
 Namely, the following parameters and types shall be adapted:
 ```
diff --git a/ips_list.yml b/ips_list.yml
index 8d082ca8..17384806 100644
--- a/ips_list.yml
+++ b/ips_list.yml
@@ -1,3 +1,6 @@
+# Copyright 2019 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
 #
 # List of IPs and relative branch/commit-hash/tag.
 # Uses the YAML syntax.
diff --git a/src/fpnew_cast_multi.sv b/src/fpnew_cast_multi.sv
index 9d54c79e..e166d0bf 100644
--- a/src/fpnew_cast_multi.sv
+++ b/src/fpnew_cast_multi.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -38,6 +40,7 @@ module fpnew_cast_multi #(
   input  fpnew_pkg::fp_format_e  dst_fmt_i,
   input  fpnew_pkg::int_format_e int_fmt_i,
   input  TagType                 tag_i,
+  input  logic                   mask_i,
   input  AuxType                 aux_i,
   // Input Handshake
   input  logic                   in_valid_i,
@@ -48,6 +51,7 @@ module fpnew_cast_multi #(
   output fpnew_pkg::status_t     status_o,
   output logic                   extension_bit_o,
   output TagType                 tag_o,
+  output logic                   mask_o,
   output AuxType                 aux_o,
   // Output handshake
   output logic                   out_valid_o,
@@ -114,6 +118,7 @@ module fpnew_cast_multi #(
   fpnew_pkg::fp_format_e  [0:NUM_INP_REGS]                  inp_pipe_dst_fmt_q;
   fpnew_pkg::int_format_e [0:NUM_INP_REGS]                  inp_pipe_int_fmt_q;
   TagType                 [0:NUM_INP_REGS]                  inp_pipe_tag_q;
+  logic                   [0:NUM_INP_REGS]                  inp_pipe_mask_q;
   AuxType                 [0:NUM_INP_REGS]                  inp_pipe_aux_q;
   logic                   [0:NUM_INP_REGS]                  inp_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -129,6 +134,7 @@ module fpnew_cast_multi #(
   assign inp_pipe_dst_fmt_q[0]  = dst_fmt_i;
   assign inp_pipe_int_fmt_q[0]  = int_fmt_i;
   assign inp_pipe_tag_q[0]      = tag_i;
+  assign inp_pipe_mask_q[0]     = mask_i;
   assign inp_pipe_aux_q[0]      = aux_i;
   assign inp_pipe_valid_q[0]    = in_valid_i;
   // Input stage: Propagate pipeline ready signal to updtream circuitry
@@ -155,6 +161,7 @@ module fpnew_cast_multi #(
     `FFL(inp_pipe_dst_fmt_q[i+1],  inp_pipe_dst_fmt_q[i],  reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(inp_pipe_int_fmt_q[i+1],  inp_pipe_int_fmt_q[i],  reg_ena, fpnew_pkg::int_format_e'(0))
     `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
     `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
   end
   // Output stage: assign selected pipe outputs to signals for later use
@@ -328,6 +335,7 @@ module fpnew_cast_multi #(
   fpnew_pkg::fp_format_e  [0:NUM_MID_REGS]                    mid_pipe_dst_fmt_q;
   fpnew_pkg::int_format_e [0:NUM_MID_REGS]                    mid_pipe_int_fmt_q;
   TagType                 [0:NUM_MID_REGS]                    mid_pipe_tag_q;
+  logic                   [0:NUM_MID_REGS]                    mid_pipe_mask_q;
   AuxType                 [0:NUM_MID_REGS]                    mid_pipe_aux_q;
   logic                   [0:NUM_MID_REGS]                    mid_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -348,6 +356,7 @@ module fpnew_cast_multi #(
   assign mid_pipe_dst_fmt_q[0]    = dst_fmt_q;
   assign mid_pipe_int_fmt_q[0]    = int_fmt_q;
   assign mid_pipe_tag_q[0]        = inp_pipe_tag_q[NUM_INP_REGS];
+  assign mid_pipe_mask_q[0]       = inp_pipe_mask_q[NUM_INP_REGS];
   assign mid_pipe_aux_q[0]        = inp_pipe_aux_q[NUM_INP_REGS];
   assign mid_pipe_valid_q[0]      = inp_pipe_valid_q[NUM_INP_REGS];
   // Input stage: Propagate pipeline ready signal to input pipe
@@ -380,6 +389,7 @@ module fpnew_cast_multi #(
     `FFL(mid_pipe_dst_fmt_q[i+1],    mid_pipe_dst_fmt_q[i],    reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(mid_pipe_int_fmt_q[i+1],    mid_pipe_int_fmt_q[i],    reg_ena, fpnew_pkg::int_format_e'(0))
     `FFL(mid_pipe_tag_q[i+1],        mid_pipe_tag_q[i],        reg_ena, TagType'('0))
+    `FFL(mid_pipe_mask_q[i+1],       mid_pipe_mask_q[i],       reg_ena, '0)
     `FFL(mid_pipe_aux_q[i+1],        mid_pipe_aux_q[i],        reg_ena, AuxType'('0))
   end
   // Output stage: assign selected pipe outputs to signals for later use
@@ -489,6 +499,7 @@ module fpnew_cast_multi #(
   logic [NUM_FORMATS-1:0]            fmt_uf_after_round;
 
   logic [NUM_INT_FORMATS-1:0][WIDTH-1:0] ifmt_pre_round_abs; // per format
+  logic [NUM_INT_FORMATS-1:0]            ifmt_of_after_round;
 
   logic             rounded_sign;
   logic [WIDTH-1:0] rounded_abs; // absolute value of result after rounding
@@ -573,14 +584,33 @@ module fpnew_cast_multi #(
     end
   end
 
-  // Classification after rounding select by destination format
-  assign uf_after_round = fmt_uf_after_round[dst_fmt_q2];
-  assign of_after_round = fmt_of_after_round[dst_fmt_q2];
-
   // Negative integer result needs to be brought into two's complement
   assign rounded_int_res      = rounded_sign ? unsigned'(-rounded_abs) : rounded_abs;
   assign rounded_int_res_zero = (rounded_int_res == '0);
 
+  // Detect integer overflows after rounding (only positives)
+  for (genvar ifmt = 0; ifmt < int'(NUM_INT_FORMATS); ifmt++) begin : gen_int_overflow
+    // Set up some constants
+    localparam int unsigned INT_WIDTH = fpnew_pkg::int_width(fpnew_pkg::int_format_e'(ifmt));
+
+    if (IntFmtConfig[ifmt]) begin : active_format
+      always_comb begin : detect_overflow
+        ifmt_of_after_round[ifmt] = 1'b0;
+        // Int result can overflow if we're at the max exponent
+        if (!rounded_sign && input_exp_q == signed'(INT_WIDTH - 2 + op_mod_q2)) begin
+          // Check whether the rounded MSB differs from unrounded MSB
+          ifmt_of_after_round[ifmt] = ~rounded_int_res[INT_WIDTH-2+op_mod_q2];
+        end
+      end
+    end else begin : inactive_format
+      assign ifmt_of_after_round[ifmt] = fpnew_pkg::DONT_CARE;
+    end
+  end
+
+  // Classification after rounding select by destination format
+  assign uf_after_round = fmt_uf_after_round[dst_fmt_q2];
+  assign of_after_round = dst_is_int_q ? ifmt_of_after_round[int_fmt_q2] : fmt_of_after_round[dst_fmt_q2];
+
   // -------------------------
   // FP Special case handling
   // -------------------------
@@ -664,7 +694,7 @@ module fpnew_cast_multi #(
 
   // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
   assign int_result_is_special = info_q.is_nan | info_q.is_inf |
-                                 of_before_round | ~info_q.is_boxed |
+                                 of_before_round | of_after_round | ~info_q.is_boxed |
                                  (input_sign_q & op_mod_q2 & ~rounded_int_res_zero);
 
   // All integer special cases are invalid
@@ -714,6 +744,7 @@ module fpnew_cast_multi #(
   fpnew_pkg::status_t [0:NUM_OUT_REGS]            out_pipe_status_q;
   logic               [0:NUM_OUT_REGS]            out_pipe_ext_bit_q;
   TagType             [0:NUM_OUT_REGS]            out_pipe_tag_q;
+  logic               [0:NUM_OUT_REGS]            out_pipe_mask_q;
   AuxType             [0:NUM_OUT_REGS]            out_pipe_aux_q;
   logic               [0:NUM_OUT_REGS]            out_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -724,6 +755,7 @@ module fpnew_cast_multi #(
   assign out_pipe_status_q[0]  = status_d;
   assign out_pipe_ext_bit_q[0] = extension_bit;
   assign out_pipe_tag_q[0]     = mid_pipe_tag_q[NUM_MID_REGS];
+  assign out_pipe_mask_q[0]    = mid_pipe_mask_q[NUM_MID_REGS];
   assign out_pipe_aux_q[0]     = mid_pipe_aux_q[NUM_MID_REGS];
   assign out_pipe_valid_q[0]   = mid_pipe_valid_q[NUM_MID_REGS];
   // Input stage: Propagate pipeline ready signal to inside pipe
@@ -745,6 +777,7 @@ module fpnew_cast_multi #(
     `FFL(out_pipe_status_q[i+1],  out_pipe_status_q[i],  reg_ena, '0)
     `FFL(out_pipe_ext_bit_q[i+1], out_pipe_ext_bit_q[i], reg_ena, '0)
     `FFL(out_pipe_tag_q[i+1],     out_pipe_tag_q[i],     reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],    out_pipe_mask_q[i],    reg_ena, '0)
     `FFL(out_pipe_aux_q[i+1],     out_pipe_aux_q[i],     reg_ena, AuxType'('0))
   end
   // Output stage: Ready travels backwards from output side, driven by downstream circuitry
@@ -754,6 +787,7 @@ module fpnew_cast_multi #(
   assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
   assign extension_bit_o = out_pipe_ext_bit_q[NUM_OUT_REGS];
   assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
   assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
   assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
   assign busy_o          = (| {inp_pipe_valid_q, mid_pipe_valid_q, out_pipe_valid_q});
diff --git a/src/fpnew_classifier.sv b/src/fpnew_classifier.sv
index 5e4fab93..a322946d 100644
--- a/src/fpnew_classifier.sv
+++ b/src/fpnew_classifier.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
diff --git a/src/fpnew_divsqrt_multi.sv b/src/fpnew_divsqrt_multi.sv
index 1331f5fe..0f7ea5d5 100644
--- a/src/fpnew_divsqrt_multi.sv
+++ b/src/fpnew_divsqrt_multi.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -33,16 +35,22 @@ module fpnew_divsqrt_multi #(
   input  fpnew_pkg::operation_e       op_i,
   input  fpnew_pkg::fp_format_e       dst_fmt_i,
   input  TagType                      tag_i,
+  input  logic                        mask_i,
   input  AuxType                      aux_i,
   // Input Handshake
   input  logic                        in_valid_i,
   output logic                        in_ready_o,
+  output logic                        divsqrt_done_o,
+  input  logic                        simd_synch_done_i,
+  output logic                        divsqrt_ready_o,
+  input  logic                        simd_synch_rdy_i,
   input  logic                        flush_i,
   // Output signals
   output logic [WIDTH-1:0]            result_o,
   output fpnew_pkg::status_t          status_o,
   output logic                        extension_bit_o,
   output TagType                      tag_o,
+  output logic                        mask_o,
   output AuxType                      aux_o,
   // Output handshake
   output logic                        out_valid_o,
@@ -82,6 +90,7 @@ module fpnew_divsqrt_multi #(
   fpnew_pkg::operation_e [0:NUM_INP_REGS]                       inp_pipe_op_q;
   fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_dst_fmt_q;
   TagType                [0:NUM_INP_REGS]                       inp_pipe_tag_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_mask_q;
   AuxType                [0:NUM_INP_REGS]                       inp_pipe_aux_q;
   logic                  [0:NUM_INP_REGS]                       inp_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -93,6 +102,7 @@ module fpnew_divsqrt_multi #(
   assign inp_pipe_op_q[0]       = op_i;
   assign inp_pipe_dst_fmt_q[0]  = dst_fmt_i;
   assign inp_pipe_tag_q[0]      = tag_i;
+  assign inp_pipe_mask_q[0]     = mask_i;
   assign inp_pipe_aux_q[0]      = aux_i;
   assign inp_pipe_valid_q[0]    = in_valid_i;
   // Input stage: Propagate pipeline ready signal to updtream circuitry
@@ -115,6 +125,7 @@ module fpnew_divsqrt_multi #(
     `FFL(inp_pipe_op_q[i+1],       inp_pipe_op_q[i],       reg_ena, fpnew_pkg::FMADD)
     `FFL(inp_pipe_dst_fmt_q[i+1],  inp_pipe_dst_fmt_q[i],  reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
     `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
   end
   // Output stage: assign selected pipe outputs to signals for later use
@@ -152,20 +163,29 @@ module fpnew_divsqrt_multi #(
   // ------------
   // Control FSM
   // ------------
+
   logic in_ready;               // input handshake with upstream
   logic div_valid, sqrt_valid;  // input signalling with unit
-  logic unit_ready, unit_done;  // status signals from unit instance
+  logic unit_ready, unit_done, unit_done_q;  // status signals from unit instance
   logic op_starting;            // high in the cycle a new operation starts
   logic out_valid, out_ready;   // output handshake with downstream
-  logic hold_result;            // whether to put result into hold register
-  logic data_is_held;           // data in hold register is valid
   logic unit_busy;              // valid data in flight
   // FSM states
   typedef enum logic [1:0] {IDLE, BUSY, HOLD} fsm_state_e;
   fsm_state_e state_q, state_d;
 
-  // Upstream ready comes from sanitization FSM
-  assign inp_pipe_ready[NUM_INP_REGS] = in_ready;
+  // Ready synch with other lanes
+  // Bring the FSM-generated ready outside the unit, to synchronize it with the other lanes
+  assign divsqrt_ready_o = in_ready;
+  // Upstream ready comes from sanitization FSM, and it is synched among all the lanes
+  assign inp_pipe_ready[NUM_INP_REGS] = simd_synch_rdy_i;
+
+  // Valid synch with other lanes
+  // When one divsqrt unit completes an operation, keep its done high, waiting for the other lanes
+  // As soon as all the lanes are done, we can clear this FF and start with a new operation
+  `FFLARNC(unit_done_q, unit_done, unit_done, simd_synch_done_i, 1'b0, clk_i, rst_ni);
+  // Tell the other units that this unit has finished now or in the past
+  assign divsqrt_done_o = unit_done_q | unit_done;
 
   // Valids are gated by the FSM ready. Invalid input ops run a sqrt to not lose illegal instr.
   assign div_valid   = in_valid_q & (op_q == fpnew_pkg::DIV) & in_ready & ~flush_i;
@@ -177,8 +197,6 @@ module fpnew_divsqrt_multi #(
     // Default assignments
     in_ready     = 1'b0;
     out_valid    = 1'b0;
-    hold_result  = 1'b0;
-    data_is_held = 1'b0;
     unit_busy    = 1'b0;
     state_d      = state_q;
 
@@ -193,8 +211,8 @@ module fpnew_divsqrt_multi #(
       // Operation in progress
       BUSY: begin
         unit_busy = 1'b1; // data in flight
-        // If the unit is done with processing
-        if (unit_done) begin
+        // If all the lanes are done with processing
+        if (simd_synch_done_i) begin
           out_valid = 1'b1; // try to commit result downstream
           // If downstream accepts our result
           if (out_ready) begin
@@ -205,7 +223,6 @@ module fpnew_divsqrt_multi #(
             end
           // Otherwise if downstream is not ready for the result
           end else begin
-            hold_result = 1'b1; // activate the hold register
             state_d     = HOLD; // wait for the pipeline to take the data
           end
         end
@@ -213,7 +230,6 @@ module fpnew_divsqrt_multi #(
       // Waiting with valid result for downstream
       HOLD: begin
         unit_busy    = 1'b1; // data in flight
-        data_is_held = 1'b1; // data in hold register is valid
         out_valid    = 1'b1; // try to commit result downstream
         // If the result is accepted by downstream
         if (out_ready) begin
@@ -242,11 +258,13 @@ module fpnew_divsqrt_multi #(
   // Hold additional information while the operation is in progress
   logic result_is_fp8_q;
   TagType result_tag_q;
+  logic result_mask_q;
   AuxType result_aux_q;
 
   // Fill the registers everytime a valid operation arrives (load FF, active low asynch rst)
   `FFL(result_is_fp8_q, input_is_fp8,                 op_starting, '0)
   `FFL(result_tag_q,    inp_pipe_tag_q[NUM_INP_REGS], op_starting, '0)
+  `FFL(result_mask_q,   inp_pipe_mask_q[NUM_INP_REGS],op_starting, '0)
   `FFL(result_aux_q,    inp_pipe_aux_q[NUM_INP_REGS], op_starting, '0)
 
   // -----------------
@@ -255,6 +273,7 @@ module fpnew_divsqrt_multi #(
   logic [63:0]        unit_result;
   logic [WIDTH-1:0]   adjusted_result, held_result_q;
   fpnew_pkg::status_t unit_status, held_status_q;
+  logic               hold_en;
 
   div_sqrt_top_mvp i_divsqrt_lei (
    .Clk_CI           ( clk_i               ),
@@ -276,9 +295,12 @@ module fpnew_divsqrt_multi #(
   // Adjust result width and fix FP8
   assign adjusted_result = result_is_fp8_q ? unit_result >> 8 : unit_result;
 
+  // Hold the result when one lane has finished execution, except when all the lanes finish together
+  // and the result can be accepted downstream
+  assign hold_en = unit_done & (~simd_synch_done_i | ~out_ready);
   // The Hold register (load, no reset)
-  `FFLNR(held_result_q, adjusted_result, hold_result, clk_i)
-  `FFLNR(held_status_q, unit_status,     hold_result, clk_i)
+  `FFLNR(held_result_q, adjusted_result, hold_en, clk_i)
+  `FFLNR(held_status_q, unit_status,     hold_en, clk_i)
 
   // --------------
   // Output Select
@@ -286,8 +308,8 @@ module fpnew_divsqrt_multi #(
   logic [WIDTH-1:0]   result_d;
   fpnew_pkg::status_t status_d;
   // Prioritize hold register data
-  assign result_d = data_is_held ? held_result_q : adjusted_result;
-  assign status_d = data_is_held ? held_status_q : unit_status;
+  assign result_d = unit_done_q ? held_result_q : adjusted_result;
+  assign status_d = unit_done_q ? held_status_q : unit_status;
 
   // ----------------
   // Output Pipeline
@@ -296,6 +318,7 @@ module fpnew_divsqrt_multi #(
   logic               [0:NUM_OUT_REGS][WIDTH-1:0] out_pipe_result_q;
   fpnew_pkg::status_t [0:NUM_OUT_REGS]            out_pipe_status_q;
   TagType             [0:NUM_OUT_REGS]            out_pipe_tag_q;
+  logic               [0:NUM_OUT_REGS]            out_pipe_mask_q;
   AuxType             [0:NUM_OUT_REGS]            out_pipe_aux_q;
   logic               [0:NUM_OUT_REGS]            out_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -305,6 +328,7 @@ module fpnew_divsqrt_multi #(
   assign out_pipe_result_q[0] = result_d;
   assign out_pipe_status_q[0] = status_d;
   assign out_pipe_tag_q[0]    = result_tag_q;
+  assign out_pipe_mask_q[0]   = result_mask_q;
   assign out_pipe_aux_q[0]    = result_aux_q;
   assign out_pipe_valid_q[0]  = out_valid;
   // Input stage: Propagate pipeline ready signal to inside pipe
@@ -325,6 +349,7 @@ module fpnew_divsqrt_multi #(
     `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
     `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
     `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
     `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
   end
   // Output stage: Ready travels backwards from output side, driven by downstream circuitry
@@ -334,6 +359,7 @@ module fpnew_divsqrt_multi #(
   assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
   assign extension_bit_o = 1'b1; // always NaN-Box result
   assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
   assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
   assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
   assign busy_o          = (| {inp_pipe_valid_q, unit_busy, out_pipe_valid_q});
diff --git a/src/fpnew_fma.sv b/src/fpnew_fma.sv
index f9fa813b..c29e7b3e 100644
--- a/src/fpnew_fma.sv
+++ b/src/fpnew_fma.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -31,6 +33,7 @@ module fpnew_fma #(
   input fpnew_pkg::operation_e     op_i,
   input logic                      op_mod_i,
   input TagType                    tag_i,
+  input logic                      mask_i,
   input AuxType                    aux_i,
   // Input Handshake
   input  logic                     in_valid_i,
@@ -41,6 +44,7 @@ module fpnew_fma #(
   output fpnew_pkg::status_t       status_o,
   output logic                     extension_bit_o,
   output TagType                   tag_o,
+  output logic                     mask_o,
   output AuxType                   aux_o,
   // Output handshake
   output logic                     out_valid_o,
@@ -64,8 +68,8 @@ module fpnew_fma #(
   // datapath leakage. This is either given by the exponent bits or the width of the LZC result.
   // In most reasonable FP formats the internal exponent will be wider than the LZC result.
   localparam int unsigned EXP_WIDTH = unsigned'(fpnew_pkg::maximum(EXP_BITS + 2, LZC_RESULT_WIDTH));
-  // Shift amount width: maximum internal mantissa size is 3p+3 bits
-  localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(3 * PRECISION_BITS + 3);
+  // Shift amount width: internal mantissa is at most 3p+4 bits; sized to hold shifts 0..3p+4
+  localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(3 * PRECISION_BITS + 5);
   // Pipelines
   localparam NUM_INP_REGS = PipeConfig == fpnew_pkg::BEFORE
                             ? NumPipeRegs
@@ -102,6 +106,7 @@ module fpnew_fma #(
   fpnew_pkg::operation_e [0:NUM_INP_REGS]                 inp_pipe_op_q;
   logic                  [0:NUM_INP_REGS]                 inp_pipe_op_mod_q;
   TagType                [0:NUM_INP_REGS]                 inp_pipe_tag_q;
+  logic                  [0:NUM_INP_REGS]                 inp_pipe_mask_q;
   AuxType                [0:NUM_INP_REGS]                 inp_pipe_aux_q;
   logic                  [0:NUM_INP_REGS]                 inp_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -114,6 +119,7 @@ module fpnew_fma #(
   assign inp_pipe_op_q[0]       = op_i;
   assign inp_pipe_op_mod_q[0]   = op_mod_i;
   assign inp_pipe_tag_q[0]      = tag_i;
+  assign inp_pipe_mask_q[0]     = mask_i;
   assign inp_pipe_aux_q[0]      = aux_i;
   assign inp_pipe_valid_q[0]    = in_valid_i;
   // Input stage: Propagate pipeline ready signal to updtream circuitry
@@ -137,6 +143,7 @@ module fpnew_fma #(
     `FFL(inp_pipe_op_q[i+1],       inp_pipe_op_q[i],       reg_ena, fpnew_pkg::FMADD)
     `FFL(inp_pipe_op_mod_q[i+1],   inp_pipe_op_mod_q[i],   reg_ena, '0)
     `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
     `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
   end
 
@@ -167,7 +174,7 @@ module fpnew_fma #(
   // | FNMSUB   | \c 1        | FNMADD: Invert sign of operands A and C
   // | ADD      | \c 0        | ADD: Set operand A to +1.0
   // | ADD      | \c 1        | SUB: Set operand A to +1.0, invert sign of operand C
-  // | MUL      | \c 0        | MUL: Set operand C to +0.0
+  // | MUL      | \c 0        | MUL: Set operand C to +0.0 (RDN) or -0.0 (all other rounding modes)
   // | *others* | \c -        | *invalid*
   // \note \c op_mod_q always inverts the sign of the addend.
   always_comb begin : op_select
@@ -190,8 +197,11 @@ module fpnew_fma #(
         operand_a = '{sign: 1'b0, exponent: BIAS, mantissa: '0};
         info_a    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
       end
-      fpnew_pkg::MUL: begin // Set addend to -0 (for proper rounding with RDN)
-        operand_c = '{sign: 1'b1, exponent: '0, mantissa: '0};
+      fpnew_pkg::MUL: begin // Set addend to +0 if the rounding mode is RDN, to -0 otherwise
+        if (inp_pipe_rnd_mode_q[NUM_INP_REGS] == fpnew_pkg::RDN)
+          operand_c = '{sign: 1'b0, exponent: '0, mantissa: '0};
+        else
+          operand_c = '{sign: 1'b1, exponent: '0, mantissa: '0};
         info_c    = '{is_zero: 1'b1, is_boxed: 1'b1, default: 1'b0}; //zero, boxed value.
       end
       default: begin // propagate don't cares
@@ -403,6 +413,7 @@ module fpnew_fma #(
   fp_t                   [0:NUM_MID_REGS]                         mid_pipe_spec_res_q;
   fpnew_pkg::status_t    [0:NUM_MID_REGS]                         mid_pipe_spec_stat_q;
   TagType                [0:NUM_MID_REGS]                         mid_pipe_tag_q;
+  logic                  [0:NUM_MID_REGS]                         mid_pipe_mask_q;
   AuxType                [0:NUM_MID_REGS]                         mid_pipe_aux_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -422,6 +433,7 @@ module fpnew_fma #(
   assign mid_pipe_spec_res_q[0]    = special_result;
   assign mid_pipe_spec_stat_q[0]   = special_status;
   assign mid_pipe_tag_q[0]         = inp_pipe_tag_q[NUM_INP_REGS];
+  assign mid_pipe_mask_q[0]        = inp_pipe_mask_q[NUM_INP_REGS];
   assign mid_pipe_aux_q[0]         = inp_pipe_aux_q[NUM_INP_REGS];
   assign mid_pipe_valid_q[0]       = inp_pipe_valid_q[NUM_INP_REGS];
   // Input stage: Propagate pipeline ready signal to input pipe
@@ -453,6 +465,7 @@ module fpnew_fma #(
     `FFL(mid_pipe_spec_res_q[i+1],    mid_pipe_spec_res_q[i],    reg_ena, '0)
     `FFL(mid_pipe_spec_stat_q[i+1],   mid_pipe_spec_stat_q[i],   reg_ena, '0)
     `FFL(mid_pipe_tag_q[i+1],         mid_pipe_tag_q[i],         reg_ena, TagType'('0))
+    `FFL(mid_pipe_mask_q[i+1],        mid_pipe_mask_q[i],        reg_ena, '0)
     `FFL(mid_pipe_aux_q[i+1],         mid_pipe_aux_q[i],         reg_ena, AuxType'('0))
   end
   // Output stage: assign selected pipe outputs to signals for later use
@@ -629,6 +642,7 @@ module fpnew_fma #(
   fp_t                [0:NUM_OUT_REGS] out_pipe_result_q;
   fpnew_pkg::status_t [0:NUM_OUT_REGS] out_pipe_status_q;
   TagType             [0:NUM_OUT_REGS] out_pipe_tag_q;
+  logic               [0:NUM_OUT_REGS] out_pipe_mask_q;
   AuxType             [0:NUM_OUT_REGS] out_pipe_aux_q;
   logic               [0:NUM_OUT_REGS] out_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -638,6 +652,7 @@ module fpnew_fma #(
   assign out_pipe_result_q[0] = result_d;
   assign out_pipe_status_q[0] = status_d;
   assign out_pipe_tag_q[0]    = mid_pipe_tag_q[NUM_MID_REGS];
+  assign out_pipe_mask_q[0]   = mid_pipe_mask_q[NUM_MID_REGS];
   assign out_pipe_aux_q[0]    = mid_pipe_aux_q[NUM_MID_REGS];
   assign out_pipe_valid_q[0]  = mid_pipe_valid_q[NUM_MID_REGS];
   // Input stage: Propagate pipeline ready signal to inside pipe
@@ -658,6 +673,7 @@ module fpnew_fma #(
     `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
     `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
     `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
     `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
   end
   // Output stage: Ready travels backwards from output side, driven by downstream circuitry
@@ -667,6 +683,7 @@ module fpnew_fma #(
   assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
   assign extension_bit_o = 1'b1; // always NaN-Box result
   assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
   assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
   assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
   assign busy_o          = (| {inp_pipe_valid_q, mid_pipe_valid_q, out_pipe_valid_q});
diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv
index 712dfcd9..cceeae3c 100644
--- a/src/fpnew_fma_multi.sv
+++ b/src/fpnew_fma_multi.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -34,6 +36,7 @@ module fpnew_fma_multi #(
   input  fpnew_pkg::fp_format_e       src_fmt_i, // format of the multiplicands
   input  fpnew_pkg::fp_format_e       dst_fmt_i, // format of the addend and result
   input  TagType                      tag_i,
+  input  logic                        mask_i,
   input  AuxType                      aux_i,
   // Input Handshake
   input  logic                        in_valid_i,
@@ -44,6 +47,7 @@ module fpnew_fma_multi #(
   output fpnew_pkg::status_t          status_o,
   output logic                        extension_bit_o,
   output TagType                      tag_o,
+  output logic                        mask_o,
   output AuxType                      aux_o,
   // Output handshake
   output logic                        out_valid_o,
@@ -70,8 +74,8 @@ module fpnew_fma_multi #(
   // datapath leakage. This is either given by the exponent bits or the width of the LZC result.
   // In most reasonable FP formats the internal exponent will be wider than the LZC result.
   localparam int unsigned EXP_WIDTH = fpnew_pkg::maximum(SUPER_EXP_BITS + 2, LZC_RESULT_WIDTH);
-  // Shift amount width: maximum internal mantissa size is 3p+3 bits
-  localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(3 * PRECISION_BITS + 3);
+  // Shift amount width: internal mantissa is at most 3p+4 bits; sized to hold shifts 0..3p+4
+  localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(3 * PRECISION_BITS + 5);
   // Pipelines
   localparam NUM_INP_REGS = PipeConfig == fpnew_pkg::BEFORE
                             ? NumPipeRegs
@@ -115,6 +119,7 @@ module fpnew_fma_multi #(
   fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_src_fmt_q;
   fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_dst_fmt_q;
   TagType                [0:NUM_INP_REGS]                       inp_pipe_tag_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_mask_q;
   AuxType                [0:NUM_INP_REGS]                       inp_pipe_aux_q;
   logic                  [0:NUM_INP_REGS]                       inp_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -129,6 +134,7 @@ module fpnew_fma_multi #(
   assign inp_pipe_src_fmt_q[0]  = src_fmt_i;
   assign inp_pipe_dst_fmt_q[0]  = dst_fmt_i;
   assign inp_pipe_tag_q[0]      = tag_i;
+  assign inp_pipe_mask_q[0]     = mask_i;
   assign inp_pipe_aux_q[0]      = aux_i;
   assign inp_pipe_valid_q[0]    = in_valid_i;
   // Input stage: Propagate pipeline ready signal to updtream circuitry
@@ -154,6 +160,7 @@ module fpnew_fma_multi #(
     `FFL(inp_pipe_src_fmt_q[i+1],  inp_pipe_src_fmt_q[i],  reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(inp_pipe_dst_fmt_q[i+1],  inp_pipe_dst_fmt_q[i],  reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
     `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
   end
   // Output stage: assign selected pipe outputs to signals for later use
@@ -216,7 +223,7 @@ module fpnew_fma_multi #(
   // | FNMSUB   | \c 1        | FNMADD: Invert sign of operands A and C
   // | ADD      | \c 0        | ADD: Set operand A to +1.0
   // | ADD      | \c 1        | SUB: Set operand A to +1.0, invert sign of operand C
-  // | MUL      | \c 0        | MUL: Set operand C to +0.0
+  // | MUL      | \c 0        | MUL: Set operand C to +0.0 (RDN) or -0.0 (all other rounding modes)
   // | *others* | \c -        | *invalid*
   // \note \c op_mod_q always inverts the sign of the addend.
   always_comb begin : op_select
@@ -239,8 +246,11 @@ module fpnew_fma_multi #(
         operand_a = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
         info_a    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
       end
-      fpnew_pkg::MUL: begin // Set addend to -0 (for proper rounding with RDN)
-        operand_c = '{sign: 1'b1, exponent: '0, mantissa: '0};
+      fpnew_pkg::MUL: begin // Set addend to +0 if the rounding mode is RDN, to -0 otherwise
+        if (inp_pipe_rnd_mode_q[NUM_INP_REGS] == fpnew_pkg::RDN)
+          operand_c = '{sign: 1'b0, exponent: '0, mantissa: '0};
+        else
+          operand_c = '{sign: 1'b1, exponent: '0, mantissa: '0};
         info_c    = '{is_zero: 1'b1, is_boxed: 1'b1, default: 1'b0}; //zero, boxed value.
       end
       default: begin // propagate don't cares
@@ -488,6 +498,7 @@ module fpnew_fma_multi #(
   fp_t                   [0:NUM_MID_REGS]                         mid_pipe_spec_res_q;
   fpnew_pkg::status_t    [0:NUM_MID_REGS]                         mid_pipe_spec_stat_q;
   TagType                [0:NUM_MID_REGS]                         mid_pipe_tag_q;
+  logic                  [0:NUM_MID_REGS]                         mid_pipe_mask_q;
   AuxType                [0:NUM_MID_REGS]                         mid_pipe_aux_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -508,6 +519,7 @@ module fpnew_fma_multi #(
   assign mid_pipe_spec_res_q[0]    = special_result;
   assign mid_pipe_spec_stat_q[0]   = special_status;
   assign mid_pipe_tag_q[0]         = inp_pipe_tag_q[NUM_INP_REGS];
+  assign mid_pipe_mask_q[0]        = inp_pipe_mask_q[NUM_INP_REGS];
   assign mid_pipe_aux_q[0]         = inp_pipe_aux_q[NUM_INP_REGS];
   assign mid_pipe_valid_q[0]       = inp_pipe_valid_q[NUM_INP_REGS];
   // Input stage: Propagate pipeline ready signal to input pipe
@@ -540,6 +552,7 @@ module fpnew_fma_multi #(
     `FFL(mid_pipe_spec_res_q[i+1],    mid_pipe_spec_res_q[i],    reg_ena, '0)
     `FFL(mid_pipe_spec_stat_q[i+1],   mid_pipe_spec_stat_q[i],   reg_ena, '0)
     `FFL(mid_pipe_tag_q[i+1],         mid_pipe_tag_q[i],         reg_ena, TagType'('0))
+    `FFL(mid_pipe_mask_q[i+1],        mid_pipe_mask_q[i],        reg_ena, '0)
     `FFL(mid_pipe_aux_q[i+1],         mid_pipe_aux_q[i],         reg_ena, AuxType'('0))
   end
   // Output stage: assign selected pipe outputs to signals for later use
@@ -778,6 +791,7 @@ module fpnew_fma_multi #(
   logic               [0:NUM_OUT_REGS][WIDTH-1:0] out_pipe_result_q;
   fpnew_pkg::status_t [0:NUM_OUT_REGS]            out_pipe_status_q;
   TagType             [0:NUM_OUT_REGS]            out_pipe_tag_q;
+  logic               [0:NUM_OUT_REGS]            out_pipe_mask_q;
   AuxType             [0:NUM_OUT_REGS]            out_pipe_aux_q;
   logic               [0:NUM_OUT_REGS]            out_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -787,6 +801,7 @@ module fpnew_fma_multi #(
   assign out_pipe_result_q[0] = result_d;
   assign out_pipe_status_q[0] = status_d;
   assign out_pipe_tag_q[0]    = mid_pipe_tag_q[NUM_MID_REGS];
+  assign out_pipe_mask_q[0]   = mid_pipe_mask_q[NUM_MID_REGS];
   assign out_pipe_aux_q[0]    = mid_pipe_aux_q[NUM_MID_REGS];
   assign out_pipe_valid_q[0]  = mid_pipe_valid_q[NUM_MID_REGS];
   // Input stage: Propagate pipeline ready signal to inside pipe
@@ -807,6 +822,7 @@ module fpnew_fma_multi #(
     `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
     `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
     `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
     `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
   end
   // Output stage: Ready travels backwards from output side, driven by downstream circuitry
@@ -816,6 +832,7 @@ module fpnew_fma_multi #(
   assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
   assign extension_bit_o = 1'b1; // always NaN-Box result
   assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
   assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
   assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
   assign busy_o          = (| {inp_pipe_valid_q, mid_pipe_valid_q, out_pipe_valid_q});
diff --git a/src/fpnew_noncomp.sv b/src/fpnew_noncomp.sv
index 9e485f9e..8a182617 100644
--- a/src/fpnew_noncomp.sv
+++ b/src/fpnew_noncomp.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -31,6 +33,7 @@ module fpnew_noncomp #(
   input fpnew_pkg::operation_e     op_i,
   input logic                      op_mod_i,
   input TagType                    tag_i,
+  input logic                      mask_i,
   input AuxType                    aux_i,
   // Input Handshake
   input  logic                     in_valid_i,
@@ -43,6 +46,7 @@ module fpnew_noncomp #(
   output fpnew_pkg::classmask_e    class_mask_o,
   output logic                     is_class_o,
   output TagType                   tag_o,
+  output logic                     mask_o,
   output AuxType                   aux_o,
   // Output handshake
   output logic                     out_valid_o,
@@ -87,6 +91,7 @@ module fpnew_noncomp #(
   fpnew_pkg::operation_e [0:NUM_INP_REGS]                 inp_pipe_op_q;
   logic                  [0:NUM_INP_REGS]                 inp_pipe_op_mod_q;
   TagType                [0:NUM_INP_REGS]                 inp_pipe_tag_q;
+  logic                  [0:NUM_INP_REGS]                 inp_pipe_mask_q;
   AuxType                [0:NUM_INP_REGS]                 inp_pipe_aux_q;
   logic                  [0:NUM_INP_REGS]                 inp_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -99,6 +104,7 @@ module fpnew_noncomp #(
   assign inp_pipe_op_q[0]       = op_i;
   assign inp_pipe_op_mod_q[0]   = op_mod_i;
   assign inp_pipe_tag_q[0]      = tag_i;
+  assign inp_pipe_mask_q[0]     = mask_i;
   assign inp_pipe_aux_q[0]      = aux_i;
   assign inp_pipe_valid_q[0]    = in_valid_i;
   // Input stage: Propagate pipeline ready signal to updtream circuitry
@@ -122,6 +128,7 @@ module fpnew_noncomp #(
     `FFL(inp_pipe_op_q[i+1],       inp_pipe_op_q[i],       reg_ena, fpnew_pkg::FMADD)
     `FFL(inp_pipe_op_mod_q[i+1],   inp_pipe_op_mod_q[i],   reg_ena, '0)
     `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
     `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
   end
 
@@ -352,6 +359,7 @@ module fpnew_noncomp #(
   fpnew_pkg::classmask_e [0:NUM_OUT_REGS] out_pipe_class_mask_q;
   logic                  [0:NUM_OUT_REGS] out_pipe_is_class_q;
   TagType                [0:NUM_OUT_REGS] out_pipe_tag_q;
+  logic                  [0:NUM_OUT_REGS] out_pipe_mask_q;
   AuxType                [0:NUM_OUT_REGS] out_pipe_aux_q;
   logic                  [0:NUM_OUT_REGS] out_pipe_valid_q;
   // Ready signal is combinatorial for all stages
@@ -364,6 +372,7 @@ module fpnew_noncomp #(
   assign out_pipe_class_mask_q[0]    = class_mask_d;
   assign out_pipe_is_class_q[0]      = is_class_d;
   assign out_pipe_tag_q[0]           = inp_pipe_tag_q[NUM_INP_REGS];
+  assign out_pipe_mask_q[0]          = inp_pipe_mask_q[NUM_INP_REGS];
   assign out_pipe_aux_q[0]           = inp_pipe_aux_q[NUM_INP_REGS];
   assign out_pipe_valid_q[0]         = inp_pipe_valid_q[NUM_INP_REGS];
   // Input stage: Propagate pipeline ready signal to inside pipe
@@ -387,6 +396,7 @@ module fpnew_noncomp #(
     `FFL(out_pipe_class_mask_q[i+1],    out_pipe_class_mask_q[i],    reg_ena, fpnew_pkg::QNAN)
     `FFL(out_pipe_is_class_q[i+1],      out_pipe_is_class_q[i],      reg_ena, '0)
     `FFL(out_pipe_tag_q[i+1],           out_pipe_tag_q[i],           reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],          out_pipe_mask_q[i],          reg_ena, '0)
     `FFL(out_pipe_aux_q[i+1],           out_pipe_aux_q[i],           reg_ena, AuxType'('0))
   end
   // Output stage: Ready travels backwards from output side, driven by downstream circuitry
@@ -398,6 +408,7 @@ module fpnew_noncomp #(
   assign class_mask_o    = out_pipe_class_mask_q[NUM_OUT_REGS];
   assign is_class_o      = out_pipe_is_class_q[NUM_OUT_REGS];
   assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
   assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
   assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
   assign busy_o          = (| {inp_pipe_valid_q, out_pipe_valid_q});
diff --git a/src/fpnew_opgroup_block.sv b/src/fpnew_opgroup_block.sv
index e3be31d4..2633406f 100644
--- a/src/fpnew_opgroup_block.sv
+++ b/src/fpnew_opgroup_block.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -22,9 +24,12 @@ module fpnew_opgroup_block #(
   parameter fpnew_pkg::fmt_unit_types_t FmtUnitTypes  = '{default: fpnew_pkg::PARALLEL},
   parameter fpnew_pkg::pipe_config_t    PipeConfig    = fpnew_pkg::BEFORE,
   parameter type                        TagType       = logic,
+  parameter int unsigned                TrueSIMDClass = 0,
   // Do not change
   localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS,
-  localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup)
+  localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
+  localparam int unsigned NUM_LANES    = fpnew_pkg::max_num_lanes(Width, FpFmtMask, EnableVectors),
+  localparam type         MaskType     = logic [NUM_LANES-1:0]
 ) (
   input logic                                     clk_i,
   input logic                                     rst_ni,
@@ -39,6 +44,7 @@ module fpnew_opgroup_block #(
   input fpnew_pkg::int_format_e                   int_fmt_i,
   input logic                                     vectorial_op_i,
   input TagType                                   tag_i,
+  input MaskType                                  simd_mask_i,
   // Input Handshake
   input  logic                                    in_valid_i,
   output logic                                    in_ready_o,
@@ -90,6 +96,11 @@ module fpnew_opgroup_block #(
 
       assign in_valid = in_valid_i & (dst_fmt_i == fmt); // enable selected format
 
+      // Forward to lane b of this format the mask bit at index (NUM_LANES/INTERNAL_LANES)*b
+      localparam int unsigned INTERNAL_LANES = fpnew_pkg::num_lanes(Width, fpnew_pkg::fp_format_e'(fmt), EnableVectors);
+      logic [INTERNAL_LANES-1:0] mask_slice;
+      always_comb for (int b = 0; b < INTERNAL_LANES; b++) mask_slice[b] = simd_mask_i[(NUM_LANES/INTERNAL_LANES)*b];
+
       fpnew_opgroup_fmt_slice #(
         .OpGroup       ( OpGroup                      ),
         .FpFormat      ( fpnew_pkg::fp_format_e'(fmt) ),
@@ -97,7 +108,8 @@ module fpnew_opgroup_block #(
         .EnableVectors ( EnableVectors                ),
         .NumPipeRegs   ( FmtPipeRegs[fmt]             ),
         .PipeConfig    ( PipeConfig                   ),
-        .TagType       ( TagType                      )
+        .TagType       ( TagType                      ),
+        .TrueSIMDClass ( TrueSIMDClass                )
       ) i_fmt_slice (
         .clk_i,
         .rst_ni,
@@ -108,6 +120,7 @@ module fpnew_opgroup_block #(
         .op_mod_i,
         .vectorial_op_i,
         .tag_i,
+        .simd_mask_i    ( mask_slice               ),
         .in_valid_i     ( in_valid                 ),
         .in_ready_o     ( fmt_in_ready[fmt]        ),
         .flush_i,
@@ -181,6 +194,7 @@ module fpnew_opgroup_block #(
       .int_fmt_i,
       .vectorial_op_i,
       .tag_i,
+      .simd_mask_i     ( simd_mask_i              ),
       .in_valid_i      ( in_valid                 ),
       .in_ready_o      ( fmt_in_ready[FMT]        ),
       .flush_i,
diff --git a/src/fpnew_opgroup_fmt_slice.sv b/src/fpnew_opgroup_fmt_slice.sv
index fda2a57f..35fbe484 100644
--- a/src/fpnew_opgroup_fmt_slice.sv
+++ b/src/fpnew_opgroup_fmt_slice.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -20,8 +22,11 @@ module fpnew_opgroup_fmt_slice #(
   parameter int unsigned             NumPipeRegs   = 0,
   parameter fpnew_pkg::pipe_config_t PipeConfig    = fpnew_pkg::BEFORE,
   parameter type                     TagType       = logic,
+  parameter int unsigned             TrueSIMDClass = 0,
   // Do not change
-  localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup)
+  localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
+  localparam int unsigned NUM_LANES    = fpnew_pkg::num_lanes(Width, FpFormat, EnableVectors),
+  localparam type         MaskType     = logic [NUM_LANES-1:0]
 ) (
   input logic                               clk_i,
   input logic                               rst_ni,
@@ -33,6 +38,7 @@ module fpnew_opgroup_fmt_slice #(
   input logic                               op_mod_i,
   input logic                               vectorial_op_i,
   input TagType                             tag_i,
+  input MaskType                            simd_mask_i,
   // Input Handshake
   input  logic                              in_valid_i,
   output logic                              in_ready_o,
@@ -50,7 +56,7 @@ module fpnew_opgroup_fmt_slice #(
 );
 
   localparam int unsigned FP_WIDTH  = fpnew_pkg::fp_width(FpFormat);
-  localparam int unsigned NUM_LANES = fpnew_pkg::num_lanes(Width, FpFormat, EnableVectors);
+  localparam int unsigned SIMD_WIDTH = unsigned'(Width/NUM_LANES);
 
 
   logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid; // Handshake signals for the lanes
@@ -63,6 +69,7 @@ module fpnew_opgroup_fmt_slice #(
   logic                  [NUM_LANES-1:0] lane_ext_bit; // only the first one is actually used
   fpnew_pkg::classmask_e [NUM_LANES-1:0] lane_class_mask;
   TagType                [NUM_LANES-1:0] lane_tags; // only the first one is actually used
+  logic                  [NUM_LANES-1:0] lane_masks;
   logic                  [NUM_LANES-1:0] lane_vectorial, lane_busy, lane_is_class; // dito
 
   logic result_is_vector, result_is_class;
@@ -113,6 +120,7 @@ module fpnew_opgroup_fmt_slice #(
           .op_i,
           .op_mod_i,
           .tag_i,
+          .mask_i          ( simd_mask_i[lane]    ),
           .aux_i           ( vectorial_op         ), // Remember whether operation was vectorial
           .in_valid_i      ( in_valid             ),
           .in_ready_o      ( lane_in_ready[lane]  ),
@@ -121,6 +129,7 @@ module fpnew_opgroup_fmt_slice #(
           .status_o        ( op_status            ),
           .extension_bit_o ( lane_ext_bit[lane]   ),
           .tag_o           ( lane_tags[lane]      ),
+          .mask_o          ( lane_masks[lane]     ),
           .aux_o           ( lane_vectorial[lane] ),
           .out_valid_o     ( out_valid            ),
           .out_ready_i     ( out_ready            ),
@@ -174,6 +183,7 @@ module fpnew_opgroup_fmt_slice #(
           .op_i,
           .op_mod_i,
           .tag_i,
+          .mask_i          ( simd_mask_i[lane]     ),
           .aux_i           ( vectorial_op          ), // Remember whether operation was vectorial
           .in_valid_i      ( in_valid              ),
           .in_ready_o      ( lane_in_ready[lane]   ),
@@ -184,6 +194,7 @@ module fpnew_opgroup_fmt_slice #(
           .class_mask_o    ( lane_class_mask[lane] ),
           .is_class_o      ( lane_is_class[lane]   ),
           .tag_o           ( lane_tags[lane]       ),
+          .mask_o          ( lane_masks[lane]      ),
           .aux_o           ( lane_vectorial[lane]  ),
           .out_valid_o     ( out_valid             ),
           .out_ready_i     ( out_ready             ),
@@ -213,7 +224,10 @@ module fpnew_opgroup_fmt_slice #(
     assign slice_result[(unsigned'(lane)+1)*FP_WIDTH-1:unsigned'(lane)*FP_WIDTH] = local_result;
 
     // Create Classification results
-    if ((lane+1)*8 <= Width) begin : vectorial_class // vectorial class blocks are 8bits in size
+    if (TrueSIMDClass && SIMD_WIDTH >= 10) begin : vectorial_true_class // 10-bit class per lane. NOTE(review): pad select below is zero-width if SIMD_WIDTH == 10 - confirm
+      assign slice_vec_class_result[lane*SIMD_WIDTH +: 10] = lane_class_mask[lane];
+      assign slice_vec_class_result[(lane+1)*SIMD_WIDTH-1 -: SIMD_WIDTH-10] = '0;
+    end else if ((lane+1)*8 <= Width) begin : vectorial_class // vectorial class blocks are 8bits in size
       assign local_sign = (lane_class_mask[lane] == fpnew_pkg::NEGINF ||
                            lane_class_mask[lane] == fpnew_pkg::NEGNORM ||
                            lane_class_mask[lane] == fpnew_pkg::NEGSUBNORM ||
@@ -246,9 +260,11 @@ module fpnew_opgroup_fmt_slice #(
 
   localparam int unsigned CLASS_VEC_BITS = (NUM_LANES*8 > Width) ? 8 * (Width / 8) : NUM_LANES*8;
 
-  // Pad out unused vec_class bits
-  if (CLASS_VEC_BITS < Width) begin : pad_vectorial_class
-    assign slice_vec_class_result[Width-1:CLASS_VEC_BITS] = '0;
+  // Pad out unused vec_class bits when classify results use the 8-bit-per-lane encoding
+  if (!(TrueSIMDClass && SIMD_WIDTH >= 10)) begin
+    if (CLASS_VEC_BITS < Width) begin : pad_vectorial_class
+      assign slice_vec_class_result[Width-1:CLASS_VEC_BITS] = '0;
+    end
   end
 
   // localparam logic [Width-1:0] CLASS_VEC_MASK = 2**CLASS_VEC_BITS - 1;
@@ -270,7 +286,7 @@ module fpnew_opgroup_fmt_slice #(
     automatic fpnew_pkg::status_t temp_status;
     temp_status = '0;
     for (int i = 0; i < int'(NUM_LANES); i++)
-      temp_status |= lane_status[i];
+      temp_status |= lane_status[i] & {5{lane_masks[i]}};
     status_o = temp_status;
   end
 endmodule
diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv
index cc0dc465..08facb83 100644
--- a/src/fpnew_opgroup_multifmt_slice.sv
+++ b/src/fpnew_opgroup_multifmt_slice.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -25,7 +27,9 @@ module fpnew_opgroup_multifmt_slice #(
   parameter type                     TagType       = logic,
   // Do not change
   localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
-  localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS
+  localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS,
+  localparam int unsigned NUM_SIMD_LANES = fpnew_pkg::max_num_lanes(Width, FpFmtConfig, EnableVectors),
+  localparam type         MaskType     = logic [NUM_SIMD_LANES-1:0]
 ) (
   input logic                                     clk_i,
   input logic                                     rst_ni,
@@ -40,6 +44,7 @@ module fpnew_opgroup_multifmt_slice #(
   input fpnew_pkg::int_format_e                   int_fmt_i,
   input logic                                     vectorial_op_i,
   input TagType                                   tag_i,
+  input MaskType                                  simd_mask_i,
   // Input Handshake
   input  logic                                    in_valid_i,
   output logic                                    in_ready_o,
@@ -65,7 +70,7 @@ module fpnew_opgroup_multifmt_slice #(
       fpnew_pkg::maximum($clog2(NUM_FORMATS), $clog2(NUM_INT_FORMATS));
   localparam int unsigned AUX_BITS = FMT_BITS + 2; // also add vectorial and integer flags
 
-  logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid; // Handshake signals for the lanes
+  logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid, divsqrt_done, divsqrt_ready; // Handshake signals for the lanes
   logic                 vectorial_op;
   logic [FMT_BITS-1:0]  dst_fmt; // destination format to pass along with operation
   logic [AUX_BITS-1:0]  aux_data;
@@ -86,6 +91,7 @@ module fpnew_opgroup_multifmt_slice #(
   fpnew_pkg::status_t [NUM_LANES-1:0]   lane_status;
   logic   [NUM_LANES-1:0]               lane_ext_bit; // only the first one is actually used
   TagType [NUM_LANES-1:0]               lane_tags; // only the first one is actually used
+  logic   [NUM_LANES-1:0]               lane_masks;
   logic   [NUM_LANES-1:0][AUX_BITS-1:0] lane_aux; // only the first one is actually used
   logic   [NUM_LANES-1:0]               lane_busy; // dito
 
@@ -94,6 +100,8 @@ module fpnew_opgroup_multifmt_slice #(
   logic                result_fmt_is_int, result_is_cpk;
   logic [1:0]          result_vec_op; // info for vectorial results (for packing)
 
+  logic simd_synch_rdy, simd_synch_done;
+
   // -----------
   // Input Side
   // -----------
@@ -213,6 +221,7 @@ module fpnew_opgroup_multifmt_slice #(
           .src_fmt_i,
           .dst_fmt_i,
           .tag_i,
+          .mask_i          ( simd_mask_i[lane]   ),
           .aux_i           ( aux_data            ),
           .in_valid_i      ( in_valid            ),
           .in_ready_o      ( lane_in_ready[lane] ),
@@ -221,6 +230,7 @@ module fpnew_opgroup_multifmt_slice #(
           .status_o        ( op_status           ),
           .extension_bit_o ( lane_ext_bit[lane]  ),
           .tag_o           ( lane_tags[lane]     ),
+          .mask_o          ( lane_masks[lane]    ),
           .aux_o           ( lane_aux[lane]      ),
           .out_valid_o     ( out_valid           ),
           .out_ready_i     ( out_ready           ),
@@ -243,14 +253,20 @@ module fpnew_opgroup_multifmt_slice #(
           .op_i,
           .dst_fmt_i,
           .tag_i,
+          .mask_i          ( simd_mask_i[lane]   ),
           .aux_i           ( aux_data            ),
           .in_valid_i      ( in_valid            ),
           .in_ready_o      ( lane_in_ready[lane] ),
+          .divsqrt_done_o   ( divsqrt_done[lane] ),
+          .simd_synch_done_i( simd_synch_done    ),
+          .divsqrt_ready_o  ( divsqrt_ready[lane]),
+          .simd_synch_rdy_i( simd_synch_rdy    ),
           .flush_i,
           .result_o        ( op_result           ),
           .status_o        ( op_status           ),
           .extension_bit_o ( lane_ext_bit[lane]  ),
           .tag_o           ( lane_tags[lane]     ),
+          .mask_o          ( lane_masks[lane]    ),
           .aux_o           ( lane_aux[lane]      ),
           .out_valid_o     ( out_valid           ),
           .out_ready_i     ( out_ready           ),
@@ -278,6 +294,7 @@ module fpnew_opgroup_multifmt_slice #(
           .dst_fmt_i,
           .int_fmt_i,
           .tag_i,
+          .mask_i          ( simd_mask_i[lane]   ),
           .aux_i           ( aux_data            ),
           .in_valid_i      ( in_valid            ),
           .in_ready_o      ( lane_in_ready[lane] ),
@@ -286,6 +303,7 @@ module fpnew_opgroup_multifmt_slice #(
           .status_o        ( op_status           ),
           .extension_bit_o ( lane_ext_bit[lane]  ),
           .tag_o           ( lane_tags[lane]     ),
+          .mask_o          ( lane_masks[lane]    ),
           .aux_o           ( lane_aux[lane]      ),
           .out_valid_o     ( out_valid           ),
           .out_ready_i     ( out_ready           ),
@@ -399,6 +417,10 @@ module fpnew_opgroup_multifmt_slice #(
     assign {result_vec_op, result_is_cpk} = '0;
   end
 
+  // Synch lanes if there is more than one
+  assign simd_synch_rdy  = EnableVectors ? &divsqrt_ready : divsqrt_ready[0];
+  assign simd_synch_done = EnableVectors ? &divsqrt_done  : divsqrt_done[0];
+
   // ------------
   // Output Side
   // ------------
@@ -420,7 +442,8 @@ module fpnew_opgroup_multifmt_slice #(
     automatic fpnew_pkg::status_t temp_status;
     temp_status = '0;
     for (int i = 0; i < int'(NUM_LANES); i++)
-      temp_status |= lane_status[i];
+      temp_status |= lane_status[i] & {5{lane_masks[i]}};
     status_o = temp_status;
   end
+
 endmodule
diff --git a/src/fpnew_pkg.sv b/src/fpnew_pkg.sv
index 0d5153f5..7addc3e9 100644
--- a/src/fpnew_pkg.sv
+++ b/src/fpnew_pkg.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -131,6 +133,7 @@ package fpnew_pkg;
     RDN = 3'b010,
     RUP = 3'b011,
     RMM = 3'b100,
+    ROD = 3'b101,  // This mode is not defined in RISC-V FP-SPEC
     DYN = 3'b111
   } roundmode_e;
 
diff --git a/src/fpnew_rounding.sv b/src/fpnew_rounding.sv
index 60f63bb7..4e677209 100644
--- a/src/fpnew_rounding.sv
+++ b/src/fpnew_rounding.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -38,6 +40,7 @@ module fpnew_rounding #(
   //    010    |   RDN    | Round Down (towards -\infty)
   //    011    |   RUP    | Round Up (towards \infty)
   //    100    |   RMM    | Round to Nearest, ties to Max Magnitude
+  //    101    |   ROD    | Round towards odd (this mode is not defined in RISC-V FP-SPEC)
   //  others   |          | *invalid*
   always_comb begin : rounding_decision
     unique case (rnd_mode_i)
@@ -53,6 +56,7 @@ module fpnew_rounding #(
       fpnew_pkg::RDN: round_up = (| round_sticky_bits_i) ? sign_i  : 1'b0; // to 0 if +, away if -
       fpnew_pkg::RUP: round_up = (| round_sticky_bits_i) ? ~sign_i : 1'b0; // to 0 if -, away if +
       fpnew_pkg::RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
+      fpnew_pkg::ROD: round_up = ~abs_value_i[0] & (| round_sticky_bits_i);
       default: round_up = fpnew_pkg::DONT_CARE; // propagate x
     endcase
   end
diff --git a/src/fpnew_top.sv b/src/fpnew_top.sv
index 581f25fb..f6116a5d 100644
--- a/src/fpnew_top.sv
+++ b/src/fpnew_top.sv
@@ -8,6 +8,8 @@
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
 
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
@@ -16,7 +18,11 @@ module fpnew_top #(
   parameter fpnew_pkg::fpu_features_t       Features       = fpnew_pkg::RV64D_Xsflt,
   parameter fpnew_pkg::fpu_implementation_t Implementation = fpnew_pkg::DEFAULT_NOREGS,
   parameter type                            TagType        = logic,
+  parameter int unsigned                    TrueSIMDClass  = 0,
+  parameter int unsigned                    EnableSIMDMask = 0,
   // Do not change
+  localparam int unsigned NumLanes     = fpnew_pkg::max_num_lanes(Features.Width, Features.FpFmtMask, Features.EnableVectors),
+  localparam type         MaskType     = logic [NumLanes-1:0],
   localparam int unsigned WIDTH        = Features.Width,
   localparam int unsigned NUM_OPERANDS = 3
 ) (
@@ -32,6 +38,7 @@ module fpnew_top #(
   input fpnew_pkg::int_format_e             int_fmt_i,
   input logic                               vectorial_op_i,
   input TagType                             tag_i,
+  input MaskType                            simd_mask_i,
   // Input Handshake
   input  logic                              in_valid_i,
   output logic                              in_ready_o,
@@ -85,6 +92,10 @@ module fpnew_top #(
     end
   end
 
+  // Filter out the mask if not used
+  MaskType simd_mask;
+  assign simd_mask = simd_mask_i | ~{NumLanes{logic'(EnableSIMDMask)}};
+
   // -------------------------
   // Generate Operation Blocks
   // -------------------------
@@ -111,7 +122,8 @@ module fpnew_top #(
       .FmtPipeRegs   ( Implementation.PipeRegs[opgrp]  ),
       .FmtUnitTypes  ( Implementation.UnitTypes[opgrp] ),
       .PipeConfig    ( Implementation.PipeConfig       ),
-      .TagType       ( TagType                         )
+      .TagType       ( TagType                         ),
+      .TrueSIMDClass ( TrueSIMDClass                   )
     ) i_opgroup_block (
       .clk_i,
       .rst_ni,
@@ -125,6 +137,7 @@ module fpnew_top #(
       .int_fmt_i,
       .vectorial_op_i,
       .tag_i,
+      .simd_mask_i     ( simd_mask             ),
       .in_valid_i      ( in_valid              ),
       .in_ready_o      ( opgrp_in_ready[opgrp] ),
       .flush_i,
diff --git a/src_files.yml b/src_files.yml
index 1931258f..3694c2a0 100644
--- a/src_files.yml
+++ b/src_files.yml
@@ -1,3 +1,6 @@
+# Copyright 2019 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
 fpnew:
   incdirs: [
     ../common_cells/include,