diff --git a/tests/unit/compiler/venom/test_memmerging.py b/tests/unit/compiler/venom/test_memmerging.py index 3da09f76fb..70f5a0a9c6 100644 --- a/tests/unit/compiler/venom/test_memmerging.py +++ b/tests/unit/compiler/venom/test_memmerging.py @@ -767,6 +767,21 @@ def test_memmerging_double_use(): _check_pre_post(pre, post) +def test_existing_mcopy_overlap_nochange(): + """ + Check that mcopy which already contains an overlap does not get optimized + """ + if not version_check(begin="cancun"): + return + + pre = """ + _global: + mcopy 32, 33, 2 + return %1 + """ + _check_no_change(pre) + + @pytest.mark.parametrize("load_opcode,copy_opcode", LOAD_COPY) def test_memmerging_load(load_opcode, copy_opcode): pre = f""" diff --git a/vyper/codegen/core.py b/vyper/codegen/core.py index aaf6f35047..1e2134c28f 100644 --- a/vyper/codegen/core.py +++ b/vyper/codegen/core.py @@ -177,10 +177,44 @@ def make_byte_array_copier(dst, src): # batch copy the bytearray (including length word) using copy_bytes len_ = add_ofst(get_bytearray_length(src), 32) max_bytes = src.typ.maxlen + 32 + + if _prefer_copy_maxbound_heuristic(dst, src, item_size=1): + len_ = max_bytes + + # batch copy the entire dynarray, including length word ret = copy_bytes(dst, src, len_, max_bytes) return b1.resolve(ret) +# heuristic to choose whether to copy the full maxbound of the buffer instead of computing the runtime length +def _prefer_copy_maxbound_heuristic(dst, src, item_size): + if dst.location != MEMORY: + return False + + # a heuristic - it's cheaper to just copy the extra buffer bytes + # than calculate the number of bytes + # copy(dst, src, 32 + itemsize*load(src)) + # => + # copy(dst, src, bound) + # (32 + itemsize*(load(src))) costs 4*3+5 gas + length_calc_cost = 6 * (item_size != 1) # PUSH MUL + + if _opt_codesize(): + # if we are optimizing for codesize, we are ok with a much higher + # gas cost before switching to copy(dst, src, exact_length). + # 5x is chosen based on vibes. 
+ length_calc_cost *= 5 + + src_bound = src.typ.memory_bytes_required + if src.location in (CALLDATA, MEMORY) and src_bound <= (5 + length_calc_cost) * 32: + return True + # threshold is 6 words of data (+ 1 length word that we need to copy anyway) + # dload(src) costs additional 7*3 gas + if src.location == DATA and src_bound <= (12 + length_calc_cost) * 32: + return True + return False + + def bytes_data_ptr(ptr): if ptr.location is None: # pragma: nocover raise CompilerPanic("tried to modify non-pointer type") @@ -276,6 +310,9 @@ def _dynarray_make_setter(dst, src, hi=None): n_bytes = add_ofst(_mul(count, element_size), 32) max_bytes = 32 + src.typ.count * element_size + if _prefer_copy_maxbound_heuristic(dst, src, element_size): + n_bytes = max_bytes + # batch copy the entire dynarray, including length word ret.append(copy_bytes(dst, src, n_bytes, max_bytes)) @@ -1038,7 +1075,18 @@ def _complex_make_setter(left, right, hi=None): assert is_tuple_like(left.typ) keys = left.typ.tuple_keys() - if left.is_pointer and right.is_pointer and right.encoding == Encoding.VYPER: + # performance: if there is any dynamic data, there might be + # unused space between the end of the dynarray and the end of the buffer. + # for instance DynArray[uint256, 100] with runtime length of 5. + # in these cases, we recurse to dynarray make_setter which has its own + # heuristic for when to copy all data. + + # use abi_type.is_dynamic since it is identical to the query "do any children + # have dynamic size" + has_dynamic_data = right.typ.abi_type.is_dynamic() + simple_encoding = right.encoding == Encoding.VYPER + + if left.is_pointer and right.is_pointer and simple_encoding and not has_dynamic_data: # both left and right are pointers, see if we want to batch copy # instead of unrolling the loop. 
assert left.encoding == Encoding.VYPER diff --git a/vyper/venom/passes/memmerging.py b/vyper/venom/passes/memmerging.py index f9be679d74..182b1f4e71 100644 --- a/vyper/venom/passes/memmerging.py +++ b/vyper/venom/passes/memmerging.py @@ -85,8 +85,8 @@ def merge(self, other: "_Copy"): self.length = new_length self.insts.extend(other.insts) - def __repr__(self) -> str: - return f"({self.src}, {self.src_end}, {self.length}, {self.dst}, {self.dst_end})" + def __repr__(self) -> str: # pragma: nocover + return f"({self.src}, {self.dst}, {self.length})" class MemMergePass(IRPass): @@ -114,7 +114,7 @@ def _optimize_copy(self, bb: IRBasicBlock, copy_opcode: str, load_opcode: str): copy.insts.sort(key=bb.instructions.index) if copy_opcode == "mcopy": - assert not copy.overwrites_self_src() + assert not copy.overwrites_self_src(), (copy, copy.insts, bb) pin_inst = None inst = copy.insts[-1] @@ -277,8 +277,14 @@ def _barrier(): if self._write_after_write_hazard(n_copy): _barrier() # check if the new copy does not overwrites existing data - if not allow_dst_overlaps_src and self._read_after_write_hazard(n_copy): - _barrier() + if not allow_dst_overlaps_src: + if n_copy.overwrites_self_src(): + # continue otherwise we will get an assertion failure + # in _optimize_copy + continue + if self._read_after_write_hazard(n_copy): + _barrier() + self._add_copy(n_copy) elif _volatile_memory(inst):