diff --git a/tests/unit/compiler/venom/test_memmerging.py b/tests/unit/compiler/venom/test_memmerging.py index 3da09f76fb..70f5a0a9c6 100644 --- a/tests/unit/compiler/venom/test_memmerging.py +++ b/tests/unit/compiler/venom/test_memmerging.py @@ -767,6 +767,21 @@ def test_memmerging_double_use(): _check_pre_post(pre, post) +def test_existing_mcopy_overlap_nochange(): + """ + Check that mcopy which already contains an overlap does not get optimized + """ + if not version_check(begin="cancun"): + return + + pre = """ + _global: + mcopy 32, 33, 2 + return %1 + """ + _check_no_change(pre) + + @pytest.mark.parametrize("load_opcode,copy_opcode", LOAD_COPY) def test_memmerging_load(load_opcode, copy_opcode): pre = f""" diff --git a/vyper/codegen/core.py b/vyper/codegen/core.py index aaf6f35047..1e2134c28f 100644 --- a/vyper/codegen/core.py +++ b/vyper/codegen/core.py @@ -177,10 +177,44 @@ def make_byte_array_copier(dst, src): # batch copy the bytearray (including length word) using copy_bytes len_ = add_ofst(get_bytearray_length(src), 32) max_bytes = src.typ.maxlen + 32 + + if _prefer_copy_maxbound_heuristic(dst, src, item_size=1): + len_ = max_bytes + + # batch copy the entire dynarray, including length word ret = copy_bytes(dst, src, len_, max_bytes) return b1.resolve(ret) +# heuristic to choose whether to copy the full maxbound of the buffer instead of computing the runtime length +def _prefer_copy_maxbound_heuristic(dst, src, item_size): + if dst.location != MEMORY: + return False + + # a heuristic - it's cheaper to just copy the extra buffer bytes + # than calculate the number of bytes + # copy(dst, src, 32 + itemsize*load(src)) + # => + # copy(dst, src, bound) + # (32 + itemsize*(load(src))) costs 4*3+5 gas + length_calc_cost = 6 * (item_size != 1) # PUSH MUL + + if _opt_codesize(): + # if we are optimizing for codesize, we are ok with a much higher + # gas cost before switching to copy(dst, src, exact_length). + # 5x is chosen based on vibes. 
+ length_calc_cost *= 5 + + src_bound = src.typ.memory_bytes_required + if src.location in (CALLDATA, MEMORY) and src_bound <= (5 + length_calc_cost) * 32: + return True + # threshold is 6 words of data (+ 1 length word that we need to copy anyway) + # dload(src) costs additional 7*3 gas + if src.location == DATA and src_bound <= (12 + length_calc_cost) * 32: + return True + return False + + def bytes_data_ptr(ptr): if ptr.location is None: # pragma: nocover raise CompilerPanic("tried to modify non-pointer type") @@ -276,6 +310,9 @@ def _dynarray_make_setter(dst, src, hi=None): n_bytes = add_ofst(_mul(count, element_size), 32) max_bytes = 32 + src.typ.count * element_size + if _prefer_copy_maxbound_heuristic(dst, src, element_size): + n_bytes = max_bytes + # batch copy the entire dynarray, including length word ret.append(copy_bytes(dst, src, n_bytes, max_bytes)) @@ -1038,7 +1075,18 @@ def _complex_make_setter(left, right, hi=None): assert is_tuple_like(left.typ) keys = left.typ.tuple_keys() - if left.is_pointer and right.is_pointer and right.encoding == Encoding.VYPER: + # performance: if there is any dynamic data, there might be + # unused space between the end of the dynarray and the end of the buffer. + # for instance DynArray[uint256, 100] with runtime length of 5. + # in these cases, we recurse to dynarray make_setter which has its own + # heuristic for when to copy all data. + + # use abi_type.is_dynamic since it is identical to the query "do any children + # have dynamic size" + has_dynamic_data = right.typ.abi_type.is_dynamic() + simple_encoding = right.encoding == Encoding.VYPER + + if left.is_pointer and right.is_pointer and simple_encoding and not has_dynamic_data: # both left and right are pointers, see if we want to batch copy # instead of unrolling the loop. 
assert left.encoding == Encoding.VYPER diff --git a/vyper/venom/passes/memmerging.py b/vyper/venom/passes/memmerging.py index f9be679d74..182b1f4e71 100644 --- a/vyper/venom/passes/memmerging.py +++ b/vyper/venom/passes/memmerging.py @@ -85,8 +85,8 @@ def merge(self, other: "_Copy"): self.length = new_length self.insts.extend(other.insts) - def __repr__(self) -> str: - return f"({self.src}, {self.src_end}, {self.length}, {self.dst}, {self.dst_end})" + def __repr__(self) -> str: # pragma: nocover + return f"({self.src}, {self.dst}, {self.length})" class MemMergePass(IRPass): @@ -114,7 +114,7 @@ def _optimize_copy(self, bb: IRBasicBlock, copy_opcode: str, load_opcode: str): copy.insts.sort(key=bb.instructions.index) if copy_opcode == "mcopy": - assert not copy.overwrites_self_src() + assert not copy.overwrites_self_src(), (copy, copy.insts, bb) pin_inst = None inst = copy.insts[-1] @@ -277,8 +277,14 @@ def _barrier(): if self._write_after_write_hazard(n_copy): _barrier() # check if the new copy does not overwrites existing data - if not allow_dst_overlaps_src and self._read_after_write_hazard(n_copy): - _barrier() + if not allow_dst_overlaps_src: + if n_copy.overwrites_self_src(): + # continue otherwise we will get an assertion failure + # in _optimize_copy + continue + if self._read_after_write_hazard(n_copy): + _barrier() + self._add_copy(n_copy) elif _volatile_memory(inst):