diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..36c09e4
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,45 @@
+name: Benchmark
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install juliaup
+      uses: julia-actions/install-juliaup@v2.1.2
+      with:
+        channel: '1'
+    - name: Update Julia registry
+      shell: julia --project=. --color=yes {0}
+      run: |
+        using Pkg
+        Pkg.Registry.update()
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.9'  # quoted so YAML never parses a version as a float
+    - name: Install dependencies
+      run: |
+        pip install -e .[test] # to put juliapkg.json in sys.path
+        python -c 'import juliacall' # force install of all deps
+    - name: Benchmark
+      run: |
+        pytest -n 0 benchmark/benchmark.py --benchmark-json=benchmark/output.json
+    - name: Store benchmark result
+      uses: benchmark-action/github-action-benchmark@v1
+      with:
+        name: Python Benchmark with pytest-benchmark
+        tool: 'pytest'
+        output-file-path: benchmark/output.json
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        auto-push: ${{ github.event_name == 'push' }}  # publish main only, not PR runs
+        # Show alert with commit comment on detecting possible performance regression
+        alert-threshold: '200%'
+        comment-on-alert: true
+        fail-on-alert: true
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
new file mode 100644
index 0000000..d8894ed
--- /dev/null
+++ b/benchmark/benchmark.py
@@ -0,0 +1,151 @@
+import numpy as np
+import pytest
+from braket.devices import LocalSimulator
+from braket.ir.openqasm import Program
+
+# always the same for repeatability
+np.random.seed(0x1C2C6D66)
+
+batch_size = (10, 100)  # programs per run_batch call in the batched benchmarks
+n_qubits = range(3, 16)  # qubit counts 3..15 inclusive
+exact_shots_results = (  # result pragmas benchmarked with shots=0
+    "state_vector",
+    "density_matrix q[0], q[1]",
+    "probability",
+    "expectation z(q[0])",
+    "variance y(q[0])",
+)
+nonzero_shots_results = (  # result pragmas benchmarked with shots > 0
+    "probability",
+    "expectation z(q[0])",
+    "variance y(q[0])",
+    "sample z(q[0])",
+)
+
+
+def ghz(nq: int, result_type: str):
+    """OpenQASM 3 source for an ``nq``-qubit GHZ circuit with one result pragma."""
+    source = f"OPENQASM 3.0;\nqubit[{nq}] q;\nh q[0];\n"
+    # entangle every remaining qubit with q[0], including the last one, q[nq - 1]
+    for q in range(1, nq):
+        source += f"cnot q[0], q[{q}];\n"
+    return source + f"#pragma braket result {result_type}\n"
+
+
+def qft(nq: int, result_type: str):
+    """OpenQASM 3 source for an ``nq``-qubit QFT (no final swaps) with one result pragma."""
+    source = f"OPENQASM 3.0;\nqubit[{nq}] q;\n"
+    for q in range(nq):  # every qubit, including q[nq - 1], gets its Hadamard
+        angle = np.pi / 2.0
+        source += f"h q[{q}];\n"
+        for ctrl_q in range(q + 1, nq):  # each later qubit controls a halved phase
+            source += f"cphaseshift({angle}) q[{ctrl_q}], q[{q}];\n"
+            angle /= 2.0
+    source += f"#pragma braket result {result_type}\n"
+    return source
+
+
+def run_sim(oq3_prog, sim, shots):
+    """Call ``sim.run`` on one program; its return value is discarded."""
+    sim.run(oq3_prog, shots=shots)
+
+
+def run_sim_batch(oq3_prog, sim, shots):
+    """Call ``sim.run_batch`` on a list of programs; its return value is discarded."""
+    sim.run_batch(oq3_prog, shots=shots)
+
+
+# names passed to LocalSimulator(...); circuit builders called as f(nq, result_type)
+device_ids = ("braket_sv", "braket_sv_v2", "braket_dm", "braket_dm_v2")
+generators = (ghz, qft)
+
+
+@pytest.mark.parametrize("device_id", device_ids)
+@pytest.mark.parametrize("nq", n_qubits)
+@pytest.mark.parametrize("exact_results", exact_shots_results)
+@pytest.mark.parametrize("circuit", generators)
+def test_exact_shots(benchmark, device_id, nq, exact_results, circuit):
+    """Benchmark one exact (shots=0) run of ``circuit`` on ``device_id``."""
+    on_dm_device = device_id in ("braket_dm_v2", "braket_dm")
+    # density-matrix devices skip state_vector results and anything above 10 qubits
+    if on_dm_device and (exact_results == "state_vector" or nq > 10):
+        pytest.skip()
+    wants_density_matrix = exact_results == "density_matrix q[0], q[1]"
+    # never true while n_qubits stops at 15; kept as a guard for larger ranges
+    if device_id == "braket_sv" and wants_density_matrix and nq >= 17:
+        pytest.skip()
+    program = Program(source=circuit(nq, exact_results))
+    simulator = LocalSimulator(device_id)
+    benchmark.pedantic(
+        run_sim, args=(program, simulator, 0), iterations=5, warmup_rounds=1
+    )
+
+
+@pytest.mark.parametrize("device_id", device_ids)
+@pytest.mark.parametrize("nq", n_qubits)
+@pytest.mark.parametrize("batch_size", batch_size)
+@pytest.mark.parametrize("exact_results", exact_shots_results)
+@pytest.mark.parametrize("circuit", generators)
+def test_exact_shots_batched(
+    benchmark, device_id, nq, batch_size, exact_results, circuit
+):
+    """Benchmark an exact (shots=0) ``run_batch`` over ``batch_size`` programs."""
+    # exact string comparison: state_vector is skipped on density-matrix devices
+    if device_id in ("braket_dm_v2", "braket_dm") and (
+        exact_results == "state_vector" or nq >= 5
+    ):
+        pytest.skip()
+    if nq >= 10:
+        pytest.skip()
+    # skip all for now as this is very expensive
+    pytest.skip()
+    oq3_prog = [Program(source=circuit(nq, exact_results)) for _ in range(batch_size)]
+    sim = LocalSimulator(device_id)
+    args = (oq3_prog, sim, 0)
+    benchmark.pedantic(run_sim_batch, args=args, iterations=5, warmup_rounds=1)
+
+
+shots = (100,)  # shot counts parametrized into the nonzero-shots benchmarks
+
+
+@pytest.mark.parametrize("device_id", device_ids)
+@pytest.mark.parametrize("nq", n_qubits)
+@pytest.mark.parametrize("shots", shots)
+@pytest.mark.parametrize("nonzero_shots_results", nonzero_shots_results)
+@pytest.mark.parametrize("circuit", generators)
+def test_nonzero_shots(benchmark, device_id, nq, shots, nonzero_shots_results, circuit):
+    """Benchmark one run of ``circuit`` on ``device_id`` with ``shots`` shots."""
+    # density-matrix devices are only benchmarked up to 10 qubits
+    too_large_for_dm = device_id in ("braket_dm_v2", "braket_dm") and nq > 10
+    if too_large_for_dm:
+        pytest.skip()
+    program = Program(source=circuit(nq, nonzero_shots_results))
+    simulator = LocalSimulator(device_id)
+    run_args = (program, simulator, shots)
+    benchmark.pedantic(run_sim, args=run_args, iterations=5, warmup_rounds=1)
+
+
+@pytest.mark.parametrize("device_id", device_ids)
+@pytest.mark.parametrize("nq", n_qubits)
+@pytest.mark.parametrize("batch_size", batch_size)
+@pytest.mark.parametrize("shots", shots)
+@pytest.mark.parametrize("nonzero_shots_results", nonzero_shots_results)
+@pytest.mark.parametrize("circuit", generators)
+def test_nonzero_shots_batched(
+    benchmark, device_id, nq, batch_size, shots, nonzero_shots_results, circuit
+):
+    """Benchmark a ``run_batch`` of ``batch_size`` programs with ``shots`` shots."""
+    on_dm_device = device_id in ("braket_dm_v2", "braket_dm")
+    if nq >= 10 or (on_dm_device and nq >= 5):
+        pytest.skip()
+
+    # skip all for now as this is very expensive
+    pytest.skip()
+
+    programs = [
+        Program(source=circuit(nq, nonzero_shots_results))
+        for _ in range(batch_size)
+    ]
+    simulator = LocalSimulator(device_id)
+    run_args = (programs, simulator, shots)
+    benchmark.pedantic(run_sim_batch, args=run_args, iterations=5, warmup_rounds=1)
diff --git a/requirements.txt b/requirements.txt
index 40f36cd..83c0e17 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-juliacall==0.9.22
+juliacall==0.9.23
 numpy
 amazon-braket-schemas>=1.20.2
 amazon-braket-sdk>=1.83.0
diff --git a/src/braket/juliapkg.json b/src/braket/juliapkg.json
index 9ad41c3..407b497 100644
--- a/src/braket/juliapkg.json
+++ b/src/braket/juliapkg.json
@@ -1,9 +1,9 @@
 {
-    "julia": "1.9",
+    "julia": "1.10",
     "packages": {
         "BraketSimulator": {
             "uuid": "76d27892-9a0b-406c-98e4-7c178e9b3dff",
-            "version": "0.0.4"
+            "version": "0.0.5"
         },
         "JSON3": {
             "uuid": "0f8b85d8-7281-11e9-16c2-39a750bddbf1",
diff --git a/src/braket/simulator_v2/base_simulator_v2.py b/src/braket/simulator_v2/base_simulator_v2.py
index 5ab4bc8..faf0799 100644
--- a/src/braket/simulator_v2/base_simulator_v2.py
+++ b/src/braket/simulator_v2/base_simulator_v2.py
@@ -26,7 +26,7 @@ def setup_julia():
     # don't reimport if we don't have to
     if "juliacall" in sys.modules:
         os.environ["PYTHON_JULIACALL_HANDLE_SIGNALS"] = "yes"
-        return sys.modules["juliacall"].Main
+        return
     else:
         for k, default in (
             ("PYTHON_JULIACALL_HANDLE_SIGNALS", "yes"),
@@ -40,40 +40,19 @@ def setup_julia():
         import juliacall
 
         jl = juliacall.Main
-        jl.seval("using JSON3, BraketSimulator")
-        sv_stock_oq3 = """
-        OPENQASM 3.0;
-        input float theta;
-        qubit[2] q;
-        h q[0];
-        cnot q;
-        x q[0];
-        xx(theta) q;
-        yy(theta) q;
-        zz(theta) q;
-        #pragma braket result expectation z(q[0])
-        """
-        dm_stock_oq3 = """
+        jl.seval("using BraketSimulator, JSON3")
+        stock_oq3 = """
         OPENQASM 3.0;
-        input float theta;
         qubit[2] q;
         h q[0];
-        x q[0];
+        cphaseshift(1.5707963267948966) q[1], q[0];
         cnot q;
-        xx(theta) q;
-        yy(theta) q;
-        zz(theta) q;
         #pragma braket noise bit_flip(0.1) q[0]
+        #pragma braket result variance y(q[0])
+        #pragma braket result density_matrix q[0], q[1]
         #pragma braket result probability
         """
-        r = jl.BraketSimulator.simulate(
-            "braket_sv_v2", sv_stock_oq3, '{"theta": 0.1}', 0
-        )
-        jl.JSON3.write(r)
-        r = jl.BraketSimulator.simulate(
-            "braket_dm_v2", dm_stock_oq3, '{"theta": 0.1}', 0
-        )
-        jl.JSON3.write(r)
+        jl.BraketSimulator.simulate("braket_dm_v2", stock_oq3, "{}", 0)
         return
 
 
@@ -86,6 +65,29 @@ def setup_pool():
     return
 
 
+def _handle_mmaped_result(raw_result, mmap_paths, obj_lengths):
+    """Build the task result, memory-mapping any result values left empty in the JSON.
+    Paths and lengths are consumed in order, one pair per result type with no value.
+    """
+    task_result = GateModelTaskResult(**raw_result)
+    if not mmap_paths:
+        return task_result
+    next_mmap = 0
+    for result_type in task_result.resultTypes:
+        if result_type.value:
+            continue
+        # state vectors and density matrices are read as complex, the rest as real
+        holds_complex = isinstance(result_type.type, (DensityMatrix, StateVector))
+        result_type.value = np.memmap(
+            mmap_paths[next_mmap],
+            dtype=np.complex128 if holds_complex else np.float64,
+            mode="r",
+            shape=(obj_lengths[next_mmap],),
+        )
+        next_mmap += 1
+    return task_result
+
+
 class BaseLocalSimulatorV2(BaseLocalSimulator):
     def __init__(self, device: str):
         global __JULIA_POOL__
@@ -126,8 +128,8 @@ def run_openqasm(
         except Exception as e:
             _handle_julia_error(e)
 
-        result = GateModelTaskResult(**json.loads(jl_result))
-        jl_result = None
+        loaded_result = json.loads(jl_result[0])
+        result = _handle_mmaped_result(loaded_result, jl_result[1], jl_result[2])
         result.additionalMetadata.action = openqasm_ir
 
         # attach the result types
@@ -165,8 +167,15 @@ def run_multiple(
         except Exception as e:
             _handle_julia_error(e)
 
+        loaded_result = json.loads(jl_results[0])
+        paths_and_lens = json.loads(jl_results[1])
+        results_paths_lens = [
+            (loaded_result[r_ix], paths_and_lens[r_ix][0], paths_and_lens[r_ix][1])
+            for r_ix in range(len(loaded_result))
+        ]
         results = [
-            GateModelTaskResult(**json.loads(jl_result)) for jl_result in jl_results
+            _handle_mmaped_result(*result_path_len)
+            for result_path_len in results_paths_lens
         ]
         jl_results = None
         for p_ix, program in enumerate(programs):
@@ -204,9 +213,9 @@ def reconstruct_complex(v):
             }
         if isinstance(result_type.type, StateVector):
             val = task_result.resultTypes[result_ind].value
-            # complex are stored as tuples of reals
-            fixed_val = [reconstruct_complex(v) for v in val]
-            task_result.resultTypes[result_ind].value = np.asarray(fixed_val)
+            if isinstance(val, list):
+                fixed_val = [reconstruct_complex(v) for v in val]
+                task_result.resultTypes[result_ind].value = np.asarray(fixed_val)
         if isinstance(result_type.type, DensityMatrix):
             val = task_result.resultTypes[result_ind].value
             # complex are stored as tuples of reals
diff --git a/src/braket/simulator_v2/julia_workers.py b/src/braket/simulator_v2/julia_workers.py
index 6189f7d..f85289e 100644
--- a/src/braket/simulator_v2/julia_workers.py
+++ b/src/braket/simulator_v2/julia_workers.py
@@ -8,6 +8,7 @@
 
 def _handle_julia_error(error):
     # in case juliacall isn't loaded
+    print(error)
     if type(error).__name__ == "JuliaError":
         python_exception = getattr(error.exception, "alternate_type", None)
         if python_exception is None:
@@ -29,18 +30,20 @@ def translate_and_run(
     device_id: str, openqasm_ir: OpenQASMProgram, shots: int = 0
 ) -> str:
     jl = sys.modules["juliacall"].Main
-    jl_shots = shots
+    jl.GC.enable(False)
     jl_inputs = json.dumps(openqasm_ir.inputs) if openqasm_ir.inputs else "{}"
     try:
         result = jl.BraketSimulator.simulate(
             device_id,
             openqasm_ir.source,
             jl_inputs,
-            jl_shots,
+            shots,
         )
 
     except Exception as e:
         _handle_julia_error(e)
+    finally:
+        jl.GC.enable(True)
 
     return result