diff --git a/.cirun.yml b/.cirun.yml new file mode 100644 index 000000000..263d0faaa --- /dev/null +++ b/.cirun.yml @@ -0,0 +1,11 @@ +runners: + - name: aws-gpu-runner + cloud: aws + instance_type: g4dn.xlarge + machine_image: ami-067a4ba2816407ee9 + region: eu-north-1 + preemptible: + - true + - false + labels: + - cirun-aws-gpu diff --git a/.conda/meta.yaml b/.conda/meta.yaml index bd971e10e..2f433a28e 100644 --- a/.conda/meta.yaml +++ b/.conda/meta.yaml @@ -40,7 +40,7 @@ requirements: - numba >=0.41.0 - pooch >=1.7.0 - joblib >=1.3.1 - - logomaker + - logomaker !=0.8.5 test: source_files: @@ -57,7 +57,7 @@ test: imports: - scirpy commands: - - pytest --pyargs scirpy -m "not extra" + - pytest --pyargs scirpy -m "not extra and not gpu" about: home: https://scirpy.scverse.org diff --git a/.github/workflows/test-gpu.yaml b/.github/workflows/test-gpu.yaml new file mode 100644 index 000000000..bba741219 --- /dev/null +++ b/.github/workflows/test-gpu.yaml @@ -0,0 +1,65 @@ +name: GPU-CI + +on: + push: + branches: [main] + pull_request: + types: + - labeled + - opened + - synchronize + +# Cancel the job if new commits are pushed +# https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + check: + runs-on: ubuntu-latest + steps: + - uses: flying-sheep/check@v1 + with: + success: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'run-gpu-ci') }} + test: + name: GPU Tests + needs: check + runs-on: "cirun-aws-gpu--${{ github.run_id }}" + timeout-minutes: 30 + + defaults: + run: + shell: bash -el {0} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Nvidia SMI sanity check + run: nvidia-smi + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install UV + uses: hynek/setup-cached-uv@v2 + with: + cache-dependency-path: pyproject.toml + + - name: Install scirpy + run: uv pip install --system -e ".[dev,test,rpack,dandelion,diversity,parasail,cupy]" + - name: Pip list + run: pip list + + - name: Run test + run: pytest -m gpu + + - name: Remove 'run-gpu-ci' Label + if: always() + uses: actions-ecosystem/action-remove-labels@v1 + with: + labels: "run-gpu-ci" + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2bfe23276..50534c84d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -59,7 +59,7 @@ jobs: PLATFORM: ${{ matrix.os }} DISPLAY: :42 run: | - coverage run -m pytest -v --color=yes + coverage run -m pytest -v --color=yes -m "not gpu" - name: Report coverage run: | coverage report diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f4bdc26b..806b53b4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,16 @@ and this project adheres to [Semantic Versioning][]. 
 [keep a changelog]: https://keepachangelog.com/en/1.0.0/
 [semantic versioning]: https://semver.org/spec/v2.0.0.html
 
+## v0.21.0
+
+### Additions
+
+- Add GPU implementation of Hamming distance ([#541](https://github.com/scverse/scirpy/pull/541))
+
+### Documentation
+
+- Add tutorial with tips for large datasets ([#541](https://github.com/scverse/scirpy/pull/541))
+
 ## v0.20.1
 
 ### Fixes
diff --git a/docs/api.rst b/docs/api.rst
index 6b35ad434..14b86aace 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -301,6 +301,7 @@ distance metrics
     ir_dist.metrics.IdentityDistanceCalculator
     ir_dist.metrics.LevenshteinDistanceCalculator
     ir_dist.metrics.HammingDistanceCalculator
+    ir_dist.metrics.GPUHammingDistanceCalculator
     ir_dist.metrics.AlignmentDistanceCalculator
     ir_dist.metrics.FastAlignmentDistanceCalculator
     ir_dist.metrics.TCRdistDistanceCalculator
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 4017dc2d8..6a181420f 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -8,3 +8,4 @@ Tutorials
    tutorials/tutorial_io.ipynb
    tutorials/tutorial_3k_tcr.ipynb
    tutorials/tutorial_5k_bcr.ipynb
+   tutorials/large-datasets.md
diff --git a/docs/tutorials/large-datasets.md b/docs/tutorials/large-datasets.md
new file mode 100644
index 000000000..ebdb8b8d8
--- /dev/null
+++ b/docs/tutorials/large-datasets.md
@@ -0,0 +1,68 @@
+# Working with >1M cells
+
+Scirpy scales to millions of cells on a single workstation. This page is a work-in-progress collection of advice on how to
+work with large datasets.
+
+:::{admonition} Use an up-to-date version!
+:class: tip
+
+Scalability has been a major focus of recent developments in Scirpy. Make sure you use the latest version
+when working with large datasets to take advantage of all speedups.
+:::
+
+## Distance metrics
+
+Computing pairwise sequence distances is the major bottleneck for large datasets in the scirpy workflow.
+Here is some advice on how to maximize the speed of this step:
+
+### Choose an appropriate distance metric for `pp.ir_dist`
+
+Some distance metrics are significantly faster than others. Here are the distance metrics, roughly ordered by speed:
+
+`identity` > `gpu_hamming` > `hamming` = `normalized_hamming` > `tcrdist` > `levenshtein` > `fastalignment` > `alignment`
+
+The `tcrdist`, `fastalignment`, and `alignment` metrics produce very similar distance matrices, but `tcrdist` is by far the fastest.
+For this reason, we recommend `tcrdist` whenever you need a metric that takes a substitution matrix into account.
+
+### Multi-machine parallelization with dask
+
+The `hamming`, `normalized_hamming`, `tcrdist`, `levenshtein`, `fastalignment`, and `alignment` metrics are parallelized
+using [joblib](https://joblib.readthedocs.io/en/stable/). This makes it very easy to switch the backend to
+[dask](https://www.dask.org/) to distribute jobs across a multi-machine cluster. Note that this comes with a
+considerable overhead for communication between the workers. It's only worthwhile when processing on a single
+machine becomes infeasible.
+
+```python
+from dask.distributed import Client, LocalCluster
+import joblib
+import scirpy as ir
+
+# substitute this with a multi-machine cluster...
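+# (hypothetical example, not part of the snippet above: on an actual multi-machine
+# cluster you would typically connect to a running dask scheduler instead, e.g.
+# client = Client("tcp://<scheduler-address>:8786"), where the address is a placeholder)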
+cluster = LocalCluster(n_workers=16)
+client = Client(cluster)
+
+with joblib.parallel_config(backend="dask", n_jobs=200, verbose=10):
+    ir.pp.ir_dist(
+        mdata,
+        metric="tcrdist",
+        n_jobs=1,  # jobs per worker
+        n_blocks=20,  # number of blocks sent to dask
+    )
+```
+
+### Using GPU acceleration for hamming distance
+
+The Hamming distance metric supports GPU acceleration via [cupy](https://cupy.dev/).
+
+First, install the optional `cupy` dependency:
+
+```bash
+pip install 'scirpy[cupy]'
+```
+
+Then simply run
+
+```python
+ir.pp.ir_dist(mdata, metric="gpu_hamming")
+```
+
+to take advantage of GPU acceleration.
diff --git a/pyproject.toml b/pyproject.toml
index efff31d84..56f24d293 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,6 +81,9 @@ test = [
     'rectangle-packer',
     'parasail != 1.2.1',
 ]
+cupy = [
+    'cupy-cuda12x',
+]
 dandelion = [
     'sc-dandelion>=0.3.5',
 ]
@@ -112,7 +115,8 @@ xfail_strict = true
 # ]
 markers = [
     "conda: marks a subset of tests to be ran on the Bioconda CI.",
-    "extra: marks tests that require extra dependencies."
+    "extra: marks tests that require extra dependencies.",
+    "gpu: marks tests that require a GPU.",
 ]
 minversion = 6.0
 norecursedirs = [ '.*', 'build', 'dist', '*.egg', 'data', '__pycache__']
diff --git a/src/scirpy/ir_dist/__init__.py b/src/scirpy/ir_dist/__init__.py
index 4e2a065f7..90d22c06b 100644
--- a/src/scirpy/ir_dist/__init__.py
+++ b/src/scirpy/ir_dist/__init__.py
@@ -35,7 +35,16 @@ def IrNeighbors(*args, **kwargs):
 MetricType = (
-    Literal["alignment", "fastalignment", "identity", "levenshtein", "hamming", "normalized_hamming", "tcrdist"]
+    Literal[
+        "alignment",
+        "fastalignment",
+        "identity",
+        "levenshtein",
+        "hamming",
+        "gpu_hamming",
+        "normalized_hamming",
+        "tcrdist",
+    ]
     | metrics.DistanceCalculator
 )
@@ -51,6 +60,8 @@ def IrNeighbors(*args, **kwargs):
         See :class:`~scirpy.ir_dist.metrics.TCRdistDistanceCalculator`.
     * `hamming` -- Hamming distance for CDR3 sequences of equal length.
         See :class:`~scirpy.ir_dist.metrics.HammingDistanceCalculator`.
+    * `gpu_hamming` -- Hamming distance for CDR3 sequences of equal length, calculated on the GPU.
+        See :class:`~scirpy.ir_dist.metrics.GPUHammingDistanceCalculator`.
     * `normalized_hamming` -- Normalized Hamming distance (in percent) for CDR3 sequences of equal length.
         See :class:`~scirpy.ir_dist.metrics.HammingDistanceCalculator`.
     * `alignment` -- Distance based on pairwise sequence alignments using the
@@ -105,6 +116,8 @@ def _get_distance_calculator(metric: MetricType, cutoff: int | None, *, n_jobs=-
         dist_calc = metrics.HammingDistanceCalculator(n_jobs=n_jobs, **kwargs)
     elif metric == "normalized_hamming":
         dist_calc = metrics.HammingDistanceCalculator(n_jobs=n_jobs, normalize=True, **kwargs)
+    elif metric == "gpu_hamming":
+        dist_calc = metrics.GPUHammingDistanceCalculator(**kwargs)
     elif metric == "tcrdist":
         dist_calc = metrics.TCRdistDistanceCalculator(n_jobs=n_jobs, **kwargs)
     else:
@@ -184,6 +197,8 @@ def _ir_dist(
         Like `chain_idx_key`, but for `reference`.
     **kwargs
         Arguments are passed to the respective :class:`~scirpy.ir_dist.metrics.DistanceCalculator` class.
+        See the documentation of the respective distance calculator for metric-specific parameters
+        that can be passed via `kwargs`.
    Returns
    -------
diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py
index f9f320510..56bf96df7 100644
--- a/src/scirpy/ir_dist/metrics.py
+++ b/src/scirpy/ir_dist/metrics.py
@@ -396,7 +396,7 @@ def _seqs2mat(
     mat = -1 * np.ones((len(seqs), max_len), dtype=np.int8)
     L = np.zeros(len(seqs), dtype=np.int8)
     for si, s in enumerate(seqs):
-        L[si] = len(s)
+        L[si] = min(len(s), max_len)
         for aai in range(max_len):
             if aai >= len(s):
                 break
@@ -752,6 +752,375 @@ def _nb_hamming_mat():
     _metric_mat = _hamming_mat
+
+class GPUHammingDistanceCalculator(_MetricDistanceCalculator):
+    """Computes pairwise distances between gene sequences based on the "hamming" distance metric with GPU support.
+
+    The code of this class is based on `pwseqdist <https://github.com/agartland/pwseqdist>`_.
+    Reused under MIT license, Copyright (c) 2020 Andrew Fiore-Gartland.
+
+    For performance reasons, the computation of the final result matrix is split up into several blocks. The parameter
+    gpu_n_blocks determines the number of those blocks. The parameter gpu_block_width determines how much GPU memory
+    is reserved for the computed result of each block in SPARSE representation.
+
+    E.g. assume a not yet computed 1000x1000 result matrix (dense representation) with gpu_n_blocks=10 and gpu_block_width=20.
+    Then the result matrix is computed in 10 blocks of 1000x100 (dense representation). Once computed, each of these blocks needs
+    to fit into a 1000x20 block in SPARSE representation, and this 1000x20 block needs to fit into GPU memory. So no row of a
+    result block should contain more than 20 values <= cutoff.
+
+    The parameter gpu_block_width should be chosen based on the available GPU memory. Choosing lower values for gpu_n_blocks increases
+    the performance but also increases the risk of running out of reserved memory, since the result blocks that need to fit into the
+    reserved GPU memory in sparse representation get bigger.
+
+    Parameters
+    ----------
+    cutoff:
+        Will eliminate distances > cutoff to make efficient
+        use of sparse matrices.
+    gpu_n_blocks:
+        Number of blocks in which the final result matrix should be computed. Each block reserves GPU memory
+        in which the computed result block has to fit in sparse representation. Lower values give better performance
+        but increase the risk of running out of reserved memory. This value should be chosen based on the
+        estimated sparsity of the result matrix and the size of the GPU device memory.
+    gpu_block_width:
+        Maximum width of blocks in which the final result matrix should be computed. Each block reserves GPU memory
+        in which the computed result block has to fit in sparse representation. Higher values allow for a lower
+        number of result blocks (gpu_n_blocks) which increases the performance. This value should be chosen based on
+        the GPU device memory.
+    """
+
+    def __init__(
+        self,
+        *,
+        cutoff: int = 2,
+        gpu_n_blocks: int = 10,
+        gpu_block_width: int = 1000,
+    ):
+        super().__init__(n_jobs=1, n_blocks=1)
+        self.cutoff = cutoff
+        self.gpu_n_blocks = gpu_n_blocks
+        self.gpu_block_width = gpu_block_width
+
+    def _gpu_hamming_mat(
+        self,
+        *,
+        seqs: Sequence[str],
+        seqs2: Sequence[str],
+        is_symmetric: bool = False,
+        start_column: int = 0,
+    ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]:
+        """Computes the pairwise hamming distances for sequences in seqs and seqs2 with GPU support.
+
+        Parameters
+        ----------
+        seqs/2:
+            A python sequence of strings representing gene sequences
+        is_symmetric:
+            Determines whether the final result matrix is symmetric, assuming that this function is
+            only used to compute a block of a bigger result matrix
+        start_column:
+            Determines at which column the calculation should be started. This is only used if this function is
+            used to compute a block of a bigger result matrix that is symmetric
+
+        Returns
+        -------
+        data_rows:
+            List with array containing the non-zero data values of the result matrix,
+            needed to create the final scipy CSR result matrix later
+        indices_rows:
+            List with array containing the non-zero entry column indices of the result matrix,
+            needed to create the final scipy CSR result matrix later
+        row_element_counts:
+            Array with integers that indicate the amount of non-zero values of the result matrix per row,
+            needed to create the final scipy CSR result matrix later
+        row_mins:
+            Always returns a numpy array containing None because the computation of the minimum distance per row is
+            not implemented for the GPU hamming calculator yet.
+        """
+        import cupy as cp
+        from tqdm import tqdm
+
+        seqs_lengths = np.vectorize(len)(seqs)
+        seqs_original_indices = np.argsort(seqs_lengths)
+        seqs = seqs[seqs_original_indices]
+
+        seqs2_lengths = np.vectorize(len)(seqs2)
+        seqs2_original_indices = np.argsort(seqs2_lengths)
+        seqs2 = seqs2[seqs2_original_indices]
+
+        seqs_original_indices = cp.asarray(seqs_original_indices, dtype=np.int32)
+        seqs2_original_indices = cp.asarray(seqs2_original_indices, dtype=np.int32)
+
+        # The symmetry shortcut is not implemented for the GPU kernel yet; always compute the full block.
+        is_symmetric = False
+
+        max_seq_len = max(len(s) for s in (*seqs, *seqs2))
+
+        def _seqs2mat_fast(seqs: Sequence[str], max_len: None | int = None) -> tuple[np.ndarray, np.ndarray]:
+            if max_len is None:
+                max_len = np.max([len(s) for s in seqs])
+            mat = -1 * np.ones((len(seqs), max_len), dtype=np.int8)
+            L = np.zeros(len(seqs), dtype=np.int8)
+            for i, seq in enumerate(seqs):
+                mat[i][0 : len(seq)] = np.frombuffer(seq.encode("ascii"), dtype=np.uint8)
+                L[i] = len(seq)
+            return mat, L
+
+        try:
+            seqs_mat1, seqs_L1 = _seqs2mat_fast(seqs, max_len=max_seq_len)
+            seqs_mat2, seqs_L2 = _seqs2mat_fast(seqs2, max_len=max_seq_len)
+        except UnicodeError:
+            logging.info(
+                "A UnicodeError occurred while converting sequences, retrying with the implementation for non-ASCII sequences"
+            )
+            unique_characters = "".join(sorted({char for string in (*seqs, *seqs2) for char in string}))
+            seqs_mat1, seqs_L1 = _seqs2mat(seqs, alphabet=unique_characters, max_len=max_seq_len)
+            seqs_mat2, seqs_L2 = _seqs2mat(seqs2, alphabet=unique_characters, max_len=max_seq_len)
+
+        hamming_kernel = cp.RawKernel(
+            r"""
+            extern "C" __global__ __launch_bounds__(256)
+            void hamming_kernel(
+                const char* __restrict__ seqs_mat1,
+                const char* __restrict__ seqs_mat2,
+                const int* __restrict__ seqs_L1,
+                const int* seqs_L2,
+                const int* __restrict__ seqs_original_indices,
+                const int* seqs2_original_indices,
+                const int cutoff,
+                char* __restrict__ data,
+                int* __restrict__ indices,
+                int* __restrict__ row_element_counts,
+                const int block_offset,
+                const int seqs_mat1_rows,
+                const int seqs_mat2_rows,
+                const int seqs_mat1_cols,
+                const int seqs_mat2_cols,
+                const int data_cols,
+                const int indices_cols,
+                const bool is_symmetric
+            ) {
+                int row = blockDim.x * blockIdx.x + threadIdx.x;
+                if (row < seqs_mat1_rows) {
+                    int seqs_original_index = seqs_original_indices[row];
+                    int seq1_len = seqs_L1[row];
+                    int row_end_index = 0;
+
+                    for (int col = 0; col <
seqs_mat2_rows; col++) { + if ((! is_symmetric ) || (col + block_offset) >= row) { + int seq2_len = seqs_L2[col]; + char distance = 1; + + if (seq1_len == seq2_len) { + for (int i = 0; i < seq1_len; i++) { + char val1 = seqs_mat1[i*seqs_mat1_rows+row]; + char val2 = seqs_mat2[i*seqs_mat2_rows+col]; + + if(val1 != val2) { + distance++; + } + } + if (distance <= cutoff + 1) { + int seqs2_original_index = seqs2_original_indices[col]; + data[seqs_original_index * data_cols + row_end_index] = distance; + indices[seqs_original_index * indices_cols + row_end_index] = seqs2_original_index; + row_end_index++; + } + } + } + } + row_element_counts[seqs_original_index] = row_end_index; + } + } + """, + "hamming_kernel", + options=("--maxrregcount=256",), + ) + + create_csr_kernel = cp.RawKernel( + r""" + extern "C" __global__ + void create_csr_kernel( + int* data, int* indices, + char* data_matrix, int* indices_matrix, + int* indptr, int data_matrix_rows, int data_matrix_cols, int data_rows, int indices_matrix_cols + ) { + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = blockDim.y * blockIdx.y + threadIdx.y; + + if (row < data_matrix_rows && col < data_matrix_cols) { + int row_start = indptr[row]; + int row_end = indptr[row + 1]; + int row_end_index = row_end - row_start; + int data_index = row_start + col; + + if ((data_index < data_rows) && (col < row_end_index)) { + data[data_index] = data_matrix[row * data_matrix_cols + col]; + indices[data_index] = indices_matrix[row * indices_matrix_cols + col]; + } + } + } + """, + "create_csr_kernel", + ) + + def calc_block_gpu( + seqs_mat1, seqs_mat2_block, seqs_L1_block, seqs_L2, seqs2_original_indices_blocks, block_offset + ): + import cupy as cp + + d_seqs_mat1 = cp.asarray(seqs_mat1.astype(np.int8)) + d_seqs_mat2 = cp.asarray(seqs_mat2_block.astype(np.int8)) + d_seqs_L1 = cp.asarray(seqs_L1_block.astype(np.int32)) + d_seqs_L2 = cp.asarray(seqs_L2.astype(np.int32)) + + # Due to performance reasons and since we expect the result matrix to be very sparse, we + # set a maximum result width for the current block + max_block_width = self.gpu_block_width + + d_data_matrix = cp.empty((seqs_mat1.shape[0], max_block_width), dtype=cp.int8) + d_indices_matrix = cp.empty((seqs_mat1.shape[0], max_block_width), dtype=np.int32) + d_row_element_counts = cp.zeros(seqs_mat1.shape[0], dtype=np.int32) + + threads_per_block = 256 + blocks_per_grid = (seqs_mat1.shape[0] + (threads_per_block - 1)) // threads_per_block + + seqs_mat1_rows, seqs_mat1_cols = seqs_mat1.shape + seqs_mat2_rows, seqs_mat2_cols = seqs_mat2_block.shape + d_data_matrix_cols = max_block_width + d_indices_matrix_cols = max_block_width + + d_seqs_mat1_transposed = cp.transpose(d_seqs_mat1).copy() + d_seqs_mat2_transposed = cp.transpose(d_seqs_mat2).copy() + + hamming_kernel( + (blocks_per_grid,), + (threads_per_block,), + ( + d_seqs_mat1_transposed, + d_seqs_mat2_transposed, + d_seqs_L1, + d_seqs_L2, + seqs_original_indices, + seqs2_original_indices_blocks, + self.cutoff, + d_data_matrix, + d_indices_matrix, + d_row_element_counts, + block_offset, + seqs_mat1_rows, + seqs_mat2_rows, + seqs_mat1_cols, + seqs_mat2_cols, + d_data_matrix_cols, + d_indices_matrix_cols, + is_symmetric, + ), + ) + + row_element_counts = d_row_element_counts.get() + row_max_len = np.max(row_element_counts) + row_element_sum = np.sum(row_element_counts, dtype=np.int64) + + assert ( + row_max_len <= max_block_width + ), f"""ERROR: The chosen result block width is too small to hold all result values of the current block. 
+ Chosen width: {max_block_width}, Necessary width: {row_max_len}.""" + + assert ( + row_element_sum <= np.iinfo(np.int32).max + ), f"""ERROR: There are too many result values to be held by the resulting CSR matrix of the current block. + Current number: {row_element_sum}, Maximum number: {np.iinfo(np.int32).max}. + Consider choosing a smaller cutoff to resolve this issue.""" + + indptr = np.zeros(seqs_mat1.shape[0] + 1, dtype=np.int32) + indptr[1:] = np.cumsum(row_element_counts) + d_indptr = cp.asarray(indptr) + + n_elements = indptr[-1] + data = np.zeros(n_elements, dtype=np.int32) + d_data = cp.zeros_like(data) + + indices = np.zeros(n_elements, dtype=np.int32) + d_indices = cp.zeros_like(indices) + + threads_per_block = (1, 256) + blocks_per_grid_x = (d_data_matrix.shape[0] + threads_per_block[0] - 1) // threads_per_block[0] + blocks_per_grid_y = (d_data_matrix.shape[1] + threads_per_block[1] - 1) // threads_per_block[1] + blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y) + + create_csr_kernel( + (blocks_per_grid_x, blocks_per_grid_y), + threads_per_block, + ( + d_data, + d_indices, + d_data_matrix, + d_indices_matrix, + d_indptr, + d_data_matrix.shape[0], + d_data_matrix.shape[1], + d_data.shape[0], + d_indices_matrix.shape[1], + ), + ) + + data = d_data.get() + indptr = d_indptr.get() + indices = d_indices.get() + + res = csr_matrix((data, indices, indptr), shape=(seqs_mat1.shape[0], seqs_mat2.shape[0])) + return res + + # Set the number of blocks for the calculation. A higher number can be more memory friendly, whereas + # a lower number can improve the performance. + n_blocks = self.gpu_n_blocks + + seqs_mat2_blocks = np.array_split(seqs_mat2, n_blocks) + seqs_L2_blocks = np.array_split(seqs_L2, n_blocks) + seqs2_original_indices_blocks = np.array_split(seqs2_original_indices, n_blocks) + result_blocks = [None] * n_blocks + + block_offset = start_column + + logging.info( + f"\nStart GPU calculations for {n_blocks} sparse matrix result blocks of max width {self.gpu_block_width}:" + ) + + for i in tqdm(range(0, n_blocks), desc="Processing", unit="block"): + result_blocks[i] = calc_block_gpu( + seqs_mat1, + seqs_mat2_blocks[i], + seqs_L1, + seqs_L2_blocks[i], + seqs2_original_indices_blocks[i], + block_offset, + ) + block_offset += seqs_mat2_blocks[i].shape[0] + + num_elements = 0 + for i in range(0, len(result_blocks)): + num_elements += result_blocks[i].indptr[-1] + + assert ( + num_elements <= np.iinfo(np.int32).max + ), f"""ERROR: The overall number of result values is too high to construct the final CSR matrix by combining + the already calculated blocks. + Current number: {num_elements}, Maximum number: {np.iinfo(np.int32).max}. + Consider choosing a smaller cutoff to resolve this issue.""" + + result_sparse = result_blocks[0] + for i in range(1, len(result_blocks)): + result_sparse += result_blocks[i] + + row_element_counts_gpu = np.diff(result_sparse.indptr) + + result_sparse.sort_indices() + + # Returns the results in a way that fits the current interface, could be improved later + return [result_sparse.data], [result_sparse.indices], row_element_counts_gpu, np.array([None]) + + _metric_mat = _gpu_hamming_mat + + class TCRdistDistanceCalculator(_MetricDistanceCalculator): """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. 
@@ -837,9 +1206,6 @@ def _tcrdist_mat( ---------- seqs/2: A python sequence of strings representing gene sequences - seqs_L1/2: - A vector containing the length of each sequence in the respective seqs_mat matrix, - without the padding in seqs_mat is_symmetric: Determines whether the final result matrix is symmetric, assuming that this function is only used to compute a block of a bigger result matrix diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index a7b7da4e2..29fe02328 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -10,6 +10,7 @@ AlignmentDistanceCalculator, DistanceCalculator, FastAlignmentDistanceCalculator, + GPUHammingDistanceCalculator, HammingDistanceCalculator, IdentityDistanceCalculator, LevenshteinDistanceCalculator, @@ -743,3 +744,20 @@ def test_tcrdist_histogram_not_implemented(): tcrdist_calculator = TCRdistDistanceCalculator(histogram=True) seqs = np.array(["AAAA", "AA", "AABB", "ABA"]) _ = tcrdist_calculator.calc_dist_mat(seqs, seqs) + + +@pytest.mark.gpu +def test_gpu_hamming_reference(): + # test hamming distance against reference implementation + from . import TESTDATA + + seqs = np.load(TESTDATA / "hamming_test_data/hamming_WU3k_seqs.npy") + reference_result = scipy.sparse.load_npz(TESTDATA / "hamming_test_data/hamming_WU3k_csr_result.npz") + + gpu_hamming_calculator = GPUHammingDistanceCalculator(cutoff=2, gpu_n_blocks=5, gpu_block_width=500) + res = gpu_hamming_calculator.calc_dist_mat(seqs, seqs) + + assert np.array_equal(res.data, reference_result.data) + assert np.array_equal(res.indices, reference_result.indices) + assert np.array_equal(res.indptr, reference_result.indptr) + assert np.array_equal(res.todense(), reference_result.todense())
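As a possible follow-up (not part of this diff), the GPU implementation could additionally be cross-checked against the CPU `HammingDistanceCalculator` on a small synthetic input. A minimal sketch, assuming `HammingDistanceCalculator` accepts the same `cutoff` argument and that both calculators use the same sparse distance encoding:

```python
@pytest.mark.gpu
def test_gpu_hamming_matches_cpu_small():
    # Hypothetical additional test: GPU and CPU hamming results should agree on a tiny input.
    seqs = np.array(["AAAA", "AAAB", "AABB", "ABBB", "BBBB"])
    cpu = HammingDistanceCalculator(cutoff=2, n_jobs=1)
    gpu = GPUHammingDistanceCalculator(cutoff=2, gpu_n_blocks=2, gpu_block_width=10)
    res_cpu = cpu.calc_dist_mat(seqs, seqs)
    res_gpu = gpu.calc_dist_mat(seqs, seqs)
    assert np.array_equal(res_cpu.todense(), res_gpu.todense())
```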