From cde338a57b8de47dc570430044efa8ba8e416bdd Mon Sep 17 00:00:00 2001
From: Zhiyuan Chen
Date: Sat, 7 Sep 2024 02:14:26 +0800
Subject: [PATCH] add data

Signed-off-by: Zhiyuan Chen
---
 .github/workflows/push.yaml                  |   9 +-
 .gitmodules                                  |   3 +
 data                                         |   1 +
 demo/data/local-file.py                      |  19 +
 demo/{ => models}/direct-access.py           |   0
 demo/{ => models}/multimolecule-automodel.py |   0
 demo/{ => models}/transformers-automodel.py  |   0
 demo/{ => models}/vanilla.py                 |   0
 docs/docs/data/dataset.md                    |   9 +
 docs/docs/data/index.md                      |   9 +
 docs/mkdocs.yml                              |   5 +
 multimolecule/__init__.py                    |   8 +-
 multimolecule/data/README.md                 |  21 +
 multimolecule/data/README.zh.md              |  21 +
 multimolecule/data/__init__.py               |  20 +
 multimolecule/data/dataset.py                | 411 +++++++++++++++++++
 multimolecule/data/utils.py                  | 125 ++++++
 multimolecule/defaults.py                    |  23 ++
 multimolecule/models/README.md               |   8 +-
 multimolecule/models/README.zh.md            |   8 +-
 multimolecule/module/heads/nucleotide.py     |   2 +-
 multimolecule/module/heads/registry.py       |   2 +-
 multimolecule/module/heads/token.py          |   2 +-
 multimolecule/tasks/__init__.py              |  19 +
 multimolecule/tasks/task.py                  |  52 +++
 pyproject.toml                               |   3 +-
 requirements.txt                             |   5 +
 tests/data/test_dataset.py                   | 201 +++++++++
 28 files changed, 969 insertions(+), 17 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 data
 create mode 100644 demo/data/local-file.py
 rename demo/{ => models}/direct-access.py (100%)
 rename demo/{ => models}/multimolecule-automodel.py (100%)
 rename demo/{ => models}/transformers-automodel.py (100%)
 rename demo/{ => models}/vanilla.py (100%)
 create mode 100644 docs/docs/data/dataset.md
 create mode 100644 docs/docs/data/index.md
 create mode 100644 multimolecule/data/README.md
 create mode 100644 multimolecule/data/README.zh.md
 create mode 100644 multimolecule/data/__init__.py
 create mode 100644 multimolecule/data/dataset.py
 create mode 100644 multimolecule/data/utils.py
 create mode 100644 multimolecule/defaults.py
 create mode 100644 multimolecule/tasks/__init__.py
 create mode 100644 multimolecule/tasks/task.py
 create mode 100644 requirements.txt
 create mode 100644 tests/data/test_dataset.py

diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
index 707f80b2..b591ed92 100644
--- a/.github/workflows/push.yaml
+++ b/.github/workflows/push.yaml
@@ -18,14 +18,16 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: true
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
      - name: Install dependencies for testing
-        run: pip install pytest pytest-cov torch torchvision
+        run: pip install pytest pytest-cov
      - name: Install module
-        run: pip install -e .
+        run: pip install -r requirements.txt && pip install -e .
      - name: pytest
        run: pytest --cov=multimolecule --cov-report=xml --cov-report=html .
- name: Upload coverage report for documentation @@ -83,11 +85,11 @@ jobs: release: if: startsWith(github.event.ref, 'refs/tags/v') needs: [lint, test] + environment: pypi permissions: contents: write id-token: write runs-on: ubuntu-latest - environment: pypi steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 @@ -110,6 +112,7 @@ jobs: develop: if: contains(fromJson('["refs/heads/master", "refs/heads/main"]'), github.ref) needs: [lint, test] + environment: pypi permissions: contents: write runs-on: ubuntu-latest diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..aa89b5f4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "data"] + path = data + url = git@github.com:MultiMolecule/data.git diff --git a/data b/data new file mode 160000 index 00000000..0ee715c7 --- /dev/null +++ b/data @@ -0,0 +1 @@ +Subproject commit 0ee715c795df8d00cebe627961e1ed153aed42ac diff --git a/demo/data/local-file.py b/demo/data/local-file.py new file mode 100644 index 00000000..d12fb669 --- /dev/null +++ b/demo/data/local-file.py @@ -0,0 +1,19 @@ +# MultiMolecule +# Copyright (C) 2024-Present MultiMolecule + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from multimolecule.data import Dataset + +data = Dataset("data/rna/5utr.csv", split="train", pretrained="multimolecule/rna") diff --git a/demo/direct-access.py b/demo/models/direct-access.py similarity index 100% rename from demo/direct-access.py rename to demo/models/direct-access.py diff --git a/demo/multimolecule-automodel.py b/demo/models/multimolecule-automodel.py similarity index 100% rename from demo/multimolecule-automodel.py rename to demo/models/multimolecule-automodel.py diff --git a/demo/transformers-automodel.py b/demo/models/transformers-automodel.py similarity index 100% rename from demo/transformers-automodel.py rename to demo/models/transformers-automodel.py diff --git a/demo/vanilla.py b/demo/models/vanilla.py similarity index 100% rename from demo/vanilla.py rename to demo/models/vanilla.py diff --git a/docs/docs/data/dataset.md b/docs/docs/data/dataset.md new file mode 100644 index 00000000..58508f35 --- /dev/null +++ b/docs/docs/data/dataset.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# Dataset + +::: multimolecule.data.Dataset diff --git a/docs/docs/data/index.md b/docs/docs/data/index.md new file mode 100644 index 00000000..c84872ac --- /dev/null +++ b/docs/docs/data/index.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# data + +--8<-- "multimolecule/data/README.md:8:" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 57532e19..ead43c12 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -9,6 +9,9 @@ repo_url: https://github.com/DLS5-Omics/multimolecule nav: - index.md + - data: + - data/index.md + - Dataset: data/dataset.md - module: - module/index.md - heads: module/heads.md @@ -182,6 +185,8 @@ plugins: - https://docs.python.org/3/objects.inv - 
https://pytorch.org/docs/stable/objects.inv - https://huggingface.co/docs/transformers/master/en/objects.inv + - https://huggingface.co/docs/datasets/master/en/objects.inv + - https://pandas.pydata.org/docs/objects.inv - https://danling.org/objects.inv - https://chanfig.danling.org/objects.inv - section-index diff --git a/multimolecule/__init__.py b/multimolecule/__init__.py index 5f0dc995..c168ce48 100644 --- a/multimolecule/__init__.py +++ b/multimolecule/__init__.py @@ -14,7 +14,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from . import models, tokenisers +from .data import Dataset from .models import ( AutoModelForContactPrediction, AutoModelForNucleotidePrediction, @@ -130,17 +130,18 @@ TokenKMerHead, TokenPredictionHead, ) +from .tasks import Task, TaskLevel, TaskType from .tokenisers import Alphabet, DnaTokenizer, DotBracketTokenizer, ProteinTokenizer, RnaTokenizer, Tokenizer from .utils import count_parameters __all__ = [ "modeling_auto", "modeling_outputs", + "Dataset", "PreTrainedConfig", "HeadConfig", "BaseHeadConfig", "MaskedLMHeadConfig", - "tokenisers", "DnaTokenizer", "RnaTokenizer", "ProteinTokenizer", @@ -254,4 +255,7 @@ "SinusoidalEmbedding", "Criterion", "count_parameters", + "Task", + "TaskLevel", + "TaskType", ] diff --git a/multimolecule/data/README.md b/multimolecule/data/README.md new file mode 100644 index 00000000..cb2bfc99 --- /dev/null +++ b/multimolecule/data/README.md @@ -0,0 +1,21 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# data + +`data` provides a collection of data processing utilities for handling data. + +While :hugs: [`datasets`](https://huggingface.co/docs/datasets) is a powerful library for managing datasets, it is a general-purpose tool that may not cover all the specific functionalities of scientific applications. + +The `data` package is designed to complement [`datasets`](https://huggingface.co/docs/datasets) by offering additional data processing utilities that are commonly used in scientific tasks. + +## Usage + +### Load from local data file + +```python +--8<-- "demo/data/local-file.py:17:" +``` diff --git a/multimolecule/data/README.zh.md b/multimolecule/data/README.zh.md new file mode 100644 index 00000000..a5533e07 --- /dev/null +++ b/multimolecule/data/README.zh.md @@ -0,0 +1,21 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# data + +`data` 提供了一系列用于处理数据的实用工具。 + +尽管 :hugs: [`datasets`](https://huggingface.co/docs/datasets) 是一个强大的管理数据集的库,但它是一个通用工具,可能无法涵盖科学应用程序的所有特定功能。 + +`data` 包旨在通过提供在科学任务中常用的数据处理实用程序来补充 [`datasets`](https://huggingface.co/docs/datasets)。 + +## Usage + +### 从本地数据文件加载 + +```python +--8<-- "demo/data/local-file.py:17:" +``` diff --git a/multimolecule/data/__init__.py b/multimolecule/data/__init__.py new file mode 100644 index 00000000..62196c10 --- /dev/null +++ b/multimolecule/data/__init__.py @@ -0,0 +1,20 @@ +# MultiMolecule +# Copyright (C) 2024-Present MultiMolecule + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from .dataset import Dataset +from .utils import no_collate + +__all__ = ["Dataset", "no_collate"] diff --git a/multimolecule/data/dataset.py b/multimolecule/data/dataset.py new file mode 100644 index 00000000..5aec6302 --- /dev/null +++ b/multimolecule/data/dataset.py @@ -0,0 +1,411 @@ +# MultiMolecule +# Copyright (C) 2024-Present MultiMolecule + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from __future__ import annotations + +from collections.abc import Iterable, Mapping, Sequence +from functools import cached_property +from typing import Any, List +from warnings import warn + +import danling as dl +import datasets +import pyarrow as pa +import torch +from chanfig import NestedDict +from danling import NestedTensor +from datasets.table import Table +from pandas import DataFrame +from torch import Tensor +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from multimolecule import defaults +from multimolecule.tasks import Task + +from .utils import infer_discrete_map, infer_task, map_value + +# from multimolecule.tokenisers.dot_bracket.utils import STANDARD_ALPHABET as DOT_BRACKET_ALPHABET + + +class Dataset(datasets.Dataset): + r""" + The base class for all datasets. + + Dataset is a subclass of [`datasets.Dataset`][] that provides additional functionality for handling structured data. + It has three main features: + + - column identification: identify the special columns (sequence and structure columns) in the dataset. + - tokenization: tokenize the sequence columns in the dataset using a pretrained tokenizer. + - task inference: infer the task type and level of each label column in the dataset. + + Attributes: + tasks: A nested dictionary of the inferred tasks for each label column in the dataset. + tokenizer: The pretrained tokenizer to use for tokenization. + truncation: Whether to truncate sequences that exceed the maximum length of the tokenizer. + max_length: The maximum length of the input sequences. + data_cols: The names of all columns in the dataset. + feature_cols: The names of the feature columns in the dataset. + label_cols: The names of the label columns in the dataset. + sequence_cols: The names of the sequence columns in the dataset. + column_names_map: A mapping of column names to new column names. + preprocess: Whether to preprocess the dataset. + + Args: + data: The dataset. This can be a path to a file, a tag on the Hugging Face Hub, a pyarrow.Table, + a [dict][], a [list][], or a [pandas.DataFrame][]. + split: The split of the dataset. + tokenizer: A pretrained tokenizer to use for tokenization. + Either `tokenizer` or `pretrained` must be specified. + pretrained: The name of a pretrained tokenizer to use for tokenization. + Either `tokenizer` or `pretrained` must be specified. + feature_cols: The names of the feature columns in the dataset. 
+ Will be inferred automatically if not specified. + label_cols: The names of the label columns in the dataset. + Will be inferred automatically if not specified. + preprocess: Whether to preprocess the dataset. + Preprocessing involves pre-tokenizing the sequences using the tokenizer. + Defaults to `True`. + auto_rename_cols: Whether to automatically rename columns to standard names. + Only works when there is exactly one feature column / one label column. + You can control the naming through `multimolecule.defaults.SEQUENCE_COL_NAME` and + `multimolecule.defaults.LABEL_COL_NAME`. + For more refined control, use `column_names_map`. + column_names_map: A mapping of column names to new column names. + This is useful for renaming columns to inputs that are expected by a model. + Defaults to `None`. + truncation: Whether to truncate sequences that exceed the maximum length of the tokenizer. + Defaults to `False`. + max_length: The maximum length of the input sequences. + Defaults to the `model_max_length` of the tokenizer. + info: The dataset info. + indices_table: The indices table. + fingerprint: The fingerprint of the dataset. + """ + + tokenizer: PreTrainedTokenizerBase + truncation: bool = False + max_length: int + + _id_cols: List + _feature_cols: List + _label_cols: List + + _sequence_cols: List + + _discrete_map: Mapping + + preprocess: bool = True + auto_rename_cols: bool = False + column_names_map: Mapping[str, str] | None = None + + def __init__( + self, + data: Table | DataFrame | dict | list | str, + split: datasets.NamedSplit, + tokenizer: PreTrainedTokenizerBase | None = None, + pretrained: str | None = None, + feature_cols: List | None = None, + label_cols: List | None = None, + id_cols: List | None = None, + preprocess: bool | None = None, + auto_rename_cols: bool | None = None, + column_names_map: Mapping[str, str] | None = None, + truncation: bool | None = None, + max_length: int | None = None, + info: datasets.DatasetInfo | None = None, + indices_table: Table | None = None, + fingerprint: str | None = None, + nan_process: str = "ignore", + fill_value: str | int | float = 0, + discrete_map: Mapping[str, int] | None = None, + ): + arrow_table = self.build_table( + data, split, feature_cols, label_cols, nan_process=nan_process, fill_value=fill_value + ) + super().__init__( + arrow_table=arrow_table, split=split, info=info, indices_table=indices_table, fingerprint=fingerprint + ) + self.identify_special_cols(feature_cols=feature_cols, label_cols=label_cols, id_cols=id_cols) + self.infer_discrete_map(discrete_map) + self.post( + tokenizer=tokenizer, + pretrained=pretrained, + preprocess=preprocess, + truncation=truncation, + max_length=max_length, + auto_rename_cols=auto_rename_cols, + column_names_map=column_names_map, + ) + + def build_table( + self, + data: Table | DataFrame | dict | str, + split: datasets.NamedSplit, + feature_cols: List | None = None, + label_cols: List | None = None, + nan_process: str | None = "ignore", + fill_value: str | int | float = 0, + ) -> datasets.table.Table: + if isinstance(data, str): + try: + data = datasets.load_dataset(data, split=split).data + except FileNotFoundError: + data = dl.load_pandas(data) + if isinstance(data, DataFrame): + data = data.loc[:, ~data.columns.str.contains("^Unnamed")] + data = pa.Table.from_pandas(data) + elif isinstance(data, dict): + data = pa.Table.from_pydict(data) + elif isinstance(data, list): + data = pa.Table.from_pylist(data) + elif isinstance(data, DataFrame): + data = pa.Table.from_pandas(data) + if 
feature_cols is not None and label_cols is not None:
+            data = data.select(feature_cols + label_cols)
+        data = self.process_nan(data, nan_process=nan_process, fill_value=fill_value)
+        return data
+
+    def post(
+        self,
+        tokenizer: PreTrainedTokenizerBase | None = None,
+        pretrained: str | None = None,
+        max_length: int | None = None,
+        truncation: bool | None = None,
+        preprocess: bool | None = None,
+        auto_rename_cols: bool | None = None,
+        column_names_map: Mapping[str, str] | None = None,
+    ) -> None:
+        r"""
+        Perform pre-processing steps after initialization.
+
+        It first identifies the special columns (sequence and structure columns) in the dataset.
+        Then it sets the feature and label columns based on the input arguments.
+        If `auto_rename_cols` is `True`, it will automatically rename the columns to model inputs.
+        Finally, it sets the [`transform`][datasets.Dataset.set_transform] function based on the `preprocess` flag.
+        """
+        if tokenizer is None:
+            if pretrained is None:
+                raise ValueError("tokenizer and pretrained cannot both be None.")
+            tokenizer = AutoTokenizer.from_pretrained(pretrained)
+        if max_length is None:
+            max_length = tokenizer.model_max_length
+        else:
+            tokenizer.model_max_length = max_length
+        self.max_length = max_length
+        if truncation is not None:
+            self.truncation = truncation
+        self.tokenizer = tokenizer
+        if preprocess is not None:
+            self.preprocess = preprocess
+        if auto_rename_cols is not None:
+            self.auto_rename_cols = auto_rename_cols
+        if self.auto_rename_cols:
+            if column_names_map is not None:
+                raise ValueError("auto_rename_cols and column_names_map are mutually exclusive.")
+            column_names_map = {}
+            if len(self.feature_cols) == 1:
+                column_names_map[self.feature_cols[0]] = defaults.SEQUENCE_COL_NAME
+            if len(self.label_cols) == 1:
+                column_names_map[self.label_cols[0]] = defaults.LABEL_COL_NAME
+        self.column_names_map = column_names_map
+        if self.column_names_map:
+            self.rename_columns(self.column_names_map)
+
+        if self.preprocess:
+            self.update(self.map(self.tokenization))
+            self.update(self.map(self.map_discrete))
+            self.set_transform(self.torch_transform)
+        else:
+            self.set_transform(self.tokenize_transform)
+
+    @cached_property
+    def tasks(self) -> NestedDict:
+        return self.infer_tasks()
+
+    def torch_transform(self, batch: Mapping) -> Mapping:
+        r"""
+        Default [`transform`][datasets.Dataset.set_transform] function when `preprocess` is `True`.
+
+        See Also:
+            [`collate`](multimolecule.Dataset.collate)
+        """
+        return {k: self.collate(k, v) for k, v in batch.items()}
+
+    def tokenize_transform(self, batch: Mapping) -> Mapping:
+        r"""
+        Default [`transform`][datasets.Dataset.set_transform] function when `preprocess` is `False`.
+
+        See Also:
+            [`collate`](multimolecule.Dataset.collate)
+        """
+        return {k: self.collate(k, v) for k, v in batch.items()}
+
+    def collate(self, col: str, data: Any) -> Tensor | NestedTensor | None:
+        r"""
+        Collate the data for a column.
+
+        If the column is a sequence column, it will tokenize the data if it has not been tokenized yet.
+        Otherwise, it will return a tensor or nested tensor.
+ """ + if col in self.sequence_cols: + if isinstance(data[0], str): + data = self.tokenize(data) + return dl.tensor(data) if len(data) == 1 else NestedTensor(data) + if col in self.discrete_map and not self.preprocess: + data = map_value(data, self.discrete_map[col]) + if isinstance(data[0], str): + return data + try: + return torch.tensor(data) + except ValueError: + return NestedTensor(data) + + def infer_tasks(self, sequence_col: str | None = None) -> NestedDict: + return NestedDict({col: self.infer_task(col, sequence_col) for col in self.label_cols}) + + def infer_task(self, label_col: str, sequence_col: str | None = None) -> Task: + if sequence_col is None: + if len(self.sequence_cols) != 1: + raise ValueError("sequence_col must be specified if there are multiple sequence columns.") + sequence_col = self.sequence_cols[0] + sequence = self._data.column(sequence_col) + column = self._data.column(label_col) + return infer_task(sequence, column) + + def infer_discrete_map(self, discrete_map: Mapping | None = None): + self._discrete_map = discrete_map or NestedDict() + data_cols = [i for i in self.data_cols if i not in self.discrete_map and i not in self.sequence_cols] + for col in data_cols: + discrete_map = infer_discrete_map(self._data.column(col)) + if discrete_map: + self._discrete_map[col] = discrete_map # type: ignore[index] + + def __getitems__(self, keys: int | slice | Iterable[int]) -> Any: + return self.__getitem__(keys) + + def identify_special_cols( + self, feature_cols: List | None = None, label_cols: List | None = None, id_cols: List | None = None + ) -> Sequence: + all_cols = self.data.column_names + self._id_cols = id_cols or [i for i in all_cols if i in defaults.ID_COL_NAMES] + + string_cols = [k for k, v in self.features.items() if k not in self.id_cols and v.dtype == "string"] + self._sequence_cols = [i for i in string_cols if i in defaults.SEQUENCE_COL_NAMES] + + data_cols = [i for i in all_cols if i not in self.id_cols] + if label_cols is None: + if feature_cols is None: + feature_cols = [i for i in data_cols if i in defaults.SEQUENCE_COL_NAMES] + label_cols = [i for i in data_cols if i not in feature_cols] + self._label_cols = label_cols + if feature_cols is None: + feature_cols = [i for i in data_cols if i not in self.label_cols] + self._feature_cols = feature_cols + missing_feature_cols = set(self.feature_cols).difference(data_cols) + if missing_feature_cols: + raise ValueError(f"{missing_feature_cols} are specified in feature_cols, but not found in dataset.") + missing_label_cols = set(self.label_cols).difference(data_cols) + if missing_label_cols: + raise ValueError(f"{missing_label_cols} are specified in label_cols, but not found in dataset.") + return string_cols + + def tokenization(self, data: Mapping[str, str]) -> Mapping[str, Tensor]: + return {col: self.tokenize(data[col]) for col in self.sequence_cols} + + def map_discrete(self, data: Mapping) -> Mapping: + return {name: map_value(data[name], mapping) for name, mapping in self.discrete_map.items()} + + def tokenize(self, string: str) -> Tensor: + return self.tokenizer(string, return_attention_mask=False, truncation=self.truncation)["input_ids"] + + def update(self, dataset: datasets.Dataset): + r""" + Perform an in-place update of the dataset. + + This method is used to update the dataset after changes have been made to the underlying data. + It updates the format columns, data, info, and fingerprint of the dataset. + """ + # pylint: disable=W0212 + # Why datasets won't support in-place changes? 
+ # It's just impossible to extend. + self._format_columns = dataset._format_columns + self._data = dataset._data + self._info = dataset._info + self._fingerprint = dataset._fingerprint + + def rename_columns(self, column_mapping: Mapping[str, str], new_fingerprint: str | None = None) -> datasets.Dataset: + self.update(super().rename_columns(column_mapping, new_fingerprint=new_fingerprint)) + self._id_cols = [column_mapping.get(i, i) for i in self.id_cols] + self._feature_cols = [column_mapping.get(i, i) for i in self.feature_cols] + self._label_cols = [column_mapping.get(i, i) for i in self.label_cols] + self._sequence_cols = [column_mapping.get(i, i) for i in self.sequence_cols] + return self + + def rename_column( + self, original_column_name: str, new_column_name: str, new_fingerprint: str | None = None + ) -> datasets.Dataset: + self.update(super().rename_column(original_column_name, new_column_name, new_fingerprint)) + self._id_cols = [new_column_name if i == original_column_name else i for i in self.id_cols] + self._feature_cols = [new_column_name if i == original_column_name else i for i in self.feature_cols] + self._label_cols = [new_column_name if i == original_column_name else i for i in self.label_cols] + self._sequence_cols = [new_column_name if i == original_column_name else i for i in self.sequence_cols] + return self + + def process_nan(self, data: Table, nan_process: str | None, fill_value: str | int | float = 0) -> Table: + if nan_process == "ignore": + return data + data = data.to_pandas() + data = data.replace([float("inf"), -float("inf")], float("nan")) + if data.isnull().values.any(): + if nan_process is None or nan_process == "error": + raise ValueError("NaN / inf values have been found in the dataset.") + warn( + "NaN / inf values have been found in the dataset.\n" + "While we can handle them, the data type of the corresponding column may be set to float, " + "which can and very likely will disrupt the auto task recognition.\n" + "It is recommended to address these values before loading the dataset." + ) + if nan_process == "drop": + data = data.dropna() + elif nan_process == "fill": + data = data.fillna(fill_value) + else: + raise ValueError(f"Invalid nan_process: {nan_process}") + return pa.Table.from_pandas(data) + + @property + def id_cols(self) -> List: + return self._id_cols + + @property + def data_cols(self) -> List: + return self.feature_cols + self.label_cols + + @property + def feature_cols(self) -> List: + return self._feature_cols + + @property + def label_cols(self) -> List: + return self._label_cols + + @property + def sequence_cols(self) -> List: + return self._sequence_cols + + @property + def discrete_map(self) -> Mapping: + return self._discrete_map diff --git a/multimolecule/data/utils.py b/multimolecule/data/utils.py new file mode 100644 index 00000000..f411e615 --- /dev/null +++ b/multimolecule/data/utils.py @@ -0,0 +1,125 @@ +# MultiMolecule +# Copyright (C) 2024-Present MultiMolecule + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any, Tuple
+
+import pyarrow as pa
+from pyarrow import Array, ChunkedArray, ListArray, StringArray
+
+from multimolecule import defaults
+from multimolecule.tasks import Task, TaskLevel, TaskType
+
+
+def no_collate(batch: Any) -> Any:
+    return batch
+
+
+def infer_task(sequence: ChunkedArray | ListArray, column: Array | ChunkedArray | ListArray) -> Task:
+    if isinstance(sequence, ChunkedArray) and sequence.num_chunks == 1:
+        sequence = sequence.chunks[0]
+    if isinstance(column, ChunkedArray) and column.num_chunks == 1:
+        column = column.chunks[0]
+    flattened, levels = flatten_column(column)
+    dtype = flattened.type
+    unique = flattened.unique()
+    num_elem = len(sequence)
+    num_tokens, num_contacts = get_num_tokens(sequence)
+
+    if levels == 0 and len(sequence) == len(column):
+        level = TaskLevel.Sequence
+        num_labels = len(flattened) // num_elem
+    elif levels > 0:
+        num_rows = defaults.TASK_INFERENCE_NUM_ROWS
+        sequence, column = sequence[:num_rows], column[:num_rows]
+        if len(flattened) % num_contacts == 0:
+            level = TaskLevel.Contact
+            num_labels = len(flattened) // num_contacts
+        elif len(flattened) % num_tokens == 0:
+            level = TaskLevel.Nucleotide
+            num_labels = len(flattened) // num_tokens
+        elif len(flattened) % num_elem == 0:
+            level = TaskLevel.Sequence
+            num_labels = len(flattened) // num_elem
+        else:
+            raise ValueError("Unable to infer task: unsupported column structure")
+    else:
+        raise ValueError("Unable to infer task: unsupported column structure")
+
+    if pa.types.is_floating(dtype):
+        return Task(TaskType.Regression, level=level, num_labels=num_labels)
+    if pa.types.is_integer(dtype):
+        if len(unique) == 2:
+            if len(flattened) in (num_elem, num_tokens, num_contacts):
+                return Task(TaskType.Binary, level=level, num_labels=1)
+            return Task(TaskType.MultiLabel, level=level, num_labels=num_labels)
+        if len(unique) / len(column) > defaults.LABEL_TYPE_THRESHOLD:
+            return Task(TaskType.Regression, level=level, num_labels=num_labels)
+        return Task(TaskType.MultiClass, level=level, num_labels=len(unique))
+    raise ValueError(f"Unable to infer task: unsupported dtype {dtype}")
+
+
+def infer_discrete_map(column: Array | ChunkedArray | ListArray) -> dict[str, int] | None:
+    if pa.types.is_floating(column.type):
+        return None
+    flattened, _ = flatten_column(column)
+    if pa.types.is_floating(flattened.type):
+        return None
+    if isinstance(flattened, (ChunkedArray, ListArray, StringArray)):
+        unique = set()
+        for i in flattened:
+            unique.update(i.as_py())
+    else:
+        unique = flattened.unique().to_pylist()
+    ret = {j: i for i, j in enumerate(sorted(unique))}
+    if list(ret.keys()) == list(ret.values()):
+        return None
+    return ret
+
+
+def map_value(value: Any, mapping: dict[str, int] | None) -> Any:
+    if mapping is None:
+        return value
+    if isinstance(value, list) and isinstance(value[0], Iterable):
+        return [[mapping[i] for i in j] for j in value]
+    if isinstance(value, Iterable):
+        return [mapping[i] for i in value]
+    return mapping[value]
+
+
+def flatten_column(column: Array | ChunkedArray | ListArray) -> Tuple[Array, int]:
+    levels = 0
+    while isinstance(column, (ChunkedArray, ListArray)):
+        if isinstance(column, ChunkedArray):
+            column = column.combine_chunks()
+        elif isinstance(column, ListArray):
+            column = column.flatten()
+            levels += 1
+    return column, levels
+
+
+def get_num_tokens(sequence: Array | ListArray) -> Tuple[int, int]:
+    if isinstance(sequence, StringArray):
+        return sum(len(i.as_py()) for i in sequence), sum(len(i.as_py()) ** 2 for i in sequence)
+    # remove <bos> and <eos> tokens in length calculation
+    offset = 0
+    if len({i[0] for i in sequence}) == 1:
+        offset += 1
+    if len({i[-1] for i in sequence}) == 1:
+        offset += 1
+    return sum((len(i) - offset) for i in sequence), sum((len(i) - offset) ** 2 for i in sequence)
diff --git a/multimolecule/defaults.py b/multimolecule/defaults.py
new file mode 100644
index 00000000..eeb176a9
--- /dev/null
+++ b/multimolecule/defaults.py
@@ -0,0 +1,23 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+ID_COL_NAMES = ["id", "idx", "index"]
+SEQUENCE_COL_NAMES = ["input_ids", "sequence", "seq"]
+LABEL_COL_NAMES = ["label", "labels"]
+SEQUENCE_COL_NAME = "input_ids"
+LABEL_COL_NAME = "labels"
+LABEL_TYPE_THRESHOLD = 0.5
+TASK_INFERENCE_NUM_ROWS = 100
diff --git a/multimolecule/models/README.md b/multimolecule/models/README.md
index c7109808..9ef2f7fe 100644
--- a/multimolecule/models/README.md
+++ b/multimolecule/models/README.md
@@ -41,7 +41,7 @@ Similar to [Token Classification](https://huggingface.co/docs/transformers/en/ta
 ### Build with `multimolecule.AutoModel`s
 
 ```python
---8<-- "demo/multimolecule-automodel.py:17:"
+--8<-- "demo/models/multimolecule-automodel.py:17:"
 ```
 
 ### Direct Access
@@ -49,7 +49,7 @@ Similar to [Token Classification](https://huggingface.co/docs/transformers/en/ta
 All models can be directly loaded with the `from_pretrained` method.
 
 ```python
---8<-- "demo/direct-access.py:17:"
+--8<-- "demo/models/direct-access.py:17:"
 ```
 
 ### Build with [`transformers.AutoModel`][]s
@@ -57,7 +57,7 @@ All models can be directly loaded with the `from_pretrained` method.
 While we use a different naming convention for model classes, the models are still registered to corresponding [`transformers.AutoModel`][]s.
 
 ```python
---8<-- "demo/transformers-automodel.py:17:"
+--8<-- "demo/models/transformers-automodel.py:17:"
 ```
 
 !!! danger "`import multimolecule` before use"
@@ -76,7 +76,7 @@ While we use a different naming convention for model classes, the models are sti
 You can also initialize a vanilla model using the model class.
```python ---8<-- "demo/vanilla.py:17:" +--8<-- "demo/models/vanilla.py:17:" ``` ## Available Models diff --git a/multimolecule/models/README.zh.md b/multimolecule/models/README.zh.md index 1e632f43..70ee7a28 100644 --- a/multimolecule/models/README.zh.md +++ b/multimolecule/models/README.zh.md @@ -41,7 +41,7 @@ date: 2024-05-04 ### 使用 `multimolecule.AutoModel` 构建 ```python ---8<-- "demo/multimolecule-automodel.py:17:" +--8<-- "demo/models/multimolecule-automodel.py:17:" ``` ### 直接访问 @@ -49,7 +49,7 @@ date: 2024-05-04 所有模型可以通过 `from_pretrained` 方法直接加载。 ```python ---8<-- "demo/direct-access.py:17:" +--8<-- "demo/models/direct-access.py:17:" ``` ### 使用 [`transformers.AutoModel`][] 构建 @@ -57,7 +57,7 @@ date: 2024-05-04 虽然我们为模型类使用了不同的命名约定,但模型仍然注册到相应的 [`transformers.AutoModel`][] 中。 ```python ---8<-- "demo/transformers-automodel.py:17:" +--8<-- "demo/models/transformers-automodel.py:17:" ``` !!! danger "使用前先 `import multimolecule`" @@ -76,7 +76,7 @@ date: 2024-05-04 你也可以使用模型类初始化一个基础模型。 ```python ---8<-- "demo/vanilla.py:17:" +--8<-- "demo/models/vanilla.py:17:" ``` ## 可用模型 diff --git a/multimolecule/module/heads/nucleotide.py b/multimolecule/module/heads/nucleotide.py index 5b8ec212..50e93fb0 100644 --- a/multimolecule/module/heads/nucleotide.py +++ b/multimolecule/module/heads/nucleotide.py @@ -37,7 +37,7 @@ NucleotideHeadRegistryHF = ConfigRegistry(key="tokenizer_type") -@HeadRegistry.register("nucleotide.single") +@HeadRegistry.nucleotide.register("single", default=True) @NucleotideHeadRegistryHF.register("single", default=True) class NucleotidePredictionHead(PredictionHead): r""" diff --git a/multimolecule/module/heads/registry.py b/multimolecule/module/heads/registry.py index 5a7e12fa..e5393e4e 100644 --- a/multimolecule/module/heads/registry.py +++ b/multimolecule/module/heads/registry.py @@ -16,4 +16,4 @@ from chanfig import Registry -HeadRegistry = Registry(fallback=True) +HeadRegistry = Registry(default_factory=Registry, fallback=True) diff --git a/multimolecule/module/heads/token.py b/multimolecule/module/heads/token.py index fe823619..158621bf 100644 --- a/multimolecule/module/heads/token.py +++ b/multimolecule/module/heads/token.py @@ -36,7 +36,7 @@ TokenHeadRegistryHF = ConfigRegistry(key="tokenizer_type") -@HeadRegistry.register("token.single") +@HeadRegistry.token.register("single", default=True) @TokenHeadRegistryHF.register("single", default=True) class TokenPredictionHead(PredictionHead): r""" diff --git a/multimolecule/tasks/__init__.py b/multimolecule/tasks/__init__.py new file mode 100644 index 00000000..55c77033 --- /dev/null +++ b/multimolecule/tasks/__init__.py @@ -0,0 +1,19 @@ +# MultiMolecule +# Copyright (C) 2024-Present MultiMolecule + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+
+from .task import Task, TaskLevel, TaskType
+
+__all__ = ["Task", "TaskType", "TaskLevel"]
diff --git a/multimolecule/tasks/task.py b/multimolecule/tasks/task.py
new file mode 100644
index 00000000..e2473ab0
--- /dev/null
+++ b/multimolecule/tasks/task.py
@@ -0,0 +1,52 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import auto
+
+try:
+    from enum import StrEnum
+except ImportError:
+    from strenum import LowercaseStrEnum as StrEnum  # type: ignore[no-redef]
+
+
+class TaskType(StrEnum):
+    Binary = auto()
+    MultiClass = auto()
+    MultiLabel = auto()
+    Regression = auto()
+
+
+class TaskLevel(StrEnum):
+    Sequence = auto()
+    Nucleotide = auto()
+    Contact = auto()
+    # Token = auto()
+
+
+@dataclass
+class Task:
+    type: TaskType
+    level: TaskLevel
+    num_labels: int = 1
+
+    def __post_init__(self):
+        if self.type == TaskType.Binary and self.num_labels != 1:
+            raise ValueError(f"num_labels must be 1 for {self.type} task")
+        if self.type in (TaskType.MultiClass, TaskType.MultiLabel) and self.num_labels == 1:
+            raise ValueError(f"num_labels must not be 1 for {self.type} task")
diff --git a/pyproject.toml b/pyproject.toml
index 748b911b..0390c9fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,8 +45,9 @@ dynamic = [
 ]
 dependencies = [
     "accelerate",
-    "chanfig>=0.0.99",
+    "chanfig>=0.0.105",
     "danling>=0.3.6",
+    "datasets",
     "torch",
     "transformers",
 ]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..a40408bc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+biopython
+pandas
+psycopg2
+torch
+torchvision
diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py
new file mode 100644
index 00000000..80559fc4
--- /dev/null
+++ b/tests/data/test_dataset.py
@@ -0,0 +1,201 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+ +import os +from functools import partial + +import danling as dl +import pytest +import torch + +from multimolecule import Dataset, Task, TaskLevel, TaskType + + +@pytest.mark.lfs +class TestRNADataset: + + pretrained = "multimolecule/rna" + root = os.path.join("data", "rna") + + @pytest.mark.parametrize("preprocess", [True, False]) + def test_5utr(self, preprocess: bool): + file = os.path.join(self.root, "5utr.csv") + dataset = Dataset(file, split="train", pretrained=self.pretrained, preprocess=preprocess, auto_rename_cols=True) + task = Task(type=TaskType.Regression, level=TaskLevel.Sequence) + elem = dataset[0] + assert isinstance(elem["input_ids"], dl.PNTensor) + assert isinstance(elem["labels"], torch.FloatTensor) + batch = dataset[list(range(3))] + assert isinstance(batch["input_ids"], dl.NestedTensor) + assert isinstance(batch["labels"], torch.FloatTensor) + assert dataset.tasks["labels"] == task + + @pytest.mark.parametrize("preprocess", [True, False]) + def test_ncrna(self, preprocess: bool): + file = os.path.join(self.root, "ncrna.csv") + dataset = Dataset(file, split="train", pretrained=self.pretrained, preprocess=preprocess, auto_rename_cols=True) + task = Task(type=TaskType.MultiClass, level=TaskLevel.Sequence, num_labels=13) + elem = dataset[0] + assert isinstance(elem["input_ids"], dl.PNTensor) + assert isinstance(elem["labels"], torch.LongTensor) + batch = dataset[list(range(3))] + assert isinstance(batch["input_ids"], dl.NestedTensor) + assert isinstance(batch["labels"], torch.LongTensor) + assert dataset.tasks["labels"] == task + + @pytest.mark.parametrize("preprocess", [True, False]) + def test_rnaswitches(self, preprocess: bool): + file = os.path.join(self.root, "rnaswitches.csv") + label_cols = ["ON", "OFF", "ON_OFF"] + dataset = Dataset(file, split="train", pretrained=self.pretrained, preprocess=preprocess, label_cols=label_cols) + task = Task(type=TaskType.Regression, level=TaskLevel.Sequence) + elem = dataset[0] + assert isinstance(elem["sequence"], dl.PNTensor) + assert isinstance(elem["ON"], torch.FloatTensor) + assert isinstance(elem["OFF"], torch.FloatTensor) + batch = dataset[list(range(3))] + assert isinstance(batch["sequence"], dl.NestedTensor) + assert isinstance(batch["ON_OFF"], torch.FloatTensor) + for t in dataset.tasks.values(): + assert t == task + + @pytest.mark.parametrize("preprocess", [True, False]) + def test_modifications(self, preprocess: bool): + file = os.path.join(self.root, "modifications.json") + dataset = Dataset(file, split="train", pretrained=self.pretrained, preprocess=preprocess) + task = Task(type=TaskType.MultiLabel, level=TaskLevel.Sequence, num_labels=12) + elem = dataset[0] + assert isinstance(elem["sequence"], dl.PNTensor) + assert isinstance(elem["label"], torch.LongTensor) + batch = dataset[list(range(3))] + assert isinstance(batch["sequence"], dl.NestedTensor) + assert isinstance(batch["label"], torch.LongTensor) + assert dataset.tasks["label"] == task + + @pytest.mark.parametrize("preprocess", [True, False]) + def test_degradation(self, preprocess: bool): + file = os.path.join(self.root, "degradation.json") + feature_cols = ["sequence"] # , "structure", "predicted_loop_type"] + label_cols = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C", "deg_pH10", "deg_50C"] + dataset = Dataset( + file, + split="train", + pretrained=self.pretrained, + preprocess=preprocess, + feature_cols=feature_cols, + label_cols=label_cols, + ) + task = Task(type=TaskType.Regression, level=TaskLevel.Sequence, num_labels=68) + elem = dataset[0] + assert 
isinstance(elem["sequence"], dl.PNTensor) + assert isinstance(elem["deg_pH10"], torch.FloatTensor) + assert isinstance(elem["deg_50C"], torch.FloatTensor) + batch = dataset[list(range(3))] + assert isinstance(batch["sequence"], dl.NestedTensor) + assert isinstance(batch["reactivity"], torch.FloatTensor) + for t in dataset.tasks.values(): + assert t == task + + @pytest.mark.parametrize("preprocess", [True, False]) + def test_spliceai(self, preprocess: bool): + file = os.path.join(self.root, "spliceai.json") + feature_cols = ["sequence"] + label_cols = ["splice_ai"] + dataset = Dataset( + file, + split="train", + pretrained=self.pretrained, + preprocess=preprocess, + feature_cols=feature_cols, + label_cols=label_cols, + ) + task = Task(type=TaskType.Binary, level=TaskLevel.Nucleotide, num_labels=1) + elem = dataset[0] + assert isinstance(elem["sequence"], dl.PNTensor) + assert isinstance(elem["splice_ai"], torch.LongTensor) + batch = dataset[list(range(3))] + assert isinstance(batch["sequence"], dl.NestedTensor) + assert isinstance(batch["splice_ai"], torch.LongTensor) + for t in dataset.tasks.values(): + assert t == task + + +@pytest.mark.lfs +class TestSyntheticDataset: + + pretrained = "multimolecule/rna" + root = os.path.join("data", "synthetic") + + def test_null(self): + file = os.path.join(self.root, "null.csv") + dataset_factory = partial(Dataset, file, split="train", pretrained=self.pretrained) + dataset = dataset_factory(nan_process="ignore") + assert len(dataset) == 67 + with pytest.raises(RuntimeError): + dataset[0] + with pytest.raises(ValueError): + dataset = dataset_factory(nan_process="raise") + dataset = dataset_factory(nan_process="fill", fill_value=0) + assert dataset[0]["label"] == 0 + dataset = dataset_factory(nan_process="fill", fill_value=1) + assert dataset[0]["label"] == 1 + dataset = dataset_factory(nan_process="drop") + assert len(dataset) == 61 + + def test_rna_task_recognition_json(self): + file = os.path.join(self.root, "rna.json") + dataset = Dataset(file, split="train", pretrained=self.pretrained) + assert dataset.tasks["sequence_binary"] == Task(type=TaskType.Binary, level=TaskLevel.Sequence, num_labels=1) + assert dataset.tasks["sequence_multiclass"] == Task( + type=TaskType.MultiClass, level=TaskLevel.Sequence, num_labels=7 + ) + assert dataset.tasks["sequence_multilabel"] == Task( + type=TaskType.MultiLabel, level=TaskLevel.Sequence, num_labels=7 + ) + assert dataset.tasks["sequence_multireg"] == Task( + type=TaskType.Regression, level=TaskLevel.Sequence, num_labels=7 + ) + assert dataset.tasks["sequence_regression"] == Task( + type=TaskType.Regression, level=TaskLevel.Sequence, num_labels=1 + ) + assert dataset.tasks["nucleotide_binary"] == Task( + type=TaskType.Binary, level=TaskLevel.Nucleotide, num_labels=1 + ) + assert dataset.tasks["nucleotide_multiclass"] == Task( + type=TaskType.MultiClass, level=TaskLevel.Nucleotide, num_labels=5 + ) + assert dataset.tasks["nucleotide_multilabel"] == Task( + type=TaskType.MultiLabel, level=TaskLevel.Nucleotide, num_labels=5 + ) + assert dataset.tasks["nucleotide_multireg"] == Task( + type=TaskType.Regression, level=TaskLevel.Nucleotide, num_labels=5 + ) + assert dataset.tasks["nucleotide_regression"] == Task( + type=TaskType.Regression, level=TaskLevel.Nucleotide, num_labels=1 + ) + assert dataset.tasks["contact_binary"] == Task(type=TaskType.Binary, level=TaskLevel.Contact, num_labels=1) + assert dataset.tasks["contact_multiclass"] == Task( + type=TaskType.MultiClass, level=TaskLevel.Contact, num_labels=3 + ) + 
assert dataset.tasks["contact_multilabel"] == Task( + type=TaskType.MultiLabel, level=TaskLevel.Contact, num_labels=3 + ) + assert dataset.tasks["contact_multireg"] == Task( + type=TaskType.Regression, level=TaskLevel.Contact, num_labels=3 + ) + assert dataset.tasks["contact_regression"] == Task( + type=TaskType.Regression, level=TaskLevel.Contact, num_labels=1 + )
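
To make the moving parts of this patch concrete, here is a minimal usage sketch (not part of the commit). It assumes the package with this patch applied is installed together with `requirements.txt`, and that the `multimolecule/rna` tokenizer referenced by `demo/data/local-file.py` and the tests can be fetched from the Hugging Face Hub.

```python
# Minimal sketch of the new `multimolecule.data` API; not part of the commit.
# Assumes multimolecule (with this patch) is installed and the
# `multimolecule/rna` tokenizer checkpoint is reachable (network required).
from multimolecule import Dataset, Task, TaskLevel, TaskType

# A tiny in-memory dataset: one string sequence column, one float label column.
data = {
    "sequence": ["ACGU", "AUGGCUAG", "GGAUCC"],
    "label": [0.1, 0.9, 0.4],
}

dataset = Dataset(
    data,
    split="train",
    pretrained="multimolecule/rna",  # tokenizer checkpoint, as in the demo
    auto_rename_cols=True,  # single feature/label columns get standard names
)

# Column identification: "sequence" is matched against defaults.SEQUENCE_COL_NAMES
# and renamed to "input_ids"; "label" becomes "labels".
print(dataset.feature_cols)  # ['input_ids']
print(dataset.label_cols)  # ['labels']

# Task inference: one float per sequence is inferred as sequence-level regression.
assert dataset.tasks["labels"] == Task(TaskType.Regression, level=TaskLevel.Sequence, num_labels=1)

# With preprocess=True (the default), items come back already tokenized.
print(dataset[0]["input_ids"])
```

Note that `auto_rename_cols` only fires when there is exactly one feature and one label column; the target names come from `defaults.SEQUENCE_COL_NAME` and `defaults.LABEL_COL_NAME`, matching the input names the prediction heads expect.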