diff --git a/docs/docs/datasets/eternabench-cm.md b/docs/docs/datasets/eternabench-cm.md
new file mode 100644
index 00000000..f46f9071
--- /dev/null
+++ b/docs/docs/datasets/eternabench-cm.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# EternaBench-CM
+
+--8<-- "multimolecule/datasets/eternabench_cm/README.md:21:"
diff --git a/docs/docs/datasets/eternabench-external.md b/docs/docs/datasets/eternabench-external.md
new file mode 100644
index 00000000..a039a807
--- /dev/null
+++ b/docs/docs/datasets/eternabench-external.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# EternaBench-External
+
+--8<-- "multimolecule/datasets/eternabench_external/README.md:21:"
diff --git a/docs/docs/datasets/eternabench-switch.md b/docs/docs/datasets/eternabench-switch.md
new file mode 100644
index 00000000..b8aa3f72
--- /dev/null
+++ b/docs/docs/datasets/eternabench-switch.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# EternaBench-Switch
+
+--8<-- "multimolecule/datasets/eternabench_switch/README.md:21:"
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 5cef7518..0a7cb1ca 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -23,6 +23,9 @@ nav:
- bpRNA-spot: datasets/bprna-spot.md
- bpRNA-new: datasets/bprna-new.md
- RYOS: datasets/ryos.md
+ - EternaBench-CM: datasets/eternabench-cm.md
+ - EternaBench-Switch: datasets/eternabench-switch.md
+ - EternaBench-External: datasets/eternabench-external.md
- module:
- module/index.md
- heads: module/heads.md
diff --git a/multimolecule/datasets/README.md b/multimolecule/datasets/README.md
index df633d15..fda42caa 100644
--- a/multimolecule/datasets/README.md
+++ b/multimolecule/datasets/README.md
@@ -22,6 +22,9 @@ date: 2024-05-04
- [bpRNA-spot](bprna-spot)
- [bpRNA-new](bprna-new)
- [RYOS](ryos)
+- [EternaBench-CM](eternabench-cm)
+- [EternaBench-Switch](eternabench-switch)
+- [EternaBench-External](eternabench-external)
## Usage
diff --git a/multimolecule/datasets/README.zh.md b/multimolecule/datasets/README.zh.md
index 9150c1dc..a38dd356 100644
--- a/multimolecule/datasets/README.zh.md
+++ b/multimolecule/datasets/README.zh.md
@@ -22,6 +22,9 @@ date: 2024-05-04
- [bpRNA-spot](bprna-spot)
- [bpRNA-new](bprna-new)
- [RYOS](ryos)
+- [EternaBench-CM](eternabench-cm)
+- [EternaBench-Switch](eternabench-switch)
+- [EternaBench-External](eternabench-external)
## 使用
diff --git a/multimolecule/datasets/eternabench_cm/README.md b/multimolecule/datasets/eternabench_cm/README.md
new file mode 100644
index 00000000..aeed1fe8
--- /dev/null
+++ b/multimolecule/datasets/eternabench_cm/README.md
@@ -0,0 +1,110 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 1K.
+
+from __future__ import annotations
+
+import os
+
+import danling as dl
+import pandas as pd
+import torch
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+# Seed torch's global RNG. Nothing in the conversion code visible here draws
+# random numbers, so this presumably only matters inside a helper - TODO confirm.
+torch.manual_seed(1016)
+
+# Columns kept in the converted dataset, in output order. `convert_dataset_`
+# selects exactly these names after renaming the raw columns.
+cols = [
+ "id",
+ "design",
+ "sequence",
+ "secondary_structure",
+ "reactivity",
+ "errors",
+ "signal_to_noise",
+]
+
+
+def convert_dataset_(df: pd.DataFrame):
+    """Convert one raw split into the column layout listed in ``cols``.
+
+    The frame passed in is not modified; every step works on a new frame.
+
+    Args:
+        df: Raw split. It must provide ``ID``, ``design_name``, ``structure`` and
+            ``signal_to_noise``, plus the names in ``cols`` that are not renamed.
+
+    Returns:
+        A new frame sorted by ``id`` that holds exactly the columns in ``cols``.
+    """
+    # Rename first: ``rename`` returns a new frame, so the column rewrite below
+    # never writes through to the frame owned by the caller.
+    df = df.rename(columns={"ID": "id", "design_name": "design", "structure": "secondary_structure"})
+    # Keep the text after the last ":" and parse it as a float.
+    # NOTE(review): raw values presumably look like "<label>:<number>" - confirm.
+    signal_to_noise = df["signal_to_noise"].str.split(":").str[-1].astype(float)
+    df = df.assign(signal_to_noise=signal_to_noise)
+    df = df.sort_values("id")
+    return df[cols]
+
+
+def convert_dataset(convert_config):
+    """Load the train and test splits, convert each, and save them together.
+
+    Args:
+        convert_config: Configuration exposing ``train_path`` and ``test_path``;
+            it is also passed on to ``save_dataset``.
+    """
+    raw_train = dl.load_pandas(convert_config.train_path)
+    raw_test = dl.load_pandas(convert_config.test_path)
+    converted = {split: convert_dataset_(frame) for split, frame in (("train", raw_train), ("test", raw_test))}
+    save_dataset(convert_config, converted)
+
+
+# Conversion settings for this script, layered on the shared ConvertConfig_.
+class ConvertConfig(ConvertConfig_):
+ # Directory that contains this file.
+ root: str = os.path.dirname(__file__)
+ # Name of that directory with every "_" replaced by "-".
+ output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-")
+
+
+# Script entry point: build the config, call its parse(), then run the conversion.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ # NOTE(review): parse() presumably applies command-line overrides - confirm in ConvertConfig_.
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/eternabench_external/README.md b/multimolecule/datasets/eternabench_external/README.md
new file mode 100644
index 00000000..c865d65a
--- /dev/null
+++ b/multimolecule/datasets/eternabench_external/README.md
@@ -0,0 +1,117 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 1K.
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import danling as dl
+import pandas as pd
+import torch
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+# Seed torch's global RNG. Nothing in the conversion code visible here draws
+# random numbers, so this presumably only matters inside a helper - TODO confirm.
+torch.manual_seed(1016)
+
+
+# Columns kept in the converted dataset, in output order. `convert_dataset_`
+# selects exactly these names after renaming the raw columns.
+cols = ["name", "sequence", "reactivity", "seqpos", "class", "dataset"]
+
+
+def convert_dataset_(df: pd.DataFrame):
+    """Convert the raw frame into the column layout listed in ``cols``.
+
+    The frame passed in is not modified; every step works on a new frame.
+
+    Args:
+        df: Raw frame. It must provide ``seqpos``, ``orig_seqpos``, ``Class`` and
+            ``Dataset``, plus the names in ``cols`` that are not renamed.
+
+    Returns:
+        A new frame sorted by ``name`` that holds exactly the columns in ``cols``.
+
+    Raises:
+        KeyError: If the raw frame has no ``seqpos`` column to drop.
+    """
+    # Drop the raw "seqpos" without ``inplace=True`` so the caller's frame keeps
+    # its column; "orig_seqpos" then takes over the freed name.
+    df = df.drop(columns="seqpos")
+    df = df.rename(
+        columns={
+            "Class": "class",
+            "Dataset": "dataset",
+            "orig_seqpos": "seqpos",
+        }
+    )
+    df = df.sort_values("name")
+    return df[cols]
+
+
+def convert_dataset(convert_config):
+    """Load the single dataset file, convert it, and save it as the ``test`` split.
+
+    Args:
+        convert_config: Configuration exposing ``dataset_path``; it is also
+            passed on to ``save_dataset``.
+    """
+    raw = dl.load_pandas(convert_config.dataset_path)
+    save_dataset(convert_config, {"test": convert_dataset_(raw)})
+
+
+# Conversion settings for this script, layered on the shared ConvertConfig_.
+class ConvertConfig(ConvertConfig_):
+    # Directory that contains this file.
+    root: str = os.path.dirname(__file__)
+
+    def post(self):
+        """Fill in ``output_path`` from the dataset file name if unset, then run the base ``post``."""
+        if not self.output_path:
+            # Third "_"-separated piece of the file stem, minus its first six characters.
+            # NOTE(review): presumably a length tag behind a fixed 6-character prefix -
+            # confirm against the real dataset file names.
+            stem_parts = Path(self.dataset_path).stem.split("_")
+            seq_length = stem_parts[2][6:]
+            folder = os.path.basename(os.path.dirname(__file__))
+            self.output_path = folder.replace("_", "-") + f".{seq_length}"
+        super().post()
+
+
+# Script entry point: build the config, call its parse(), then run the conversion.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ # NOTE(review): parse() presumably applies command-line overrides - confirm in ConvertConfig_.
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/eternabench_switch/README.md b/multimolecule/datasets/eternabench_switch/README.md
new file mode 100644
index 00000000..0081cdb9
--- /dev/null
+++ b/multimolecule/datasets/eternabench_switch/README.md
@@ -0,0 +1,144 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 1K.
+
+from __future__ import annotations
+
+import os
+
+import danling as dl
+import pandas as pd
+import torch
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+# Seed torch's global RNG. Nothing in the conversion code visible here draws
+# random numbers, so this presumably only matters inside a helper - TODO confirm.
+torch.manual_seed(1016)
+
+# Columns kept in the converted dataset, in output order. `convert_dataset_`
+# selects exactly these names after renaming the raw columns.
+cols = [
+ "id",
+ "design",
+ "sequence",
+ "activation_ratio",
+ "ligand",
+ "switch",
+ "kd_off",
+ "kd_on",
+ "kd_fmn",
+ "kd_no_fmn",
+ "min_kd_val",
+ "ms2_aptamer",
+ "lig_aptamer",
+ "ms2_lig_aptamer",
+ "log_kd_nolig",
+ "log_kd_lig",
+ "log_kd_nolig_scaled",
+ "log_kd_lig_scaled",
+ "log_AR",
+ "folding_subscore",
+ "num_clusters",
+]
+
+
+def convert_dataset_(df: pd.DataFrame):
+    """Convert one raw split into the column layout listed in ``cols``.
+
+    Args:
+        df: Raw split. It must provide the raw column names mapped below, plus
+            the names in ``cols`` that are not renamed.
+
+    Returns:
+        A new frame sorted by ``id`` that holds exactly the columns in ``cols``.
+    """
+    # Raw column name -> published column name.
+    column_names = {
+        "index": "id",
+        "Design": "design",
+        "Activation Ratio": "activation_ratio",
+        "Folding_Subscore": "folding_subscore",
+        "KDOFF": "kd_off",
+        "KDON": "kd_on",
+        "KDFMN": "kd_fmn",
+        "KDnoFMN": "kd_no_fmn",
+        "NumberOfClusters": "num_clusters",
+        "logkd_nolig": "log_kd_nolig",
+        "logkd_lig": "log_kd_lig",
+        "logkd_nolig_scaled": "log_kd_nolig_scaled",
+        "logkd_lig_scaled": "log_kd_lig_scaled",
+        "MS2_aptamer": "ms2_aptamer",
+        "MS2_lig_aptamer": "ms2_lig_aptamer",
+    }
+    renamed = df.rename(columns=column_names)
+    ordered = renamed.sort_values("id")
+    return ordered[cols]
+
+
+def convert_dataset(convert_config):
+    """Load the train and test splits, convert each, and save them together.
+
+    Args:
+        convert_config: Configuration exposing ``train_path`` and ``test_path``;
+            it is also passed on to ``save_dataset``.
+    """
+    raw_train = dl.load_pandas(convert_config.train_path)
+    raw_test = dl.load_pandas(convert_config.test_path)
+    converted = {split: convert_dataset_(frame) for split, frame in (("train", raw_train), ("test", raw_test))}
+    save_dataset(convert_config, converted)
+
+
+# Conversion settings for this script, layered on the shared ConvertConfig_.
+class ConvertConfig(ConvertConfig_):
+ # Directory that contains this file.
+ root: str = os.path.dirname(__file__)
+ # Name of that directory with every "_" replaced by "-".
+ output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-")
+
+
+# Script entry point: build the config, call its parse(), then run the conversion.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ # NOTE(review): parse() presumably applies command-line overrides - confirm in ConvertConfig_.
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)