diff --git a/docs/docs/datasets/archiveii.md b/docs/docs/datasets/archiveii.md
new file mode 100644
index 00000000..14588f82
--- /dev/null
+++ b/docs/docs/datasets/archiveii.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# ArchiveII
+
+--8<-- "multimolecule/datasets/archiveii/README.md:24:"
diff --git a/docs/docs/datasets/rnastralign.md b/docs/docs/datasets/rnastralign.md
new file mode 100644
index 00000000..a08e6399
--- /dev/null
+++ b/docs/docs/datasets/rnastralign.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# RNAStrAlign
+
+--8<-- "multimolecule/datasets/rnastralign/README.md:24:"
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 0a7cb1ca..9c8d52de 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -22,6 +22,8 @@ nav:
- bpRNA-1m: datasets/bprna.md
- bpRNA-spot: datasets/bprna-spot.md
- bpRNA-new: datasets/bprna-new.md
+ - RNAStrAlign: datasets/rnastralign.md
+ - ArchiveII: datasets/archiveii.md
- RYOS: datasets/ryos.md
- EternaBench-CM: datasets/eternabench-cm.md
- EternaBench-Switch: datasets/eternabench-switch.md
diff --git a/multimolecule/datasets/README.md b/multimolecule/datasets/README.md
index fda42caa..07506e7b 100644
--- a/multimolecule/datasets/README.md
+++ b/multimolecule/datasets/README.md
@@ -25,6 +25,8 @@ date: 2024-05-04
- [EternaBench-CM](eternabench-cm)
- [EternaBench-Switch](eternabench-switch)
- [EternaBench-External](eternabench-external)
+- [RNAStrAlign](rnastralign)
+- [ArchiveII](archiveii)
## Usage
diff --git a/multimolecule/datasets/README.zh.md b/multimolecule/datasets/README.zh.md
index a38dd356..18d34840 100644
--- a/multimolecule/datasets/README.zh.md
+++ b/multimolecule/datasets/README.zh.md
@@ -25,6 +25,8 @@ date: 2024-05-04
- [EternaBench-CM](eternabench-cm)
- [EternaBench-Switch](eternabench-switch)
- [EternaBench-External](eternabench-external)
+- [RNAStrAlign](rnastralign)
+- [ArchiveII](archiveii)
## 使用
diff --git a/multimolecule/datasets/archiveii/README.md b/multimolecule/datasets/archiveii/README.md
new file mode 100644
index 00000000..3434e5eb
--- /dev/null
+++ b/multimolecule/datasets/archiveii/README.md
@@ -0,0 +1,100 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 10K.
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+torch.manual_seed(1016)
+
+
def convert_ct(file):
    """Convert one connectivity-table (``.ct``) file into a dataset record.

    The first whitespace-separated token of the first line is read as the number
    of bases. Each of the following ``num_bases`` lines is split on whitespace:
    field 1 is the nucleotide and field 4 is the 1-based index of its pairing
    partner (a value of 0 or less means unpaired). Every pair is written as
    ``(`` ... ``)``; only this one bracket type is emitted, so crossing pairs
    are not distinguished from nested ones.

    The file stem must look like ``<family>_<name>``. Known family abbreviations
    are expanded (``5s`` -> ``5S_rRNA``, ``srp`` -> ``SRP``, ``grp1`` ->
    ``group_I_intron``, ...); any other prefix is kept unchanged.

    Args:
        file: Path (``str`` or ``pathlib.Path``) of the ``.ct`` file.

    Returns:
        A dict with the keys ``id`` (``"<family>-<name>"``), ``sequence``,
        ``secondary_structure`` (dot-bracket string) and ``family``.

    Raises:
        ValueError: If the file is empty or shorter than its header declares, if
            a pairing partner is out of range or does not point back, or if the
            file name contains no ``_`` separator.
    """
    if not isinstance(file, Path):
        file = Path(file)
    with open(file) as f:
        lines = f.readlines()

    header = lines[0].split() if lines else []
    if not header:
        raise ValueError(f"Invalid ct file {file}: missing header line.")
    num_bases = int(header[0])
    if len(lines) < num_bases + 1:
        raise ValueError(
            f"Invalid ct file {file}: header declares {num_bases} bases but only {len(lines) - 1} lines follow."
        )

    # Split each base line once so partner lookups do not re-parse the text.
    rows = [line.split() for line in lines[1 : num_bases + 1]]

    sequence = []
    dot_bracket = ["."] * num_bases
    for i, row in enumerate(rows, start=1):
        sequence.append(row[1])
        pair_index = int(row[4])
        if pair_index <= 0:
            continue
        # The partner must lie inside the molecule and reference this base back.
        if pair_index > num_bases or int(rows[pair_index - 1][4]) != i:
            raise ValueError(
                f"Invalid pairing at position {i}: pair_index {pair_index} does not point back correctly."
            )
        # Emit each pair once, from its 5' side.
        if pair_index > i:
            dot_bracket[i - 1] = "("
            dot_bracket[pair_index - 1] = ")"

    family, separator, name = file.stem.partition("_")
    if not separator:
        raise ValueError(f"Cannot derive family from file name {file.name!r}: expected '<family>_<name>'.")
    family_names = {
        "5s": "5S_rRNA",
        "16s": "16S_rRNA",
        "23s": "23S_rRNA",
        "srp": "SRP",
        "grp1": "group_I_intron",
        "grp2": "group_II_intron",
    }
    family = family_names.get(family, family)

    return {
        "id": family + "-" + name,
        "sequence": "".join(sequence),
        "secondary_structure": "".join(dot_bracket),
        "family": family,
    }
+
+
def convert_dataset(convert_config):
    """Convert every ``.ct`` file directly inside the dataset directory.

    The files in ``convert_config.dataset_path`` whose names end in ``.ct`` are
    processed in sorted path order with ``convert_ct``; the resulting records
    are handed to ``save_dataset`` under the file name ``test.parquet``.
    """
    source_dir = convert_config.dataset_path
    ct_paths = sorted(os.path.join(source_dir, entry) for entry in os.listdir(source_dir) if entry.endswith(".ct"))
    records = []
    for ct_path in tqdm(ct_paths, total=len(ct_paths)):
        records.append(convert_ct(ct_path))
    save_dataset(convert_config, records, filename="test.parquet")
+
+
class ConvertConfig(ConvertConfig_):
    # Defaults derived from where this script lives on disk.

    # Directory containing this script.
    root: str = os.path.dirname(__file__)
    # Last path component of that directory.
    output_path: str = os.path.basename(root)
+
+
if __name__ == "__main__":
    # Script entry point: build the default config, let the inherited parse()
    # populate it (presumably from command-line arguments -- confirm against the
    # base class), then run the conversion.
    cli_config = ConvertConfig()
    cli_config.parse()  # type: ignore[attr-defined]
    convert_dataset(cli_config)
diff --git a/multimolecule/datasets/rnastralign/README.md b/multimolecule/datasets/rnastralign/README.md
new file mode 100644
index 00000000..8b46f140
--- /dev/null
+++ b/multimolecule/datasets/rnastralign/README.md
@@ -0,0 +1,102 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 10K.
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+torch.manual_seed(1016)
+
+
def convert_ct(file, family: str):
    """Convert one connectivity-table (``.ct``) file into a dataset record.

    The first whitespace-separated token of the first line is read as the number
    of bases. Each base line is split on whitespace: field 0 is its 1-based
    index, field 1 the nucleotide and field 4 the index of its pairing partner
    (0 or less means unpaired). Rows that are missing from the file are filled
    in as unpaired ``N`` so the sequence length matches the header. Every pair
    is written as ``(`` ... ``)``; only this one bracket type is emitted, so
    crossing pairs are not distinguished from nested ones.

    The record id is built from the path components starting at the
    ``<family>_database`` directory, joined with ``-`` (directory suffix and
    file extension removed).

    Args:
        file: Path (``str`` or ``pathlib.Path``) of the ``.ct`` file; it must
            live somewhere below a directory named ``<family>_database``.
        family: Family name, i.e. the directory name without ``_database``.

    Returns:
        A dict with the keys ``id``, ``sequence``, ``secondary_structure``
        (dot-bracket string), ``family`` and ``subfamily`` (the intermediate
        directory when the file sits exactly one level below the family
        directory, otherwise ``None``).

    Raises:
        ValueError: If the file is empty, a row is malformed or carries an
            unexpected index, a pairing partner is out of range or does not
            point back, or the path has no ``<family>_database`` component.
    """
    if not isinstance(file, Path):
        file = Path(file)
    with open(file) as f:
        # Blank lines carry no data; dropping them keeps a trailing empty line
        # from hiding the fact that base rows are missing.
        lines = [line for line in f.read().splitlines() if line.strip()]
    if not lines:
        raise ValueError(f"Invalid ct file {file}: missing header line.")

    num_bases = int(lines[0].split()[0])

    # `N` does not exist in the ct files, so we need to add it
    if len(lines) < num_bases + 1:
        for i in range(1, num_bases + 1):
            # Inserting at len(lines) appends, so this covers both a gap in the
            # middle and rows missing at the end.
            if i >= len(lines) or int(lines[i].split()[0]) != i:
                lines.insert(i, f"{i} N {i - 1} {i + 1} 0 {i}")

    # Split each base line once so partner lookups do not re-parse the text.
    rows = [line.split() for line in lines[1 : num_bases + 1]]

    sequence = []
    dot_bracket = ["."] * num_bases
    for i, row in enumerate(rows, start=1):
        if len(row) < 5:
            raise ValueError(f"Invalid ct line at position {i}: expected at least 5 fields, got {len(row)}.")
        if int(row[0]) != i:
            raise ValueError(f"Invalid nucleotide index at position {i}: {row[0]} does not match the expected index.")
        sequence.append(row[1])
        pair_index = int(row[4])
        if pair_index <= 0:
            continue
        # The partner must lie inside the molecule and reference this base back.
        if (
            pair_index > num_bases
            or len(rows[pair_index - 1]) < 5
            or int(rows[pair_index - 1][4]) != i
        ):
            raise ValueError(
                f"Invalid pairing at position {i}: pair_index {pair_index} does not point back correctly."
            )
        # Emit each pair once, from its 5' side.
        if pair_index > i:
            dot_bracket[i - 1] = "("
            dot_bracket[pair_index - 1] = ")"

    parts = list(file.parts)
    family_dir_name = family + "_database"
    if family_dir_name not in parts:
        raise ValueError(f"Path {file} does not contain a {family_dir_name!r} directory.")
    parts = parts[parts.index(family_dir_name) :]
    parts[0] = family  # directory name without the "_database" suffix
    parts[-1] = file.stem  # file name without the ".ct" extension

    return {
        "id": "-".join(parts),
        "sequence": "".join(sequence),
        "secondary_structure": "".join(dot_bracket),
        "family": family,
        "subfamily": parts[1] if len(parts) == 3 else None,
    }
+
+
def _convert_dataset(family_dir):
    """Convert every ``.ct`` file of one ``<family>_database`` directory.

    Files directly inside ``family_dir`` are used when there are any; otherwise
    the ``.ct`` files of its immediate subdirectories are collected. Paths are
    ordered by the letters they contain and then by the integer formed from all
    their digits, and each one is passed to ``convert_ct``.

    Args:
        family_dir: Path of a directory whose name ends in ``_database``.

    Returns:
        A list with one record dict per ``.ct`` file, in sorted order.
    """
    family_dir = Path(family_dir)
    # Use the full directory name: `.stem` would cut the name at its last dot.
    family = family_dir.name[: -len("_database")]
    files = [os.path.join(family_dir, f) for f in os.listdir(family_dir) if f.endswith(".ct")]
    if not files:
        for subdir in family_dir.iterdir():
            if subdir.is_dir():
                files.extend([os.path.join(subdir, f) for f in os.listdir(subdir) if f.endswith(".ct")])

    def sort_key(path):
        letters = "".join(filter(str.isalpha, path))
        digits = "".join(filter(str.isdigit, path))
        # A path without any digit would otherwise fail on int("").
        return letters, int(digits) if digits else -1

    files.sort(key=sort_key)
    return [convert_ct(file, family) for file in tqdm(files, total=len(files))]
+
+
def convert_dataset(convert_config):
    """Convert every ``*_database`` family directory of the dataset.

    Entries of ``convert_config.dataset_path`` whose names end in ``_database``
    are visited in sorted path order; the records returned by
    ``_convert_dataset`` for each of them are concatenated and handed to
    ``save_dataset`` under the file name ``train.parquet``.
    """
    dataset_root = convert_config.dataset_path
    family_dirs = sorted(
        os.path.join(dataset_root, entry) for entry in os.listdir(dataset_root) if entry.endswith("_database")
    )
    records = []
    for family_dir in family_dirs:
        records.extend(_convert_dataset(family_dir))
    save_dataset(convert_config, records, filename="train.parquet")
+
+
class ConvertConfig(ConvertConfig_):
    # Defaults derived from where this script lives on disk.

    # Directory containing this script.
    root: str = os.path.dirname(__file__)
    # Last path component of that directory.
    output_path: str = os.path.basename(root)
+
+
if __name__ == "__main__":
    # Script entry point: build the default config, let the inherited parse()
    # populate it (presumably from command-line arguments -- confirm against the
    # base class), then run the conversion.
    cli_config = ConvertConfig()
    cli_config.parse()  # type: ignore[attr-defined]
    convert_dataset(cli_config)