diff --git a/.codespell-whitelist.txt b/.codespell-whitelist.txt
index 11fec869..d67fd62d 100644
--- a/.codespell-whitelist.txt
+++ b/.codespell-whitelist.txt
@@ -1 +1,5 @@
ser
+marz
+manuel
+wass
+gir
diff --git a/.gitignore b/.gitignore
index 54347b75..efcbf991 100755
--- a/.gitignore
+++ b/.gitignore
@@ -390,6 +390,9 @@ FodyWeavers.xsd
!.vscode/extensions.json
*.code-workspace
+# JetBrains
+.idea/
+
# Local History for Visual Studio Code
.history/
diff --git a/README.md b/README.md
index 01f39a60..62190440 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Welcome to MultiMolecule (浦原), a foundational library designed to accelerate
We understand that AI4Science is a broad field, with researchers from different disciplines employing various practices. Therefore, MultiMolecule is designed with low coupling in mind, meaning that while it offers a full suite of functionalities, each module can be used independently. This allows you to integrate only the components you need into your existing workflows without adding unnecessary complexity. The key functionalities that MultiMolecule provides include:
- [`data`](data): Efficient data handling and preprocessing capabilities to streamline the ingestion and transformation of scientific datasets.
+- [`datasets`](datasets): A collection of widely-used datasets across different scientific domains, providing a solid foundation for training and evaluation.
- [`module`](module): Modular components designed to provide flexibility and reusability across various machine learning tasks.
- [`models`](models): State-of-the-art model architectures optimized for scientific research applications, ensuring high performance and accuracy.
- [`tokenisers`](tokenisers): Advanced tokenization methods to effectively handle complex scientific text and data representations.
diff --git a/README.zh.md b/README.zh.md
index 0aca1ddd..2ecab946 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -13,6 +13,7 @@ date: 2024-05-04 00:00:00
我们理解 AI4Science 是一个广泛的领域,来自不同学科的研究人员使用各种实践方法。因此,MultiMolecule 设计时考虑了低耦合性,这意味着虽然它提供了完整的功能套件,但每个模块都可以独立使用。这使您可以仅将所需组件集成到现有工作流程中,而不会增加不必要的复杂性。MultiMolecule 提供的主要功能包括:
- [`data`](data): 高效的数据处理和预处理功能,以简化科学数据集的摄取和转换。
+- [`datasets`](datasets): 跨不同科学领域的广泛使用数据集集合,为训练和评估提供坚实基础。
- [`module`](module): 旨在提供灵活性和可重用性的模块化组件,适用于各种机器学习任务。
- [`models`](models): 为科学研究应用优化的最先进模型架构,确保高性能和高准确性。
- [`tokenisers`](tokenisers): 先进的分词方法,有效处理复杂的科学文本和数据表示。
diff --git a/demo/data/huggingface-datasets.py b/demo/data/huggingface-datasets.py
new file mode 100644
index 00000000..f39e98f0
--- /dev/null
+++ b/demo/data/huggingface-datasets.py
@@ -0,0 +1,19 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from multimolecule.data import Dataset
+
+# Load the "train" split of the "multimolecule/bprna-spot" dataset.
+# NOTE(review): `pretrained` is passed straight to Dataset; presumably it names the tokenizer to use — confirm.
+data = Dataset("multimolecule/bprna-spot", split="train", pretrained="multimolecule/rna")
diff --git a/docs/docs/about/license-faq.md b/docs/docs/about/license-faq.md
new file mode 100644
index 00000000..e8e341ff
--- /dev/null
+++ b/docs/docs/about/license-faq.md
@@ -0,0 +1,109 @@
+# License FAQ
+
+This License FAQ explains the terms and conditions under which you may use the data, models, code, configuration, documentation, and weights provided by the DanLing Team (also known as DanLing) ('we', 'us', or 'our').
+It serves as an addendum to our _[License](license.md)_.
+
+## 0. Summary of Key Points
+
+This summary provides key points from our license, but you can find out more details about any of these topics by clicking the link following each key point and by reading the full license.
+
+
+
+!!! question "What constitutes the 'source code' in MultiMolecule?"
+
+ We consider everything in our repositories to be source code, including data, models, code, configuration, and documentation.
+
+ [:octicons-arrow-right-24: What constitutes the 'source code' in MultiMolecule?](#1-what-constitutes-the-source-code-in-multimolecule)
+
+!!! question "Can I publish research papers using MultiMolecule?"
+
+ It depends.
+
+ You can publish research papers on fully open access journals and conferences or preprint servers following the terms of the *[License](license.md)*.
+
+ You must obtain a separate license from us to publish research papers on closed access journals and conferences.
+
+ [:octicons-arrow-right-24: Can I publish research papers using MultiMolecule?](#2-can-i-publish-research-papers-using-multimolecule)
+
+!!! question "Can I use MultiMolecule for commercial purposes?"
+
+ Yes, you can use MultiMolecule for commercial purposes under the terms of the *[License](license.md)*.
+
+ [:octicons-arrow-right-24: Can I use MultiMolecule for commercial purposes?](#3-can-i-use-multimolecule-for-commercial-purposes)
+
+!!! question "Do people affiliated with certain organizations have specific license terms?"
+
+ Yes, people affiliated with certain organizations have specific license terms.
+
+ [:octicons-arrow-right-24: Do people affiliated with certain organizations have specific license terms?](#4-do-people-affiliated-with-certain-organizations-have-specific-license-terms)
+
+
+
+## 1. What constitutes the "source code" in MultiMolecule?
+
+We consider everything in our repositories to be source code.
+
+The training process of machine learning models is viewed similarly to the compilation process of traditional software.
+As such, the model, code, configuration, documentation, and data used for training are all part of the source code, while the trained model weights are part of the object code.
+
+We also consider research papers and manuscripts a special form of documentation, which are also part of the source code.
+
+## 2. Can I publish research papers using MultiMolecule?
+
+Since research papers are considered a form of source code, publishers are legally required to open-source all materials on their server to comply with the _[License](license.md)_ if they publish papers using MultiMolecule. This is generally impractical for most publishers.
+
+As a special exemption under section 7 of the _[License](license.md)_, we grant permission to publish research papers using MultiMolecule in fully open access journals, conferences, or preprint servers, provided all published manuscripts are made available under the [GNU Free Documentation License (GFDL)](https://www.gnu.org/licenses/fdl.html), or a [Creative Commons license](https://creativecommons.org), or an [OSI-approved license](https://opensource.org/licenses) that permits the sharing of manuscripts.
+
+For publishing in closed access journals or conferences, you must obtain a separate license from us. This typically involves co-authorship, a fee to support the project, or both. Contact us at [multimolecule@zyc.ai](mailto:multimolecule@zyc.ai) for more information.
+
+While not mandatory, we recommend citing the MultiMolecule project in your research papers.
+
+## 3. Can I use MultiMolecule for commercial purposes?
+
+Yes, MultiMolecule can be used for commercial purposes under the _[License](license.md)_. However, you must open-source any modifications to the source code and make them available under the _[License](license.md)_.
+
+If you prefer to use MultiMolecule for commercial purposes without open-sourcing your modifications, you must obtain a separate license from us. This typically involves a fee to support the project. Contact us at [multimolecule@zyc.ai](mailto:multimolecule@zyc.ai) for further details.
+
+## 4. Do people affiliated with certain organizations have specific license terms?
+
+YES!
+
+If you are affiliated with an organization that has a separate license agreement with us, you may be subject to different license terms.
+Please consult your organization's legal department to determine if you are subject to a separate license agreement.
+
+Members of the following organizations automatically receive a non-transferable, non-sublicensable, and non-distributable [MIT License](https://mit-license.org/) to use MultiMolecule:
+
+- [Microsoft Research AI for Science](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai-for-science/)
+- [DP Technology](https://dp.tech/)
+
+This special license is considered an additional term under section 7 of the _[License](license.md)_.
+It is not redistributable, and you are prohibited from creating any independent derivative works.
+Any modifications or derivative works based on this license are automatically considered derivative works of MultiMolecule and must comply with all the terms of the _[License](license.md)_.
+This ensures that third parties cannot bypass the license terms or create separate licenses from derivative works.
+
+## 5. How can I use MultiMolecule if my organization forbids the use of code under the AGPL License?
+
+Some organizations, such as [Google](https://opensource.google/documentation/reference/using/agpl-policy), have policies that prohibit the use of code under the AGPL License.
+
+If you are affiliated with an organization that forbids the use of AGPL-licensed code, you must obtain a separate license from us.
+Contact us at [multimolecule@zyc.ai](mailto:multimolecule@zyc.ai) for more information.
+
+## 6. Can I use MultiMolecule if I am a federal employee of the United States Government?
+
+No.
+
+Code written by federal employees of the United States Government is not protected by copyright under [17 U.S. Code § 105](https://www.law.cornell.edu/uscode/text/17/105).
+
+As a result, federal employees of the United States Government cannot comply with the terms of the _[License](license.md)_.
+
+## 7. Do we make updates to this FAQ?
+
+!!! tip "In Short"
+
+ Yes, we will update this FAQ as necessary to stay compliant with relevant laws.
+
+We may update this license FAQ from time to time.
+The updated version will be indicated by an updated 'Last Revised Time' at the bottom of this license FAQ.
+If we make any material changes, we will notify you by posting the new license FAQ on this page.
+We are unable to notify you directly as we do not collect any contact information from you.
+We encourage you to review this license FAQ frequently to stay informed of how you can use our data, models, code, configuration, documentation, and weights.
diff --git a/docs/docs/about/license-faq.zh.md b/docs/docs/about/license-faq.zh.md
new file mode 100644
index 00000000..86ef6d07
--- /dev/null
+++ b/docs/docs/about/license-faq.zh.md
@@ -0,0 +1,115 @@
+!!! warning "翻译"
+
+ 本文内容为翻译版本,旨在为用户提供方便。
+ 我们已经尽力确保翻译的准确性。
+ 但请注意,翻译内容可能包含错误,仅供参考。
+ 请以英文[原文](https://multimolecule.danling.org/about/license-faq)为准。
+
+ 为满足合规性与执法要求,翻译文档中的任何不准确或歧义之处均不具有约束力,也不具备法律效力。
+
+# 许可协议常见问题解答
+
+本许可协议常见问题解答解释了您可以在何种条件下使用由丹灵团队(也称为丹灵)(“我们”或“我们的”)提供的数据、模型、代码、配置、文档和权重。
+它作为我们的 _[许可协议](license.zh.md)_ 的附加文件。
+
+## 0. 关键点总结
+
+本总结提供了常见问题解答的关键点,但您可以通过点击每个关键点后的链接或使用目录来找到您所查找的部分以了解更多详情。
+
+
+
+!!! question "在 MultiMolecule 中,什么构成了“源代码”?"
+
+ 我们认为我们存储库中的所有内容都是源代码,包括数据、模型、代码、配置和文档。
+
+ [:octicons-arrow-right-24: 在MultiMolecule中,什么构成了“源代码”?](#1-multimolecule)
+
+!!! question "我可以使用 MultiMolecule 发表研究论文吗?"
+
+ 视情况而定。
+
+ 您可以按照 *[许可协议](license.zh.md)* 的条款在完全开放获取的期刊和会议或预印本服务器上发表研究论文。
+
+ 要在封闭获取的期刊和会议上发表研究论文,您必须从我们这里获得单独的许可。
+
+ [:octicons-arrow-right-24: 我可以使用MultiMolecule发表研究论文吗?](#2-multimolecule)
+
+!!! question "我可以将 MultiMolecule 用于商业用途吗?"
+
+ 是的,您可以根据 *[许可协议](license.zh.md)* 的条款将MultiMolecule用于商业用途。
+
+ [:octicons-arrow-right-24: 我可以将MultiMolecule用于商业用途吗?](#3-multimolecule)
+
+!!! question "与某些组织有关系的人是否有特定的许可条款?"
+
+ 是的,与某些组织有关系的人有特定的许可条款。
+
+ [:octicons-arrow-right-24: 与某些组织有关系的人是否有特定的许可条款?](#4)
+
+
+
+## 1. 在 MultiMolecule 中,什么构成了“源代码”?
+
+我们认为我们存储库中的所有内容都是源代码。
+
+机器学习模型的训练过程被视作类似于传统软件的编译过程。因此,模型、代码、配置、文档和用于训练的数据都被视为源代码的一部分,而训练出的模型权重则被视为目标代码的一部分。
+
+我们还将研究论文和手稿视为一种特殊的文档形式,它们也是源代码的一部分。
+
+## 2. 我可以使用 MultiMolecule 发表研究论文吗?
+
+由于研究论文被视为源代码的一种形式,如果发表使用 MultiMolecule 的论文,出版商必须开源其服务器上的所有材料,以符合 _[许可协议](license.zh.md)_ 的要求。对于大多数出版商来说,这是不切实际的。
+
+作为 _[许可协议](license.zh.md)_ 第 7 条的特别豁免,我们允许在完全开放获取的期刊、会议或预印本服务器上发表使用 MultiMolecule 的研究论文,前提是所有发表的手稿都应按照允许共享手稿的[GNU 自由文档许可协议(GFDL)](https://www.gnu.org/licenses/fdl.html)或[知识共享许可协议](https://creativecommons.org)或[OSI 批准许可协议](https://opensource.org/licenses)提供。
+
+要在封闭获取的期刊或会议上发表论文,您必须从我们这里获得单独的许可。这通常包括共同署名、支持项目的费用或两者兼而有之。请通过 [multimolecule@zyc.ai](mailto:multimolecule@zyc.ai) 与我们联系以获取更多信息。
+
+虽然不是强制性的,但我们建议在研究论文中引用 MultiMolecule 项目。
+
+## 3. 我可以将 MultiMolecule 用于商业用途吗?
+
+是的,您可以根据 _[许可协议](license.zh.md)_ 将 MultiMolecule 用于商业用途。但是,您必须开源对源代码的任何修改,并使其在 _[许可协议](license.zh.md)_ 下可用。
+
+如果您希望在不开源修改内容的情况下将 MultiMolecule 用于商业用途,则必须从我们这里获得单独的许可。这通常涉及支持项目的费用。请通过 [multimolecule@zyc.ai](mailto:multimolecule@zyc.ai) 与我们联系以获取更多详细信息。
+
+## 4. 与某些组织有关系的人是否有特定的许可条款?
+
+是的!
+
+如果您与一个与我们有单独许可协议的组织有关系,您可能会受到不同的许可条款的约束。请咨询您组织的法律部门,以确定您是否受制于单独的许可协议。
+
+以下组织的成员自动获得一个不可转让、不可再许可、不可分发的 [MIT 许可协议](https://mit-license.org/) 来使用 MultiMolecule:
+
+- [微软研究院科学智能中心](https://www.microsoft.com/en-us/research/lab/microsoft-research-ai-for-science/)
+- [深势科技](https://dp.tech/)
+
+此特别许可被视为 _[许可协议](license.zh.md)_ 第 7 条中的附加条款。
+它不可再分发,并且您被禁止创建任何独立的衍生作品。
+基于此许可的任何修改或衍生作品将自动被视为 MultiMolecule 的衍生作品,必须遵守 _[许可协议](license.zh.md)_ 的所有条款。
+这确保了第三方无法绕过许可条款或从衍生作品中创建单独的许可协议。
+
+## 5. 如果我的组织禁止使用 AGPL 许可协议下的代码,我该如何使用 MultiMolecule?
+
+一些组织(如[Google](https://opensource.google/documentation/reference/using/agpl-policy))有禁止使用 AGPL 许可协议下代码的政策。
+
+如果您与禁止使用 AGPL 许可协议代码的组织有关系,您必须从我们这里获得单独的许可。请通过 [multimolecule@zyc.ai](mailto:multimolecule@zyc.ai) 与我们联系以获取更多详细信息。
+
+## 6. 如果我是美国联邦政府的雇员,我可以使用 MultiMolecule 吗?
+
+不能。
+
+根据[17 U.S. Code § 105](https://www.law.cornell.edu/uscode/text/17/105),美国联邦政府雇员撰写的代码不受版权保护。
+
+因此,美国联邦政府雇员无法遵守 _[许可协议](license.zh.md)_ 的条款。
+
+## 7. 我们会更新此常见问题解答吗?
+
+!!! tip "简而言之"
+
+ 是的,我们将根据需要更新此常见问题解答以保持与相关法律的一致。
+
+我们可能会不时更新此许可协议常见问题解答。
+更新后的版本将通过更新本页面底部的“最后修订时间”来表示。
+如果我们进行任何重大更改,我们将通过在本页发布新的许可协议常见问题解答来通知您。
+由于我们不收集您的任何联系信息,我们无法直接通知您。
+我们鼓励您经常查看本许可协议常见问题解答,以了解您可以如何使用我们的数据、模型、代码、配置、文档和权重。
diff --git a/docs/docs/about/license.zh.md b/docs/docs/about/license.zh.md
index d3bd714d..a5500646 100644
--- a/docs/docs/about/license.zh.md
+++ b/docs/docs/about/license.zh.md
@@ -90,20 +90,20 @@ GNU Affero通用公共许可证是专门设计来确保在这种情况下,修
您可以根据第 4 节的条款,以源代码的形式传达基于本程序的作品,或根据本程序的修改而产生的作品,但您必须满足以下所有条件:
-a) 作品必须有醒目的声明,说明您修改了它,并给出相关的日期。
-b) 作品必须有醒目的声明,说明它是根据本许可证和根据第7条增加的条件发布的。这一要求修改了第4节中 "保持所有通知的完整性" 的要求。
-c) 您必须根据本许可证将整个作品作为一个整体许可给任何拥有其副本的人。因此,本许可证将与任何适用的第7条附加条款一起,适用于整个作品及其所有部分,无论它们是如何包装的。本许可证不允许以任何其他方式许可该作品,但如果您已经单独收到了这种许可,它也不会使这种许可失效。
+- a) 作品必须有醒目的声明,说明您修改了它,并给出相关的日期。
+- b) 作品必须有醒目的声明,说明它是根据本许可证和根据第7条增加的条件发布的。这一要求修改了第4节中 "保持所有通知的完整性" 的要求。
+- c) 您必须根据本许可证将整个作品作为一个整体许可给任何拥有其副本的人。因此,本许可证将与任何适用的第7条附加条款一起,适用于整个作品及其所有部分,无论它们是如何包装的。本许可证不允许以任何其他方式许可该作品,但如果您已经单独收到了这种许可,它也不会使这种许可失效。
如果一个受保护作品与其他单独和独立的作品的汇编,其性质不是受保护作品的延伸,并且没有与之结合以形成更大的程序,在存储或分发媒介的某一卷上,如果该汇编及其产生的版权没有被用来限制汇编用户的访问或法律权利,超出单个作品允许的范围,则被称为 "聚合"。将一个受保护的作品包含在一个总体中并不导致本许可证适用于总体的其他部分。
### 6. 传递非源形式.
您可以根据第4条和第5条的规定,以目标代码的形式传递被保护的作品,但您也必须根据本许可证的规定,以下列方式之一传递机器可读的相应源代码:
-a) 在实体产品(包括实体销售媒介)中传递目标代码,或体现在实体产品(包括实体销售媒介)中,同时将相应的源代码固定在通常用于软件交换的耐用实体媒介上。
-b) 在实物产品(包括实物销售媒介)中传递目标代码,或在实物产品(包括实物销售媒介)中体现目标代码,并附有一份至少三年有效的书面报价,只要您为该产品型号提供备件或客户支持,就一直有效。向任何拥有目标代码的人提供(1)本许可证所涵盖的产品中所有软件的相应源代码的拷贝,拷贝在通常用于软件交换的耐用物理介质上,其价格不超过贵方实际执行这一传递源代码的合理成本,或者(2)从网络服务器上免费获取相应的源代码拷贝。
-c) 将目标代码的单个副本与提供相应来源的书面提议的副本一起传送。只有在偶尔和非商业性的情况下,并且只有在您收到目标代码和这种提议的情况下,才允许这种选择,符合第6b款的规定。
-d) 通过提供从指定地点(免费或收费)获取目标代码,并以同样的方式通过同一地点提供相应的源码,而不再收费。您不需要要求接受者在复制目标代码的同时复制相应的源代码。如果复制目标代码的地方是一个网络服务器,对应源可以在另一个支持同等复制设施的服务器上(由您或第三方运营),只要您在目标代码旁边保持明确的指示,说明在哪里可以找到对应源。无论对应源在哪个服务器上,您都有义务确保在满足这些要求所需的时间内提供对应源。
-e) 使用点对点传输的方式传送目标代码,但您必须告知其他同行,根据第6d款,目标代码和作品的对应源正在免费提供给公众。
+- a) 在实体产品(包括实体销售媒介)中传递目标代码,或体现在实体产品(包括实体销售媒介)中,同时将相应的源代码固定在通常用于软件交换的耐用实体媒介上。
+- b) 在实物产品(包括实物销售媒介)中传递目标代码,或在实物产品(包括实物销售媒介)中体现目标代码,并附有一份至少三年有效的书面报价,只要您为该产品型号提供备件或客户支持,就一直有效。向任何拥有目标代码的人提供(1)本许可证所涵盖的产品中所有软件的相应源代码的拷贝,拷贝在通常用于软件交换的耐用物理介质上,其价格不超过贵方实际执行这一传递源代码的合理成本,或者(2)从网络服务器上免费获取相应的源代码拷贝。
+- c) 将目标代码的单个副本与提供相应来源的书面提议的副本一起传送。只有在偶尔和非商业性的情况下,并且只有在您收到目标代码和这种提议的情况下,才允许这种选择,符合第6b款的规定。
+- d) 通过提供从指定地点(免费或收费)获取目标代码,并以同样的方式通过同一地点提供相应的源码,而不再收费。您不需要要求接受者在复制目标代码的同时复制相应的源代码。如果复制目标代码的地方是一个网络服务器,对应源可以在另一个支持同等复制设施的服务器上(由您或第三方运营),只要您在目标代码旁边保持明确的指示,说明在哪里可以找到对应源。无论对应源在哪个服务器上,您都有义务确保在满足这些要求所需的时间内提供对应源。
+- e) 使用点对点传输的方式传送目标代码,但您必须告知其他同行,根据第6d款,目标代码和作品的对应源正在免费提供给公众。
目标代码的可分离部分,其源代码作为系统库被排除在相应的源码之外,不需要包括在传达目标代码作品中。
用户产品 "是指(1) "消费品",即通常用于个人、家庭或家居用途的任何有形个人财产,或(2)为纳入住宅而设计或出售的任何东西。在确定一个产品是否是消费品时,有疑问的情况应以有利于承保的方式解决。对于特定用户收到的特定产品,"通常使用" 是指该类产品的典型或常见用途,而不考虑特定用户的地位或特定用户实际使用或期望或预期使用该产品的方式。一个产品是消费类产品,无论该产品是否有大量的商业、工业或非消费类用途,除非这些用途是该产品的唯一重要使用方式。
@@ -123,12 +123,12 @@ e) 使用点对点传输的方式传送目标代码,但您必须告知其他
尽管本许可证有任何其他规定,对于您添加到受保护作品中的材料,您可以(如果得到该材料的版权持有人的授权)用以下条款补充本许可证的条款:
-a) 以不同于本许可证第15条和第16条的条款声明保证或限制责任;或
-b) 要求在该材料或包含该材料的作品所显示的适当法律声明中保留特定的合理法律声明或作者归属;或
-c) 禁止歪曲该材料的来源,或要求以合理的方式将该材料的修改版本标记为与原始版本不同;或
-d) 限制为宣传目的使用该材料的许可人或作者的姓名;或
-e) 拒绝根据商标法授予使用某些商号、商标或服务标志的权利;或
-f) 要求将材料(或材料的修改版本)转交给接受者的任何人对这些合同假设直接加在这些许可人和作者身上的任何责任进行赔偿。
+- a) 以不同于本许可证第15条和第16条的条款声明保证或限制责任;或
+- b) 要求在该材料或包含该材料的作品所显示的适当法律声明中保留特定的合理法律声明或作者归属;或
+- c) 禁止歪曲该材料的来源,或要求以合理的方式将该材料的修改版本标记为与原始版本不同;或
+- d) 限制为宣传目的使用该材料的许可人或作者的姓名;或
+- e) 拒绝根据商标法授予使用某些商号、商标或服务标志的权利;或
+- f) 要求将材料(或材料的修改版本)转交给接受者的任何人对这些合同假设直接加在这些许可人和作者身上的任何责任进行赔偿。
所有其他非许可性的附加条款都被视为第10条意义上的 "进一步限制"。如果您收到的程序或其任何部分包含一个通知,说明它受本许可证的管辖,同时还有一个属于进一步限制的条款,您可以删除该条款。如果许可文件包含进一步的限制,但允许根据本许可证进行再许可或转让,您可以在受保护的作品中添加受该许可文件条款管辖的材料,但进一步的限制在这种再许可或转让中不存在。
如果您按照本节的规定向受保护的作品添加条款,您必须在相关的源文件中声明适用于这些文件的附加条款,或者说明在哪里可以找到适用条款。
diff --git a/docs/docs/about/privacy.md b/docs/docs/about/privacy.md
index 425f5032..dcb695c4 100644
--- a/docs/docs/about/privacy.md
+++ b/docs/docs/about/privacy.md
@@ -1,7 +1,3 @@
-!!! info "Last Revised Date"
-
- This notice was last updated on May 04, 2024.
-
# Privacy Notice
This privacy notice for DanLing Team (also known as DanLing) ('we', 'us', or 'our'), describes how and why we might collect, store, use, and/or share ('process') your information when you use our services ('Services'), such as when you:
@@ -406,7 +402,7 @@ It is very unlikely that you will be able to review, update, or delete the data
Yes, we will update this notice as necessary to stay compliant with relevant laws.
We may update this privacy notice from time to time.
-The updated version will be indicated by an updated 'Last Revised Date' at the top of this privacy notice.
+The updated version will be indicated by an updated 'Last Revised Time' at the bottom of this privacy notice.
If we make any material changes, we will notify you by posting the new privacy notice on this page.
We are unable to notify you directly as we do not collect any contact information from you.
We encourage you to review this privacy notice frequently to stay informed of how we are protecting your information.
diff --git a/docs/docs/about/privacy.zh.md b/docs/docs/about/privacy.zh.md
index aa284090..4e816720 100644
--- a/docs/docs/about/privacy.zh.md
+++ b/docs/docs/about/privacy.zh.md
@@ -7,10 +7,6 @@
为满足合规性与执法要求,翻译文档中的任何不准确或歧义之处均不具有约束力,也不具备法律效力。
-!!! info "最后修订日期"
-
- 本声明最后更新于2024年5月4日。
-
# 隐私声明
本隐私声明适用于丹灵团队(也被称作丹灵)(以下简称“我们”),描述了当您使用我们的服务(“服务”)时,我们如何以及为何可能收集、存储、使用和/或共享(“处理”)您的信息。例如当您:
@@ -417,7 +413,7 @@
是的,我们将根据需要更新此声明以保持与相关法律的一致。
我们可能会不时更新此隐私声明。
-更新后的版本将通过更新顶部的“最后修订日期”来表示。
+更新后的版本将通过更新本页面底部的“最后修订时间”来表示。
如果我们进行任何重大更改,我们将通过在本页发布新的隐私声明来通知您。
由于我们不收集您的任何联系信息,我们无法直接通知您。
我们鼓励您经常查看本隐私声明,以了解我们如何保护您的信息。
diff --git a/docs/docs/datasets/bprna-new.md b/docs/docs/datasets/bprna-new.md
new file mode 100644
index 00000000..7f3bf417
--- /dev/null
+++ b/docs/docs/datasets/bprna-new.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# bpRNA-new
+
+--8<-- "multimolecule/datasets/bprna_new/README.md:23:"
diff --git a/docs/docs/datasets/bprna-spot.md b/docs/docs/datasets/bprna-spot.md
new file mode 100644
index 00000000..6aa5c662
--- /dev/null
+++ b/docs/docs/datasets/bprna-spot.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# bpRNA-spot
+
+--8<-- "multimolecule/datasets/bprna_spot/README.md:24:"
diff --git a/docs/docs/datasets/bprna.md b/docs/docs/datasets/bprna.md
new file mode 100644
index 00000000..7c22dcc5
--- /dev/null
+++ b/docs/docs/datasets/bprna.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# bpRNA-1m
+
+--8<-- "multimolecule/datasets/bprna/README.md:29:"
diff --git a/docs/docs/datasets/gencode.md b/docs/docs/datasets/gencode.md
new file mode 100644
index 00000000..e8b55618
--- /dev/null
+++ b/docs/docs/datasets/gencode.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# GENCODE
+
+--8<-- "multimolecule/datasets/gencode/README.md:21:"
diff --git a/docs/docs/datasets/index.md b/docs/docs/datasets/index.md
new file mode 100644
index 00000000..6607ea75
--- /dev/null
+++ b/docs/docs/datasets/index.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2022-05-04
+---
+
+# datasets
+
+--8<-- "multimolecule/datasets/README.md:8:"
diff --git a/docs/docs/datasets/index.zh.md b/docs/docs/datasets/index.zh.md
new file mode 100644
index 00000000..608d69e9
--- /dev/null
+++ b/docs/docs/datasets/index.zh.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2022-05-04
+---
+
+# datasets
+
+--8<-- "multimolecule/datasets/README.zh.md:8:"
diff --git a/docs/docs/datasets/rfam.md b/docs/docs/datasets/rfam.md
new file mode 100644
index 00000000..b8f1e7cb
--- /dev/null
+++ b/docs/docs/datasets/rfam.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# Rfam
+
+--8<-- "multimolecule/datasets/rfam/README.md:21:"
diff --git a/docs/docs/datasets/rnacentral.md b/docs/docs/datasets/rnacentral.md
new file mode 100644
index 00000000..25b49aad
--- /dev/null
+++ b/docs/docs/datasets/rnacentral.md
@@ -0,0 +1,9 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# RNAcentral
+
+--8<-- "multimolecule/datasets/rnacentral/README.md:74:"
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index ead43c12..53875f32 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -12,6 +12,16 @@ nav:
- data:
- data/index.md
- Dataset: data/dataset.md
+ - datasets:
+ - datasets/index.md
+ - DNA:
+ - GENCODE: datasets/gencode.md
+ - RNA:
+ - RNAcentral: datasets/rnacentral.md
+ - Rfam: datasets/rfam.md
+ - bpRNA-1m: datasets/bprna.md
+ - bpRNA-spot: datasets/bprna-spot.md
+ - bpRNA-new: datasets/bprna-new.md
- module:
- module/index.md
- heads: module/heads.md
@@ -39,9 +49,10 @@ nav:
- RNA: tokenisers/rna.md
- Protein: tokenisers/protein.md
- DotBracket: tokenisers/dot_bracket.md
- - About:
+ - about:
- about/index.md
- License: about/license.md
+ - License FAQ: about/license-faq.md
- Privacy Notice: about/privacy.md
theme:
diff --git a/multimolecule/data/README.md b/multimolecule/data/README.md
index cb2bfc99..de885c15 100644
--- a/multimolecule/data/README.md
+++ b/multimolecule/data/README.md
@@ -19,3 +19,9 @@ The `data` package is designed to complement [`datasets`](https://huggingface.co
```python
--8<-- "demo/data/local-file.py:17:"
```
+
+### Load from :hugs: [`datasets`](https://huggingface.co/docs/datasets)
+
+```python
+--8<-- "demo/data/huggingface-datasets.py:17:"
+```
diff --git a/multimolecule/data/README.zh.md b/multimolecule/data/README.zh.md
index a5533e07..3873b74e 100644
--- a/multimolecule/data/README.zh.md
+++ b/multimolecule/data/README.zh.md
@@ -12,10 +12,16 @@ date: 2024-05-04
`data` 包旨在通过提供在科学任务中常用的数据处理实用程序来补充 [`datasets`](https://huggingface.co/docs/datasets)。
-## Usage
+## 使用
### 从本地数据文件加载
```python
--8<-- "demo/data/local-file.py:17:"
```
+
+### 从:hugs: [`datasets`](https://huggingface.co/docs/datasets)加载
+
+```python
+--8<-- "demo/data/huggingface-datasets.py:17:"
+```
diff --git a/multimolecule/datasets/README.md b/multimolecule/datasets/README.md
new file mode 100644
index 00000000..f05b95c1
--- /dev/null
+++ b/multimolecule/datasets/README.md
@@ -0,0 +1,31 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# datasets
+
+`datasets` provides a collection of widely used datasets.
+
+## Available Datasets
+
+### DeoxyriboNucleic Acid (DNA)
+
+- [GENCODE](gencode)
+
+### RiboNucleic Acid (RNA)
+
+- [RNAcentral](rnacentral)
+- [Rfam](rfam)
+- [bpRNA-1m](bprna)
+- [bpRNA-spot](bprna-spot)
+- [bpRNA-new](bprna-new)
+
+## Usage
+
+### Load with [MultiMolecule][multimolecule.Dataset]
+
+```python
+--8<-- "demo/data/huggingface-datasets.py:17:"
+```
diff --git a/multimolecule/datasets/README.zh.md b/multimolecule/datasets/README.zh.md
new file mode 100644
index 00000000..38d4fb25
--- /dev/null
+++ b/multimolecule/datasets/README.zh.md
@@ -0,0 +1,31 @@
+---
+authors:
+ - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# datasets
+
+`datasets` 提供了一系列广泛使用的数据集。
+
+## 可用数据集
+
+### 脱氧核糖核酸(DNA)
+
+- [GENCODE](gencode)
+
+### 核糖核酸(RNA)
+
+- [RNAcentral](rnacentral)
+- [Rfam](rfam)
+- [bpRNA-1m](bprna)
+- [bpRNA-spot](bprna-spot)
+- [bpRNA-new](bprna-new)
+
+## 使用
+
+### 使用 [MultiMolecule][multimolecule.Dataset] 加载
+
+```python
+--8<-- "demo/data/huggingface-datasets.py:17:"
+```
diff --git a/multimolecule/datasets/bprna/README.md b/multimolecule/datasets/bprna/README.md
new file mode 100644
index 00000000..db52c73a
--- /dev/null
+++ b/multimolecule/datasets/bprna/README.md
@@ -0,0 +1,212 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 100K<n<1M
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import get_files, save_dataset
+
+torch.manual_seed(1016)
+
+
+# Parse one `.sta` file into a flat record.
+#
+# Leading lines that start with "#" are skipped; the four lines after them are read, in order, as
+# the sequence, secondary structure, structural annotation and functional annotation.
+# Returns a dict with the keys "id", "sequence", "secondary_structure", "structural_annotation" and
+# "functional_annotation". Raises IndexError when the file is empty or has fewer than four lines
+# after the "#" header.
+def convert_sta(file: str) -> Mapping:
+ with open(file) as f:
+ lines = f.read().splitlines()
+ idx = 0
+ # Advance past the "#"-prefixed header lines.
+ while lines[idx].startswith("#"):
+ idx += 1
+ return {
+ # Drops the first 7 characters of the first line — presumably a "#Name: " prefix; TODO confirm.
+ "id": lines[0][7:],
+ "sequence": lines[idx],
+ "secondary_structure": lines[idx + 1],
+ "structural_annotation": lines[idx + 2],
+ "functional_annotation": lines[idx + 3],
+ }
+
+
+# Convert every file found under `convert_config.dataset_path` with `convert_sta` (showing a tqdm
+# progress bar) and pass the resulting list of records to `save_dataset`.
+def convert_dataset(convert_config):
+ files = get_files(convert_config.dataset_path)
+ data = [convert_sta(file) for file in tqdm(files, total=len(files))]
+ save_dataset(convert_config, data)
+
+
+# Conversion settings for this dataset, extending the shared ConvertConfig base.
+class ConvertConfig(ConvertConfig_):
+ # Directory that contains this script.
+ root: str = os.path.dirname(__file__)
+ # Name of that directory, used unchanged as the default output path.
+ output_path: str = os.path.basename(os.path.dirname(__file__))
+
+
+# Script entry point: build the config, let it parse its inputs, then run the conversion.
+# NOTE(review): `parse()` presumably applies command-line overrides to the config — confirm.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/bprna_new/README.md b/multimolecule/datasets/bprna_new/README.md
new file mode 100644
index 00000000..3f0f16d0
--- /dev/null
+++ b/multimolecule/datasets/bprna_new/README.md
@@ -0,0 +1,68 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 1K<n<10K
+
+from __future__ import annotations
+
+import os
+from collections import namedtuple
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import get_files, save_dataset
+
+# Seed PyTorch's global RNG with a fixed value.
+torch.manual_seed(1016)
+# Record type with the fields seq, ss_label, length, name and pairs.
+# NOTE(review): not referenced anywhere in the visible part of this script — confirm it is still needed.
+RNA_SS_data = namedtuple("RNA_SS_data", "seq ss_label length name pairs")
+
+
+# Convert one bpseq file into a record with the keys "id", "sequence" and "secondary_structure".
+#
+# `bpseq` may be a path string or an object that already offers `.stem` (a str is wrapped in Path).
+# Every row must split into exactly three whitespace-separated fields: index, nucleotide and
+# paired index; any other field count makes the tuple unpacking raise ValueError.
+def convert_bpseq(bpseq):
+ if isinstance(bpseq, str):
+ bpseq = Path(bpseq)
+ with open(bpseq) as f:
+ lines = f.read().splitlines()
+ # Split each row on whitespace; purely numeric fields become ints, the rest stay strings.
+ lines = [[int(i) if i.isdigit() else i for i in j.split()] for j in lines]
+ # Start fully unpaired: one "." per row of the file.
+ sequence, structure = [], ["."] * len(lines)
+ for row in lines:
+ index, nucleotide, paired_index = row
+ sequence.append(nucleotide)
+ # Indices are used as 1-based positions. Each pair is written once, from its lower-index side;
+ # a paired index that is not greater than 0 leaves the position as ".".
+ # All pairs share "(" / ")", so crossing pairs are not distinguished in the output.
+ if paired_index > 0 and index < paired_index:
+ structure[index - 1] = "("
+ structure[paired_index - 1] = ")"
+ # The id is the file stem up to its first "-".
+ return {"id": bpseq.stem.split("-")[0], "sequence": "".join(sequence), "secondary_structure": "".join(structure)}
+
+
+# Convert every file found under `convert_config.dataset_path` with `convert_bpseq` (showing a tqdm
+# progress bar) and pass the resulting list of records to `save_dataset`.
+def convert_dataset(convert_config):
+ data = [convert_bpseq(file) for file in tqdm(get_files(convert_config.dataset_path))]
+ save_dataset(convert_config, data)
+
+
+# Conversion settings for this dataset, extending the shared ConvertConfig base.
+class ConvertConfig(ConvertConfig_):
+ # Directory that contains this script.
+ root: str = os.path.dirname(__file__)
+ # Name of that directory with every "_" replaced by "-", used as the default output path.
+ output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-")
+
+
+# Script entry point: build the config, let it parse its inputs, then run the conversion.
+# NOTE(review): `parse()` presumably applies command-line overrides to the config — confirm.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/bprna_spot/README.md b/multimolecule/datasets/bprna_spot/README.md
new file mode 100644
index 00000000..6e589352
--- /dev/null
+++ b/multimolecule/datasets/bprna_spot/README.md
@@ -0,0 +1,94 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 10K<n<100K
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+
+import torch
+from tqdm import tqdm
+
+from multimolecule.datasets.bprna.bprna import convert_sta
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import copy_readme, get_files, push_to_hub, write_data
+
+torch.manual_seed(1016)
+
+
+# Write each split in `data` to its own Parquet file, then copy the README and call `push_to_hub`.
+#
+# `data` maps a split name to the records of that split; every entry is written to
+# `<output_path>/<name>.parquet` with the given compression codec and level. `output_path` is
+# created first if it does not exist yet.
+def save_dataset(convert_config: ConvertConfig, data: Mapping, compression: str = "brotli", level: int = 4):
+ root, output_path = convert_config.root, convert_config.output_path
+ os.makedirs(output_path, exist_ok=True)
+ for name, d in data.items():
+ write_data(d, output_path, name + ".parquet", compression, level)
+ copy_readme(root, output_path)
+ push_to_hub(convert_config, output_path)
+
+
+# Convert every file found in the directory `dataset` with `convert_sta` (showing a tqdm progress
+# bar) and return the records as a list.
+def _convert_dataset(dataset):
+ files = get_files(dataset)
+ return [convert_sta(file) for file in tqdm(files, total=len(files))]
+
+
+# Convert the three sub-directories of `convert_config.dataset_path` and save them as named splits:
+# "TR0" becomes "train", "VL0" becomes "val" and "TS0" becomes "test".
+def convert_dataset(convert_config):
+ data = {
+ "train": _convert_dataset(os.path.join(convert_config.dataset_path, "TR0")),
+ "val": _convert_dataset(os.path.join(convert_config.dataset_path, "VL0")),
+ "test": _convert_dataset(os.path.join(convert_config.dataset_path, "TS0")),
+ }
+ save_dataset(convert_config, data)
+
+
+# Conversion settings for this dataset, extending the shared ConvertConfig base.
+class ConvertConfig(ConvertConfig_):
+ # Directory that contains this script.
+ root: str = os.path.dirname(__file__)
+ # Name of that directory with every "_" replaced by "-", used as the default output path.
+ output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-")
+
+
+# Script entry point: build the config, let it parse its inputs, then run the conversion.
+# NOTE(review): `parse()` presumably applies command-line overrides to the config — confirm.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/conversion_utils.py b/multimolecule/datasets/conversion_utils.py
new file mode 100644
index 00000000..14cc9c80
--- /dev/null
+++ b/multimolecule/datasets/conversion_utils.py
@@ -0,0 +1,97 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+import shutil
+
+import pyarrow as pa
+from chanfig import Config
+from pandas import DataFrame
+from pyarrow import Table
+
+try:
+ from huggingface_hub import HfApi
+except ImportError:
+ HfApi = None
+
+
+def get_files(path: str) -> list[str]:
+    """
+    Return the entries of the directory `path`, joined with `path`, in a natural (letters, then number) order.
+
+    The sort key of each joined path is the pair (all alphabetic characters, all digits read as one integer), so
+    `a2` sorts before `a10`. Names without any digit are given the number 0 instead of raising `ValueError`.
+
+    Args:
+        path: Directory to list.
+
+    Returns:
+        The joined paths, sorted.
+    """
+
+    def natural_key(name: str) -> tuple[str, int]:
+        letters = "".join(filter(str.isalpha, name))
+        digits = "".join(filter(str.isdigit, name))
+        # The "0" prefix keeps `int` from failing on an empty digit string and does not change any other value.
+        return letters, int("0" + digits)
+
+    return sorted((os.path.join(path, entry) for entry in os.listdir(path)), key=natural_key)
+
+
+def write_data(
+    data: Table | list | dict | DataFrame,
+    output_path: str,
+    filename: str = "data.parquet",
+    compression: str = "brotli",
+    level: int = 4,
+):
+    """
+    Write `data` to `output_path/filename` as a parquet file.
+
+    Args:
+        data: Records to store. A list goes through `Table.from_pylist`, a dict through `Table.from_pydict` and a
+            pandas DataFrame through `Table.from_pandas`; a pyarrow Table is written unchanged.
+        output_path: Directory that receives the file (it is not created here).
+        filename: Name of the parquet file inside `output_path`.
+        compression: Compression codec passed to pyarrow.
+        level: Compression level passed to pyarrow.
+
+    Raises:
+        ValueError: If `data` is none of the supported types.
+    """
+    if isinstance(data, list):
+        data = Table.from_pylist(data)
+    elif isinstance(data, dict):
+        data = Table.from_pydict(data)
+    elif isinstance(data, DataFrame):
+        data = Table.from_pandas(data)
+    if not isinstance(data, Table):
+        raise ValueError("Data must be a list, dict, pandas DataFrame, or pyarrow Table.")
+
+    # `import pyarrow` alone does not load the `parquet` submodule, so `pa.parquet` only resolves when some other
+    # module happened to import it first. Import it explicitly so this function does not depend on import order.
+    import pyarrow.parquet as pq
+
+    pq.write_table(data, os.path.join(output_path, filename), compression=compression, compression_level=level)
+
+
+def copy_readme(root: str, output_path: str):
+    """
+    Copy the README for a converted dataset from `root` into `output_path`.
+
+    `README.<output_path>.md` is used when `root` contains a file of that name; otherwise `README.md` is copied.
+    """
+    specific = f"README.{output_path}.md"
+    if specific in os.listdir(root):
+        readme = specific
+    else:
+        readme = "README.md"
+    shutil.copy2(os.path.join(root, readme), output_path)
+
+
+def push_to_hub(convert_config: ConvertConfig, output_path: str, repo_type: str = "dataset"):
+    """
+    Upload the folder `output_path` to the Hugging Face Hub when `convert_config.push_to_hub` is set.
+
+    Does nothing when `convert_config.push_to_hub` is falsy. Otherwise the repository `convert_config.repo_id` is
+    optionally deleted first (`convert_config.delete_existing`), created if missing, and the folder is uploaded.
+
+    Args:
+        convert_config: Conversion settings; `push_to_hub`, `delete_existing`, `repo_id` and `token` are read.
+        output_path: Local folder to upload.
+        repo_type: Hub repository type used for every Hub call made here.
+
+    Raises:
+        ImportError: If pushing is requested but `huggingface_hub` could not be imported.
+    """
+    if not convert_config.push_to_hub:
+        return
+    if HfApi is None:
+        raise ImportError("Please install huggingface_hub to push to the hub.")
+    api = HfApi()
+    repo_id, token = convert_config.repo_id, convert_config.token
+    if convert_config.delete_existing:
+        # Without `repo_type`, `delete_repo` targets a model repository; pass it so that the repository deleted
+        # is the same one that is created and uploaded to below.
+        api.delete_repo(repo_id, token=token, repo_type=repo_type, missing_ok=True)
+    api.create_repo(repo_id, token=token, exist_ok=True, repo_type=repo_type)
+    api.upload_folder(repo_id=repo_id, folder_path=output_path, token=token, repo_type=repo_type)
+
+
+def save_dataset(
+    convert_config: ConvertConfig, data: Table | list | dict | DataFrame, compression: str = "brotli", level: int = 4
+):
+    """
+    Store `data` as a single parquet file in `convert_config.output_path`, copy the README and push if requested.
+
+    Args:
+        convert_config: Conversion settings; `root` and `output_path` are read here.
+        data: Records handed to `write_data` (written under its default file name).
+        compression: Compression codec forwarded to `write_data`.
+        level: Compression level forwarded to `write_data`.
+    """
+    source_dir = convert_config.root
+    target_dir = convert_config.output_path
+    os.makedirs(target_dir, exist_ok=True)
+    write_data(data, target_dir, compression=compression, level=level)
+    copy_readme(source_dir, target_dir)
+    push_to_hub(convert_config, target_dir)
+
+
+class ConvertConfig(Config):
+    """
+    Options shared by the dataset conversion scripts.
+
+    `dataset_path`, `root` and `output_path` have no default here. The remaining fields control the optional
+    upload performed by `push_to_hub`.
+    """
+
+    dataset_path: str
+    root: str
+    output_path: str
+    push_to_hub: bool = False
+    delete_existing: bool = False
+    repo_id: str | None = None
+    token: str | None = None
+
+    def post(self):
+        """Default `repo_id` to `multimolecule/<output_path>` when none was given."""
+        # NOTE(review): presumably invoked by chanfig once the config has been parsed - confirm.
+        if self.repo_id is not None:
+            return
+        self.repo_id = f"multimolecule/{self.output_path}"
diff --git a/multimolecule/datasets/gencode/README.md b/multimolecule/datasets/gencode/README.md
new file mode 100644
index 00000000..f3b61cde
--- /dev/null
+++ b/multimolecule/datasets/gencode/README.md
@@ -0,0 +1,129 @@
+---
+language: dna
+tags:
+ - Biology
+ - DNA
+license:
+ - agpl-3.0
+size_categories:
+ - 100K.
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import torch
+from Bio import SeqIO
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+# Seed torch's global RNG as soon as the module is imported.
+# NOTE(review): nothing in this file's own code draws random numbers - confirm whether the seed is still needed.
+torch.manual_seed(1016)
+
+
+def convert_dataset(convert_config):
+    """
+    Read the FASTA file at `convert_config.dataset_path`, sort its records and save them.
+
+    Each record becomes `{"id": ..., "sequence": ...}`. Records are ordered by the lower-cased letters of their id
+    and then by the id's digits read as one integer (0 when the id has no digit).
+    """
+
+    def sort_key(entry):
+        identifier = entry["id"]
+        letters = "".join(filter(str.isalpha, identifier)).lower()
+        digits = "".join(filter(str.isdigit, identifier))
+        return letters, int("0" + digits)
+
+    data = []
+    for record in tqdm(SeqIO.parse(convert_config.dataset_path, format="fasta")):
+        data.append({"id": record.id, "sequence": str(record.seq)})
+    data.sort(key=sort_key)
+    save_dataset(convert_config, data)
+
+
+class ConvertConfig(ConvertConfig_):
+    """Conversion settings for GENCODE; the output name gets a species suffix derived from the input file name."""
+
+    root: str = os.path.dirname(__file__)
+    output_path: str = os.path.basename(os.path.dirname(__file__))
+
+    def post(self):
+        """
+        Append `-human` or `-mouse` to `output_path` and then run the parent hook.
+
+        Raises:
+            ValueError: If the stem of `dataset_path` starts with neither `GRCh` nor `GRCm`.
+        """
+        stem = Path(self.dataset_path).stem
+        for prefix, species in (("GRCh", "human"), ("GRCm", "mouse")):
+            if stem.startswith(prefix):
+                self.output_path = self.output_path + "-" + species
+                break
+        else:
+            raise ValueError(f"Unknown species: {stem}")
+        super().post()
+
+
+# Script entry point: build the config, parse it, then run the conversion.
+if __name__ == "__main__":
+    config = ConvertConfig()
+    config.parse()  # type: ignore[attr-defined]
+    convert_dataset(config)
diff --git a/multimolecule/datasets/rfam/README.md b/multimolecule/datasets/rfam/README.md
new file mode 100644
index 00000000..1108ec80
--- /dev/null
+++ b/multimolecule/datasets/rfam/README.md
@@ -0,0 +1,199 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 10M [!TIP]
+> The original Rfam dataset is licensed under the [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/) license and is available at [Rfam](https://rfam.org).
+
+## Citation
+
+```bibtex
+@article{kalvari2021rfam,
+ author = {Kalvari, Ioanna and Nawrocki, Eric P and Ontiveros-Palacios, Nancy and Argasinska, Joanna and Lamkiewicz, Kevin and Marz, Manja and Griffiths-Jones, Sam and Toffano-Nioche, Claire and Gautheret, Daniel and Weinberg, Zasha and Rivas, Elena and Eddy, Sean R and Finn, Robert D and Bateman, Alex and Petrov, Anton I},
+ copyright = {http://creativecommons.org/licenses/by/4.0/},
+ journal = {Nucleic Acids Research},
+ language = {en},
+ month = jan,
+ number = {D1},
+ pages = {D192--D200},
+ publisher = {Oxford University Press (OUP)},
+ title = {Rfam 14: expanded coverage of metagenomic, viral and {microRNA} families},
+ volume = 49,
+ year = 2021
+}
+
+@article{hufsky2021computational,
+ author = {Hufsky, Franziska and Lamkiewicz, Kevin and Almeida, Alexandre and Aouacheria, Abdel and Arighi, Cecilia and Bateman, Alex and Baumbach, Jan and Beerenwinkel, Niko and Brandt, Christian and Cacciabue, Marco and Chuguransky, Sara and Drechsel, Oliver and Finn, Robert D and Fritz, Adrian and Fuchs, Stephan and Hattab, Georges and Hauschild, Anne-Christin and Heider, Dominik and Hoffmann, Marie and H{\"o}lzer, Martin and Hoops, Stefan and Kaderali, Lars and Kalvari, Ioanna and von Kleist, Max and Kmiecinski, Ren{\'e} and K{\"u}hnert, Denise and Lasso, Gorka and Libin, Pieter and List, Markus and L{\"o}chel, Hannah F and Martin, Maria J and Martin, Roman and Matschinske, Julian and McHardy, Alice C and Mendes, Pedro and Mistry, Jaina and Navratil, Vincent and Nawrocki, Eric P and O'Toole, {\'A}ine Niamh and Ontiveros-Palacios, Nancy and Petrov, Anton I and Rangel-Pineros, Guillermo and Redaschi, Nicole and Reimering, Susanne and Reinert, Knut and Reyes, Alejandro and Richardson, Lorna and Robertson, David L and Sadegh, Sepideh and Singer, Joshua B and Theys, Kristof and Upton, Chris and Welzel, Marius and Williams, Lowri and Marz, Manja},
+ copyright = {http://creativecommons.org/licenses/by/4.0/},
+ journal = {Briefings in Bioinformatics},
+ month = mar,
+ number = 2,
+ pages = {642--663},
+ publisher = {Oxford University Press (OUP)},
+ title = {Computational strategies to combat {COVID-19}: useful tools to accelerate {SARS-CoV-2} and coronavirus research},
+ volume = 22,
+ year = 2021
+}
+
+@article{kalvari2018noncoding,
+ author = {Kalvari, Ioanna and Nawrocki, Eric P and Argasinska, Joanna and Quinones-Olvera, Natalia and Finn, Robert D and Bateman, Alex and Petrov, Anton I},
+ journal = {Current Protocols in Bioinformatics},
+ month = jun,
+ number = 1,
+ pages = {e51},
+ title = {Non-coding {RNA} analysis using the rfam database},
+ volume = 62,
+ year = 2018
+}
+
+@article{kalvari2018rfam,
+ author = {Kalvari, Ioanna and Argasinska, Joanna and Quinones-Olvera,
+ Natalia and Nawrocki, Eric P and Rivas, Elena and Eddy, Sean R
+ and Bateman, Alex and Finn, Robert D and Petrov, Anton I},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {D1},
+ pages = {D335--D342},
+ title = {Rfam 13.0: shifting to a genome-centric resource for non-coding {RNA} families},
+ volume = 46,
+ year = 2018
+}
+
+@article{nawrocki2015rfam,
+ author = {Nawrocki, Eric P and Burge, Sarah W and Bateman, Alex and Daub, Jennifer and Eberhardt, Ruth Y and Eddy, Sean R and Floden, Evan W and Gardner, Paul P and Jones, Thomas A and Tate, John and Finn, Robert D},
+ copyright = {http://creativecommons.org/licenses/by/4.0/},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {Database issue},
+ pages = {D130--7},
+ publisher = {Oxford University Press (OUP)},
+ title = {Rfam 12.0: updates to the {RNA} families database},
+ volume = 43,
+ year = 2015
+}
+
+@article{burge2013rfam,
+ author = {Burge, Sarah W and Daub, Jennifer and Eberhardt, Ruth and Tate, John and Barquist, Lars and Nawrocki, Eric P and Eddy, Sean R and Gardner, Paul P and Bateman, Alex},
+ copyright = {http://creativecommons.org/licenses/by-nc/3.0/},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {Database issue},
+ pages = {D226--32},
+ publisher = {Oxford University Press (OUP)},
+ title = {Rfam 11.0: 10 years of {RNA} families},
+ volume = 41,
+ year = 2013
+}
+
+@article{gardner2011rfam,
+ author = {Gardner, Paul P and Daub, Jennifer and Tate, John and Moore, Benjamin L and Osuch, Isabelle H and Griffiths-Jones, Sam and Finn, Robert D and Nawrocki, Eric P and Kolbe, Diana L and Eddy, Sean R and Bateman, Alex},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {Database issue},
+ pages = {D141--5},
+ title = {Rfam: Wikipedia, clans and the ``decimal'' release},
+ volume = 39,
+ year = 2011
+}
+
+@article{gardner2009rfam,
+ author = {Gardner, Paul P and Daub, Jennifer and Tate, John G and Nawrocki, Eric P and Kolbe, Diana L and Lindgreen, Stinus and Wilkinson, Adam C and Finn, Robert D and Griffiths-Jones, Sam and Eddy, Sean R and Bateman, Alex},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {Database issue},
+ pages = {D136--40},
+ title = {Rfam: updates to the {RNA} families database},
+ volume = 37,
+ year = 2009
+}
+
+@article{daub2008rna,
+ author = {Daub, Jennifer and Gardner, Paul P and Tate, John and Ramsk{\"o}ld, Daniel and Manske, Magnus and Scott, William G and Weinberg, Zasha and Griffiths-Jones, Sam and Bateman, Alex},
+ journal = {RNA},
+ month = dec,
+ number = 12,
+ pages = {2462--2464},
+ title = {The {RNA} {WikiProject}: community annotation of {RNA} families},
+ volume = 14,
+ year = 2008
+}
+
+@article{griffiths2005rfam,
+ author = {Griffiths-Jones, Sam and Moxon, Simon and Marshall, Mhairi and Khanna, Ajay and Eddy, Sean R. and Bateman, Alex},
+ doi = {10.1093/nar/gki081},
+ eprint = {https://academic.oup.com/nar/article-pdf/33/suppl\_1/D121/7622063/gki081.pdf},
+ issn = {0305-1048},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {suppl_1},
+ pages = {D121-D124},
+ title = {{Rfam: annotating non-coding RNAs in complete genomes}},
+ url = {https://doi.org/10.1093/nar/gki081},
+ volume = {33},
+ year = {2005}
+}
+
+@article{griffiths2003rfam,
+ author = {Griffiths-Jones, Sam and Bateman, Alex and Marshall, Mhairi and Khanna, Ajay and Eddy, Sean R.},
+ doi = {10.1093/nar/gkg006},
+ eprint = {https://academic.oup.com/nar/article-pdf/31/1/439/7125749/gkg006.pdf},
+ issn = {0305-1048},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {1},
+ pages = {439-441},
+ title = {{Rfam: an RNA family database}},
+ url = {https://doi.org/10.1093/nar/gkg006},
+ volume = {31},
+ year = {2003}
+}
+```
diff --git a/multimolecule/datasets/rfam/rfam.py b/multimolecule/datasets/rfam/rfam.py
new file mode 100644
index 00000000..78c92a48
--- /dev/null
+++ b/multimolecule/datasets/rfam/rfam.py
@@ -0,0 +1,57 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+from pathlib import Path
+
+import torch
+from Bio import SeqIO
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import get_files, save_dataset
+
+# Seed torch's global RNG as soon as the module is imported.
+# NOTE(review): nothing in this file's own code draws random numbers - confirm whether the seed is still needed.
+torch.manual_seed(1016)
+
+
+def convert_rfam(cm: str, fasta: str) -> Mapping:
+    """
+    Build one dataset entry from a family's `cm` file and its FASTA file.
+
+    The id is the second whitespace-separated field on the second line of `cm`
+    (NOTE(review): presumably the `NAME` line of the covariance model - confirm). Only the first two lines of `cm`
+    are read instead of loading the whole file.
+
+    Args:
+        cm: Path to the `cm` file.
+        fasta: Path to the FASTA file; it must have the same stem as `cm`.
+
+    Returns:
+        `{"id": ..., "sequences": [...]}` with every sequence of `fasta` as a string.
+
+    Raises:
+        ValueError: If the two file stems differ, or the second line of `cm` has fewer than two fields.
+    """
+    # Raise explicitly rather than `assert`: assertions are stripped under `python -O`.
+    if Path(cm).stem != Path(fasta).stem:
+        raise ValueError(f"Mismatched files: {cm} and {fasta} do not share a stem")
+    with open(cm) as f:
+        f.readline()  # the first line is not used
+        fields = f.readline().split()
+    if len(fields) < 2:
+        raise ValueError(f"Cannot read an id from the second line of {cm}")
+    return {"id": fields[1], "sequences": [str(s.seq) for s in SeqIO.parse(fasta, format="fasta")]}
+
+
+def convert_dataset(convert_config):
+    """
+    Pair the files under `<dataset_path>/cm` and `<dataset_path>/fasta`, convert each pair and save the result.
+
+    Both directories are listed with `get_files` and paired by position; entries are sorted by `id` before saving.
+
+    Raises:
+        ValueError: If the two directories do not contain the same number of files.
+    """
+    cms = get_files(os.path.join(convert_config.dataset_path, "cm"))
+    fastas = get_files(os.path.join(convert_config.dataset_path, "fasta"))
+    # Checked explicitly rather than with `assert`: under `python -O` the assertion disappears and `zip` would
+    # silently drop the surplus files.
+    if len(cms) != len(fastas):
+        raise ValueError(f"Found {len(cms)} cm files but {len(fastas)} fasta files in {convert_config.dataset_path}")
+    data = [convert_rfam(cm, fasta) for cm, fasta in tqdm(zip(cms, fastas), total=len(cms))]
+    data.sort(key=lambda s: s["id"])
+    save_dataset(convert_config, data)
+
+
+# Conversion settings for this dataset: `root` is the directory containing this file and `output_path` is that
+# directory's name.
+class ConvertConfig(ConvertConfig_):
+ root: str = os.path.dirname(__file__)
+ output_path: str = os.path.basename(os.path.dirname(__file__))
+
+
+# Script entry point: build the config, call `parse()` on it (presumably reads command-line options - confirm),
+# then run the conversion.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/rnacentral/README.md b/multimolecule/datasets/rnacentral/README.md
new file mode 100644
index 00000000..faa6e2f5
--- /dev/null
+++ b/multimolecule/datasets/rnacentral/README.md
@@ -0,0 +1,208 @@
+---
+language: rna
+tags:
+ - Biology
+ - RNA
+license:
+ - agpl-3.0
+size_categories:
+ - 10M [!TIP]
+> The original RNAcentral dataset is licensed under the [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/) license and is available at [RNAcentral](https://rnacentral.org).
+
+## Citation
+
+```bibtex
+@article{rnacentral2021,
+ author = {{RNAcentral Consortium}},
+ doi = {https://doi.org/10.1093/nar/gkaa921},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {D1},
+ pages = {D212--D220},
+ publisher = {Oxford University Press (OUP)},
+ title = {{RNAcentral} 2021: secondary structure integration, improved sequence search and new member databases},
+ url = {https://academic.oup.com/nar/article/49/D1/D212/5940500},
+ volume = 49,
+ year = 2021
+}
+
+@article{sweeney2020exploring,
+ author = {Sweeney, Blake A. and Tagmazian, Arina A. and Ribas, Carlos E. and Finn, Robert D. and Bateman, Alex and Petrov, Anton I.},
+ doi = {https://doi.org/10.1002/cpbi.104},
+ eprint = {https://currentprotocols.onlinelibrary.wiley.com/doi/pdf/10.1002/cpbi.104},
+ journal = {Current Protocols in Bioinformatics},
+ keywords = {Galaxy, ncRNA, non-coding RNA, RNAcentral, RNA-seq},
+ number = {1},
+ pages = {e104},
+ title = {Exploring Non-Coding RNAs in RNAcentral},
+ url = {https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.104},
+ volume = 71,
+ year = 2020
+}
+
+@article{rnacentral2019,
+ author = {{The RNAcentral Consortium}},
+ doi = {https://doi.org/10.1093/nar/gky1034},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {D1},
+ pages = {D221--D229},
+ publisher = {Oxford University Press (OUP)},
+ title = {{RNAcentral}: a hub of information for non-coding {RNA} sequences},
+ url = {https://academic.oup.com/nar/article/47/D1/D221/5160993},
+ volume = 47,
+ year = 2019
+}
+
+@article{rnacentral2017,
+ author = {{The RNAcentral Consortium} and Petrov, Anton I and Kay, Simon J E and Kalvari, Ioanna and Howe, Kevin L and Gray, Kristian A and Bruford, Elspeth A and Kersey, Paul J and Cochrane, Guy and Finn, Robert D and Bateman, Alex and Kozomara, Ana and Griffiths-Jones, Sam and Frankish, Adam and Zwieb, Christian W and Lau, Britney Y and Williams, Kelly P and Chan, Patricia P and Lowe, Todd M and Cannone, Jamie J and Gutell, Robin and Machnicka, Magdalena A and Bujnicki, Janusz M and Yoshihama, Maki and Kenmochi, Naoya and Chai, Benli and Cole, James R and Szymanski, Maciej and Karlowski, Wojciech M and Wood, Valerie and Huala, Eva and Berardini, Tanya Z and Zhao, Yi and Chen, Runsheng and Zhu, Weimin and Paraskevopoulou, Maria D and Vlachos, Ioannis S and Hatzigeorgiou, Artemis G and Ma, Lina and Zhang, Zhang and Puetz, Joern and Stadler, Peter F and McDonald, Daniel and Basu, Siddhartha and Fey, Petra and Engel, Stacia R and Cherry, J Michael and Volders, Pieter-Jan and Mestdagh, Pieter and Wower, Jacek and Clark, Michael B and Quek, Xiu Cheng and Dinger, Marcel E},
+ doi = {https://doi.org/10.1093/nar/gkw1008},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {D1},
+ pages = {D128--D134},
+ publisher = {Oxford University Press (OUP)},
+ title = {{RNAcentral}: a comprehensive database of non-coding {RNA} sequences},
+ url = {https://academic.oup.com/nar/article/45/D1/D128/2333921},
+ volume = 45,
+ year = 2017
+}
+
+@article{rnacentral2015,
+ author = {{RNAcentral Consortium} and Petrov, Anton I and Kay, Simon J E and Gibson, Richard and Kulesha, Eugene and Staines, Dan and Bruford, Elspeth A and Wright, Mathew W and Burge, Sarah and Finn, Robert D and Kersey, Paul J and Cochrane, Guy and Bateman, Alex and Griffiths-Jones, Sam and Harrow, Jennifer and Chan, Patricia P and Lowe, Todd M and Zwieb, Christian W and Wower, Jacek and Williams, Kelly P and Hudson, Corey M and Gutell, Robin and Clark, Michael B and Dinger, Marcel and Quek, Xiu Cheng and Bujnicki, Janusz M and Chua, Nam-Hai and Liu, Jun and Wang, Huan and Skogerb{\o}, Geir and Zhao, Yi and Chen, Runsheng and Zhu, Weimin and Cole, James R and Chai, Benli and Huang, Hsien-Da and Huang, Hsi-Yuan and Cherry, J Michael and Hatzigeorgiou, Artemis and Pruitt, Kim D},
+ doi = {https://doi.org/10.1093/nar/gku991},
+ journal = {Nucleic Acids Research},
+ month = jan,
+ number = {Database issue},
+ pages = {D123--D129},
+ title = {{RNAcentral}: an international database of {ncRNA} sequences},
+ url = {https://academic.oup.com/nar/article/43/D1/D123/2439941},
+ volume = 43,
+ year = 2015
+}
+
+@article{bateman2011rnacentral,
+ author = {Bateman, Alex and Agrawal, Shipra and Birney, Ewan and Bruford, Elspeth A and Bujnicki, Janusz M and Cochrane, Guy and Cole, James R and Dinger, Marcel E and Enright, Anton J and Gardner, Paul P and Gautheret, Daniel and Griffiths-Jones, Sam and Harrow, Jen and Herrero, Javier and Holmes, Ian H and Huang, Hsien-Da and Kelly, Krystyna A and Kersey, Paul and Kozomara, Ana and Lowe, Todd M and Marz, Manja and Moxon, Simon and Pruitt, Kim D and Samuelsson, Tore and Stadler, Peter F and Vilella, Albert J and Vogel, Jan-Hinnerk and Williams, Kelly P and Wright, Mathew W and Zwieb, Christian},
+ doi = {https://doi.org/10.1261/rna.2750811},
+ journal = {RNA},
+ month = nov,
+ number = 11,
+ pages = {1941--1946},
+ publisher = {Cold Spring Harbor Laboratory},
+ title = {{RNAcentral}: A vision for an international database of {RNA} sequences},
+ url = {https://rnajournal.cshlp.org/content/17/11/1941.long},
+ volume = 17,
+ year = 2011
+}
+```
diff --git a/multimolecule/datasets/rnacentral/modifications.py b/multimolecule/datasets/rnacentral/modifications.py
new file mode 100644
index 00000000..c74f06a9
--- /dev/null
+++ b/multimolecule/datasets/rnacentral/modifications.py
@@ -0,0 +1,84 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+
+import torch
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+from multimolecule.datasets.rnacentral.utils import execute
+
+# Seed torch's global RNG as soon as the module is imported.
+# NOTE(review): nothing in this file's own code draws random numbers - confirm whether the seed is still needed.
+torch.manual_seed(1016)
+
+# SQL passed to `execute`: rows of `rnc_modifications` joined (on `upi`) to their sequence in `rna` and to the
+# `rnc_rna_precomputed` rows that are active and have no taxid.
+# The `modification_` prefix of the aliases matters: `convert_dataset` collects every column whose name starts
+# with "modification" into per-`urs` lists.
+command = """
+WITH active_rna_precomputed AS (
+ SELECT *
+ FROM rnc_rna_precomputed
+ WHERE is_active = True AND taxid IS NULL
+)
+SELECT
+ rna.upi AS urs,
+ rna.id AS rna_id,
+ rna.seq_short AS sequence,
+ rnc_modifications.id AS modification_id,
+ rnc_modifications.modification_id AS modification_type,
+ rnc_modifications.position AS modification_position,
+ rnc_modifications.accession AS modification_accession,
+ active_rna_precomputed.rna_type,
+ active_rna_precomputed.so_rna_type
+FROM
+ rnc_modifications
+JOIN
+ rna
+ON
+ rnc_modifications.upi = rna.upi
+JOIN
+ active_rna_precomputed
+ON
+ rnc_modifications.upi = active_rna_precomputed.upi
+"""
+
+
+def convert_dataset(config: ConvertConfig):
+    """
+    Fetch the modification rows, fold them into one row per `urs` and save the resulting DataFrame.
+
+    Columns whose name starts with "modification" are gathered per `urs` and replaced by a single `modifications`
+    column: a dict keyed by modification id (in ascending id order) whose values map the remaining
+    `modification_*` column names, with that prefix removed, to the value for that modification. Every other
+    column keeps the first value of its group.
+    """
+
+    def collapse(column):
+        if column.name.startswith("modification"):
+            return list(column)
+        return column.iloc[0]
+
+    def gather(row):
+        by_id = sorted(enumerate(row["modification_id"]), key=lambda pair: pair[1])
+        return {
+            # col[13:] drops the 13-character "modification_" prefix
+            mod_id: {col[13:]: row[col][pos] for col in modification_columns if col != "modification_id"}
+            for pos, mod_id in by_id
+        }
+
+    df = execute(command)
+    df = df.groupby("urs").agg(collapse)
+    df.sort_values(["urs", "rna_id"], inplace=True)
+    df.reset_index(inplace=True)
+    modification_columns = [col for col in df.keys() if col.startswith("modification")]
+    modifications = df.apply(gather, axis=1)
+    df.insert(3, "modifications", modifications)
+    df = df.drop(columns=modification_columns)
+    save_dataset(config, df)
+
+
+# Conversion settings for this script: `root` is the directory containing this file and `output_path` is that
+# directory's name followed by "-df".
+# NOTE(review): the "-df" suffix says little about the content (modifications) - confirm it is the intended name.
+class ConvertConfig(ConvertConfig_):
+ root: str = os.path.dirname(__file__)
+ output_path: str = os.path.basename(os.path.dirname(__file__)) + "-df"
+
+
+# Script entry point: build the config, call `parse()` on it (presumably reads command-line options - confirm),
+# then run the conversion.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/rnacentral/rnacentral.py b/multimolecule/datasets/rnacentral/rnacentral.py
new file mode 100644
index 00000000..958e622a
--- /dev/null
+++ b/multimolecule/datasets/rnacentral/rnacentral.py
@@ -0,0 +1,59 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+
+import torch
+from Bio import SeqIO
+from tqdm import tqdm
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+
+# Seed torch's global RNG as soon as the module is imported.
+# NOTE(review): nothing in this file's own code draws random numbers - confirm whether the seed is still needed.
+torch.manual_seed(1016)
+
+
+def convert_dataset(convert_config):
+    """
+    Convert the FASTA file at `convert_config.dataset_path` into a list of records and save it.
+
+    Each record holds the id (`urs`), the sequence (cut to `max_seq_len` characters when that option is set), the
+    second whitespace-separated word of the FASTA description (`type`) and the full description.
+    """
+    max_seq_len = convert_config.max_seq_len
+    data = []
+    for record in tqdm(SeqIO.parse(convert_config.dataset_path, format="fasta")):
+        sequence = str(record.seq)
+        if max_seq_len is not None:
+            sequence = sequence[:max_seq_len]
+        data.append(
+            {
+                "urs": record.id,
+                "sequence": sequence,
+                "type": record.description.split()[1],
+                "description": record.description,
+            }
+        )
+    save_dataset(convert_config, data)
+
+
+class ConvertConfig(ConvertConfig_):
+    """Conversion settings for RNAcentral, with an optional limit on the stored sequence length."""
+
+    max_seq_len: int | None = None
+    root: str = os.path.dirname(__file__)
+    output_path: str = os.path.basename(os.path.dirname(__file__))
+
+    def post(self):
+        """Append `-<max_seq_len>` to `output_path` when a limit is set, then run the parent hook."""
+        limit = self.max_seq_len
+        if limit is not None:
+            self.output_path = "{}-{}".format(self.output_path, limit)
+        super().post()
+
+
+# Script entry point: build the config, parse it, then run the conversion.
+if __name__ == "__main__":
+    config = ConvertConfig()
+    config.parse()  # type: ignore[attr-defined]
+    convert_dataset(config)
diff --git a/multimolecule/datasets/rnacentral/secondary_structure.py b/multimolecule/datasets/rnacentral/secondary_structure.py
new file mode 100644
index 00000000..60a62236
--- /dev/null
+++ b/multimolecule/datasets/rnacentral/secondary_structure.py
@@ -0,0 +1,82 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import os
+
+import torch
+
+from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_
+from multimolecule.datasets.conversion_utils import save_dataset
+from multimolecule.datasets.rnacentral.utils import execute
+
+# Seed torch's global RNG as soon as the module is imported.
+# NOTE(review): nothing in this file's own code draws random numbers - confirm whether the seed is still needed.
+torch.manual_seed(1016)
+
+# SQL passed to `execute`: rows of `rnc_modifications` joined (on `upi`) to their sequence in `rna` and to the
+# `rnc_rna_precomputed` rows that are active and have no taxid.
+# NOTE(review): this file is named secondary_structure.py, yet the query only reads modification data and the
+# config below writes to "<dir>-modifications" - confirm that file name and content are as intended.
+command = """
+WITH active_rna_precomputed AS (
+ SELECT *
+ FROM rnc_rna_precomputed
+ WHERE is_active = True AND taxid IS NULL
+)
+SELECT
+ rna.upi,
+ rna.id AS rna_id,
+ rna.seq_short AS sequence,
+ rnc_modifications.id AS modification_id,
+ rnc_modifications.modification_id AS modification,
+ rnc_modifications.position,
+ rnc_modifications.accession,
+ active_rna_precomputed.rna_type,
+ active_rna_precomputed.last_release,
+ active_rna_precomputed.so_rna_type,
+ active_rna_precomputed.short_description AS description,
+ active_rna_precomputed.databases,
+ rna.crc64,
+ rna.md5
+FROM
+ rnc_modifications
+JOIN
+ rna
+ON
+ rnc_modifications.upi = rna.upi
+JOIN
+ active_rna_precomputed
+ON
+ rnc_modifications.upi = active_rna_precomputed.upi
+"""
+
+# Columns that `convert_dataset` gathers into a list per `upi`; every other column keeps the first value of its
+# group.
+# NOTE(review): `accession` is also selected from `rnc_modifications` but is not listed here, so only the first
+# accession per `upi` survives - confirm this is intended.
+UNIQUE_COLUMNS = {"modification_id", "modification", "position"}
+
+
+def convert_dataset(config: ConvertConfig):
+    """
+    Fetch the query result, fold it into one row per `upi` and save the resulting DataFrame.
+
+    Rows are sorted by `upi` and `modification_id` first. Within each `upi`, the columns named in `UNIQUE_COLUMNS`
+    become lists and all other columns keep their first value. `databases` is split on commas into a list.
+    """
+
+    def collapse(column):
+        if column.name in UNIQUE_COLUMNS:
+            return list(column)
+        return column.iloc[0]
+
+    frame = execute(command)
+    frame.sort_values(["upi", "modification_id"], inplace=True)
+    frame = frame.groupby("upi").agg(collapse)
+    frame.reset_index(inplace=True)
+    frame["databases"] = frame["databases"].apply(lambda names: names.split(","))
+    save_dataset(config, frame)
+
+
+# Conversion settings for this script: `root` is the directory containing this file and `output_path` is that
+# directory's name followed by "-modifications".
+class ConvertConfig(ConvertConfig_):
+ root: str = os.path.dirname(__file__)
+ output_path: str = os.path.basename(os.path.dirname(__file__)) + "-modifications"
+
+
+# Script entry point: build the config, call `parse()` on it (presumably reads command-line options - confirm),
+# then run the conversion.
+if __name__ == "__main__":
+ config = ConvertConfig()
+ config.parse() # type: ignore[attr-defined]
+ convert_dataset(config)
diff --git a/multimolecule/datasets/rnacentral/utils.py b/multimolecule/datasets/rnacentral/utils.py
new file mode 100644
index 00000000..24b6e645
--- /dev/null
+++ b/multimolecule/datasets/rnacentral/utils.py
@@ -0,0 +1,71 @@
+# MultiMolecule
+# Copyright (C) 2024-Present MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+from functools import wraps
+from typing import Callable
+
+import pandas as pd
+import psycopg2
+from psycopg2 import InterfaceError, OperationalError
+
+# Default libpq connection string used by `connect` and `execute`.
+# NOTE(review): credentials are hard-coded; presumably this is RNAcentral's published read-only public account -
+# confirm before merging, and consider allowing an override for private mirrors.
+CONN_STRING = "host='hh-pgsql-public.ebi.ac.uk' dbname='pfmegrnargs' user='reader' password='NWDMCE5xdipIjRrp'"
+
+
+def _close_quietly(conn) -> None:
+    """Close `conn`, ignoring psycopg2 errors raised while closing an already broken connection."""
+    try:
+        conn.close()
+    except psycopg2.Error:
+        # Best effort only: the connection is being discarded anyway.
+        pass
+
+
+def connect(conn_string: str = CONN_STRING) -> Callable:
+    """
+    Build a decorator that supplies a PostgreSQL connection as the `conn` keyword argument.
+
+    A connection passed by the caller as `conn=` is used as is. Otherwise one is opened with `conn_string`. When the
+    wrapped function raises `OperationalError` or `InterfaceError`, a new connection is opened and the function is
+    called again from the start; this repeats without limit for as long as reconnecting succeeds.
+
+    Connections opened by the wrapper are closed when the call returns or fails, so calls no longer leak one
+    connection each. A connection supplied by the caller is never closed here. The wrapped function must therefore
+    not return objects that still need the connection.
+
+    Args:
+        conn_string: libpq connection string used whenever the wrapper has to open a connection.
+
+    Returns:
+        The decorator.
+    """
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            conn = kwargs.pop("conn", None)
+            owned = False  # True while `conn` refers to a connection opened by this wrapper
+            try:
+                while True:
+                    try:
+                        if conn is None:
+                            conn = psycopg2.connect(conn_string)
+                            owned = True
+                        kwargs["conn"] = conn
+                        return func(*args, **kwargs)
+                    except (OperationalError, InterfaceError):
+                        print("Connection lost. Reconnecting...")
+                        if owned and conn is not None:
+                            _close_quietly(conn)
+                        # Forget the old connection first so a failing reconnect leaves nothing to close twice.
+                        conn, owned = None, False
+                        conn = psycopg2.connect(conn_string)
+                        owned = True
+            finally:
+                if owned and conn is not None:
+                    _close_quietly(conn)
+
+        return wrapper
+
+    return decorator
+
+
+@connect(conn_string=CONN_STRING)
+def execute(query, filter: Callable | None = None, batch_size: int = 1_000_000, eager_break: bool = True, conn=None):
+    """
+    Run `query` page by page with `LIMIT`/`OFFSET` and return all pages as one DataFrame.
+
+    Args:
+        query: SQL text. Every `;` is removed so that the paging clause can be appended.
+        filter: Optional callable applied to each page's DataFrame before it is kept. (The name shadows the
+            builtin; it is kept because it is part of the signature.)
+        batch_size: Number of rows requested per page.
+        eager_break: Stop as soon as a page comes back with fewer than `batch_size` rows, which saves the final
+            empty query.
+        conn: Database connection; supplied by the `connect` decorator when not passed.
+
+    Returns:
+        The concatenated pages, or `None` when the query returns no rows.
+    """
+    # NOTE(review): OFFSET paging is only stable when `query` orders its rows deterministically; the queries in
+    # this package have no ORDER BY - confirm that pages cannot overlap or skip rows.
+    offset = 0
+    pages = []
+    query = query.replace(";", "")
+
+    while True:
+        with conn.cursor() as cursor:
+            cursor.execute(f"{query} LIMIT %s OFFSET %s", [batch_size, offset])
+            rows = cursor.fetchall()
+            # Read the column names while the cursor is still open.
+            columns = [desc[0] for desc in cursor.description]
+        if not rows:
+            break
+        page = pd.DataFrame.from_dict(dict(zip(columns, zip(*rows))))
+        if filter is not None:
+            page = filter(page)
+        pages.append(page)
+        # Advance by the number of rows fetched, not by the filtered length: otherwise rows dropped by `filter`
+        # make the next page overlap with this one.
+        offset += len(rows)
+        print(f"Fetched {offset} data so far...")
+        if eager_break and len(rows) < batch_size:
+            break
+
+    # One concat at the end instead of re-copying the accumulated frame for every page.
+    return pd.concat(pages) if pages else None
diff --git a/multimolecule/models/calm/convert_checkpoint.py b/multimolecule/models/calm/convert_checkpoint.py
index efa50d53..75ba749c 100644
--- a/multimolecule/models/calm/convert_checkpoint.py
+++ b/multimolecule/models/calm/convert_checkpoint.py
@@ -17,7 +17,6 @@
from __future__ import annotations
import os
-from dataclasses import dataclass
import chanfig
import danling as dl
@@ -166,7 +165,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/ernierna/README.ernierna.ss.md b/multimolecule/models/ernierna/README.ernierna.ss.md
index ab48ee4e..592d0761 100644
--- a/multimolecule/models/ernierna/README.ernierna.ss.md
+++ b/multimolecule/models/ernierna/README.ernierna.ss.md
@@ -193,7 +193,8 @@ ERNIE-RNA used Masked Language Modeling (MLM) as the pre-training objective: tak
### Training Data
-The ERNIE-RNA model was pre-trained on [RNAcentral](https://rnacentral.org). RNAcentral is a comprehensive database of non-coding RNA sequences from a wide range of species. It combines 47 different databases, adding up to around 34 million RNA sequences in total.
+The ERNIE-RNA model was pre-trained on [RNAcentral](https://multimolecule.danling.org/datasets/rnacentral/).
+RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of [Expert Databases](https://rnacentral.org/expert-databases) representing a broad range of organisms and RNA types.
ERNIE-RNA applied [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) with a cut-off at 100% sequence identity to remove redundancy from the RNAcentral, resulting 25 million unique sequences. Sequences longer than 1024 nucleotides were subsequently excluded. The final dataset contains 20.4 million non-redundant RNA sequences.
ERNIE-RNA preprocessed all tokens by replacing "T"s with "S"s.
diff --git a/multimolecule/models/ernierna/README.md b/multimolecule/models/ernierna/README.md
index 702900f6..32a9ba6b 100644
--- a/multimolecule/models/ernierna/README.md
+++ b/multimolecule/models/ernierna/README.md
@@ -192,7 +192,8 @@ ERNIE-RNA used Masked Language Modeling (MLM) as the pre-training objective: tak
### Training Data
-The ERNIE-RNA model was pre-trained on [RNAcentral](https://rnacentral.org). RNAcentral is a comprehensive database of non-coding RNA sequences from a wide range of species. It combines 47 different databases, adding up to around 34 million RNA sequences in total.
+The ERNIE-RNA model was pre-trained on [RNAcentral](https://multimolecule.danling.org/datasets/rnacentral/).
+RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of [Expert Databases](https://rnacentral.org/expert-databases) representing a broad range of organisms and RNA types.
ERNIE-RNA applied [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) with a cut-off at 100% sequence identity to remove redundancy from the RNAcentral, resulting 25 million unique sequences. Sequences longer than 1024 nucleotides were subsequently excluded. The final dataset contains 20.4 million non-redundant RNA sequences.
ERNIE-RNA preprocessed all tokens by replacing "T"s with "S"s.
diff --git a/multimolecule/models/ernierna/convert_checkpoint.py b/multimolecule/models/ernierna/convert_checkpoint.py
index 3b1ca703..b047d3e3 100644
--- a/multimolecule/models/ernierna/convert_checkpoint.py
+++ b/multimolecule/models/ernierna/convert_checkpoint.py
@@ -17,7 +17,6 @@
from __future__ import annotations
import os
-from dataclasses import dataclass
import torch
@@ -131,7 +130,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/rinalmo/README.md b/multimolecule/models/rinalmo/README.md
index 14384f12..fba09dac 100644
--- a/multimolecule/models/rinalmo/README.md
+++ b/multimolecule/models/rinalmo/README.md
@@ -191,7 +191,8 @@ RiNALMo used Masked Language Modeling (MLM) as the pre-training objective: takin
### Training Data
-The RiNALMo model was pre-trained on a cocktail of databases including [RNAcentral](https://rnacentral.org), [Rfam](https://rfam.org), [Ensembl Genome Browser](https://ensembl.org), and [Nucleotide](https://ncbi.nlm.nih.gov/nucleotide). The training data contains 36 million unique ncRNA sequences.
+The RiNALMo model was pre-trained on a cocktail of databases including [RNAcentral](https://rnacentral.org), [Rfam](https://rfam.org), [Ensembl Genome Browser](https://ensembl.org), and [Nucleotide](https://ncbi.nlm.nih.gov/nucleotide).
+The training data contains 36 million unique ncRNA sequences.
To ensure sequence diversity in each training batch, RiNALMo clustered the sequences with [MMSeqs2](https://github.com/soedinglab/MMseqs2) into 17 million clusters and then sampled each sequence in the batch from a different cluster.
diff --git a/multimolecule/models/rinalmo/convert_checkpoint.py b/multimolecule/models/rinalmo/convert_checkpoint.py
index 65f216b6..868ec5dc 100644
--- a/multimolecule/models/rinalmo/convert_checkpoint.py
+++ b/multimolecule/models/rinalmo/convert_checkpoint.py
@@ -17,7 +17,6 @@
from __future__ import annotations
import os
-from dataclasses import dataclass
import torch
@@ -118,7 +117,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/rnabert/README.md b/multimolecule/models/rnabert/README.md
index cb316c0f..4cf22c34 100644
--- a/multimolecule/models/rnabert/README.md
+++ b/multimolecule/models/rnabert/README.md
@@ -199,8 +199,10 @@ RNABERT has two pre-training objectives: masked language modeling (MLM) and stru
### Training Data
-The RNABERT model was pre-trained on [RNAcentral](https://rnacentral.org). RNAcentral is a comprehensive database of non-coding RNA sequences from a wide range of species. It combines 47 different databases, adding up to around 27 million RNA sequences in total. RNABERT used a subset of 76, 237 human ncRNA sequences from RNAcentral for pre-training.
+The RNABERT model was pre-trained on [RNAcentral](https://multimolecule.danling.org/datasets/rnacentral/).
+RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of [Expert Databases](https://rnacentral.org/expert-databases) representing a broad range of organisms and RNA types.
+RNABERT used a subset of 76,237 human ncRNA sequences from RNAcentral for pre-training.
RNABERT preprocessed all tokens by replacing "U"s with "T"s.
Note that during model conversions, "T" is replaced with "U". [`RnaTokenizer`][multimolecule.RnaTokenizer] will convert "T"s to "U"s for you, you may disable this behaviour by passing `replace_T_with_U=False`.
diff --git a/multimolecule/models/rnabert/convert_checkpoint.py b/multimolecule/models/rnabert/convert_checkpoint.py
index ed3be38a..178f2f76 100644
--- a/multimolecule/models/rnabert/convert_checkpoint.py
+++ b/multimolecule/models/rnabert/convert_checkpoint.py
@@ -18,7 +18,6 @@
import os
from copy import deepcopy
-from dataclasses import dataclass
import torch
@@ -83,7 +82,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/rnaernie/README.md b/multimolecule/models/rnaernie/README.md
index a62663b6..5980583b 100644
--- a/multimolecule/models/rnaernie/README.md
+++ b/multimolecule/models/rnaernie/README.md
@@ -208,7 +208,8 @@ RNAErnie used Masked Language Modeling (MLM) as the pre-training objective: taki
### Training Data
-The RNAErnie model was pre-trained on [RNAcentral](https://rnacentral.org). RNAcentral is a comprehensive database of non-coding RNA sequences from a wide range of species. It combines 47 different databases, adding up to around 34 million RNA sequences in total.
+The RNAErnie model was pre-trained on [RNAcentral](https://multimolecule.danling.org/datasets/rnacentral/).
+RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of [Expert Databases](https://rnacentral.org/expert-databases) representing a broad range of organisms and RNA types.
RNAErnie used a subset of RNAcentral for pre-training. The subset contains 23 million sequences.
RNAErnie preprocessed all tokens by replacing "T"s with "S"s.
diff --git a/multimolecule/models/rnaernie/convert_checkpoint.py b/multimolecule/models/rnaernie/convert_checkpoint.py
index caeeb933..94231e8c 100644
--- a/multimolecule/models/rnaernie/convert_checkpoint.py
+++ b/multimolecule/models/rnaernie/convert_checkpoint.py
@@ -17,7 +17,6 @@
from __future__ import annotations
import os
-from dataclasses import dataclass
import torch
@@ -130,7 +129,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/rnafm/README.md b/multimolecule/models/rnafm/README.md
index b879f8e3..429be6d7 100644
--- a/multimolecule/models/rnafm/README.md
+++ b/multimolecule/models/rnafm/README.md
@@ -224,7 +224,8 @@ RNA-FM used Masked Language Modeling (MLM) as the pre-training objective: taking
### Training Data
-The RNA-FM model was pre-trained on [RNAcentral](https://rnacentral.org). RNAcentral is a comprehensive database of non-coding RNA sequences from a wide range of species. It combines 47 different databases, adding up to around 27 million RNA sequences in total.
+The RNA-FM model was pre-trained on [RNAcentral](https://multimolecule.danling.org/datasets/rnacentral/).
+RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of [Expert Databases](https://rnacentral.org/expert-databases) representing a broad range of organisms and RNA types.
RNA-FM applied [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) with a cut-off at 100% sequence identity to remove redundancy from the RNAcentral. The final dataset contains 23.7 million non-redundant RNA sequences.
diff --git a/multimolecule/models/rnafm/README.mrnafm.md b/multimolecule/models/rnafm/README.mrnafm.md
index a7ff8a5e..4692ae78 100644
--- a/multimolecule/models/rnafm/README.mrnafm.md
+++ b/multimolecule/models/rnafm/README.mrnafm.md
@@ -224,7 +224,8 @@ RNA-FM used Masked Language Modeling (MLM) as the pre-training objective: taking
### Training Data
-The RNA-FM model was pre-trained on [RNAcentral](https://rnacentral.org). RNAcentral is a comprehensive database of non-coding RNA sequences from a wide range of species. It combines 47 different databases, adding up to around 27 million RNA sequences in total.
+The RNA-FM model was pre-trained on [RNAcentral](https://multimolecule.danling.org/datasets/rnacentral/).
+RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of [Expert Databases](https://rnacentral.org/expert-databases) representing a broad range of organisms and RNA types.
RNA-FM applied [CD-HIT (CD-HIT-EST)](https://sites.google.com/view/cd-hit) with a cut-off at 100% sequence identity to remove redundancy from the RNAcentral. The final dataset contains 23.7 million non-redundant RNA sequences.
diff --git a/multimolecule/models/rnafm/convert_checkpoint.py b/multimolecule/models/rnafm/convert_checkpoint.py
index 83a51dc5..6b09cbc6 100644
--- a/multimolecule/models/rnafm/convert_checkpoint.py
+++ b/multimolecule/models/rnafm/convert_checkpoint.py
@@ -18,7 +18,6 @@
import os
from copy import deepcopy
-from dataclasses import dataclass
import chanfig
import torch
@@ -213,7 +212,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/rnamsm/README.md b/multimolecule/models/rnamsm/README.md
index 0bd3e878..37e23a1d 100644
--- a/multimolecule/models/rnamsm/README.md
+++ b/multimolecule/models/rnamsm/README.md
@@ -198,7 +198,9 @@ RNA-MSM used Masked Language Modeling (MLM) as the pre-training objective: takin
### Training Data
-The RNA-MSM model was pre-trained on [Rfam](https://rfam.org). Rfam database is a collection of RNA families, each represented by multiple sequence alignments, consensus secondary structures and covariance models. RNA-MSM used Rfam 14.7 which contains 4,069 RNA families.
+The RNA-MSM model was pre-trained on [Rfam](https://rfam.org).
+The Rfam database is a collection of RNA sequence families of structural RNAs including non-coding RNA genes as well as cis-regulatory elements.
+RNA-MSM used Rfam 14.7 which contains 4,069 RNA families.
To avoid potential overfitting in structural inference, RNA-MSM excluded families with experimentally determined structures, such as ribosomal RNAs, transfer RNAs, and small nuclear RNAs. The final dataset contains 3,932 RNA families. The median value for the number of MSA sequences for these families by RNAcmap3 is 2,184.
diff --git a/multimolecule/models/rnamsm/convert_checkpoint.py b/multimolecule/models/rnamsm/convert_checkpoint.py
index 83df7ed9..6702621b 100644
--- a/multimolecule/models/rnamsm/convert_checkpoint.py
+++ b/multimolecule/models/rnamsm/convert_checkpoint.py
@@ -18,7 +18,6 @@
import os
from copy import deepcopy
-from dataclasses import dataclass
import __main__
import chanfig
@@ -92,7 +91,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/multimolecule/models/splicebert/README.md b/multimolecule/models/splicebert/README.md
index 6206e8bc..16b69857 100644
--- a/multimolecule/models/splicebert/README.md
+++ b/multimolecule/models/splicebert/README.md
@@ -226,7 +226,8 @@ SpliceBERT used Masked Language Modeling (MLM) as the pre-training objective: ta
### Training Data
-The SpliceBERT model was pre-trained on messenger RNA precursor sequences from [UCSC Genome Browser](https://genome.ucsc.edu). UCSC Genome Browser provides visualization, analysis, and download of comprehensive vertebrate genome data with aligned annotation tracks (known genes, predicted genes, ESTs, mRNAs, CpG islands, etc.).
+The SpliceBERT model was pre-trained on messenger RNA precursor sequences from [UCSC Genome Browser](https://genome.ucsc.edu).
+UCSC Genome Browser provides visualization, analysis, and download of comprehensive vertebrate genome data with aligned annotation tracks (known genes, predicted genes, ESTs, mRNAs, CpG islands, etc.).
SpliceBERT collected reference genomes and gene annotations from the UCSC Genome Browser for 72 vertebrate species. It applied [bedtools getfasta](https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html) to extract pre-mRNA sequences from the reference genomes based on the gene annotations. The pre-mRNA sequences are then used to pre-train SpliceBERT. The pre-training data contains 2 million pre-mRNA sequences with a total length of 65 billion nucleotides.
diff --git a/multimolecule/models/splicebert/convert_checkpoint.py b/multimolecule/models/splicebert/convert_checkpoint.py
index 8f73c7e1..2903900b 100644
--- a/multimolecule/models/splicebert/convert_checkpoint.py
+++ b/multimolecule/models/splicebert/convert_checkpoint.py
@@ -17,7 +17,6 @@
from __future__ import annotations
import os
-from dataclasses import dataclass
import chanfig
import torch
@@ -92,7 +91,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str | None = None # type: ignore[assignment]
diff --git a/multimolecule/models/utrbert/README.md b/multimolecule/models/utrbert/README.md
index e310c8b2..058f2127 100644
--- a/multimolecule/models/utrbert/README.md
+++ b/multimolecule/models/utrbert/README.md
@@ -233,7 +233,8 @@ output = model(**input, labels=label)
### Training Data
-The 3UTRBERT model was pre-trained on human mRNA transcript sequences from [GENCODE](https://gencodegenes.org). GENCODE aims to identify all gene features in the human genome using a combination of computational analysis, manual annotation, and experimental validation. The GENCODE release 40 used by this work contains 61,544 genes, and 246,624 transcripts.
+The 3UTRBERT model was pre-trained on human mRNA transcript sequences from [GENCODE](https://gencodegenes.org).
+GENCODE aims to identify all gene features in the human genome using a combination of computational analysis, manual annotation, and experimental validation. The GENCODE release 40 used by this work contains 61,544 genes and 246,624 transcripts.
3UTRBERT collected the human mRNA transcript sequences from GENCODE, including 108,573 unique mRNA transcripts. Only the longest transcript of each gene was used in the pre-training process. 3UTRBERT only used the 3’ untranslated regions (3’UTRs) of the mRNA transcripts for pre-training to avoid codon constrains in the CDS region, and to reduce increased complexity of the entire mRNA transcripts. The average length of the 3’UTRs was 1,227 nucleotides, while the median length was 631 nucleotides. Each 3’UTR sequence was cut to non-overlapping patches of 510 nucleotides. The remaining sequences were padded to the same length.
diff --git a/multimolecule/models/utrbert/convert_checkpoint.py b/multimolecule/models/utrbert/convert_checkpoint.py
index fa2e4745..131771a2 100644
--- a/multimolecule/models/utrbert/convert_checkpoint.py
+++ b/multimolecule/models/utrbert/convert_checkpoint.py
@@ -17,7 +17,6 @@
from __future__ import annotations
import os
-from dataclasses import dataclass
import chanfig
import torch
@@ -96,7 +95,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model, tokenizer_config=tokenizer_config)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str | None = None # type: ignore[assignment]
diff --git a/multimolecule/models/utrlm/convert_checkpoint.py b/multimolecule/models/utrlm/convert_checkpoint.py
index 003234a6..0400d874 100644
--- a/multimolecule/models/utrlm/convert_checkpoint.py
+++ b/multimolecule/models/utrlm/convert_checkpoint.py
@@ -18,7 +18,6 @@
import os
from copy import deepcopy
-from dataclasses import dataclass
import chanfig
import torch
@@ -101,7 +100,6 @@ def convert_checkpoint(convert_config):
save_checkpoint(convert_config, model)
-@dataclass
class ConvertConfig(ConvertConfig_):
root: str = os.path.dirname(__file__)
output_path: str = Config.model_type
diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py
index c6d108d9..9ad5dd51 100644
--- a/tests/data/test_dataset.py
+++ b/tests/data/test_dataset.py
@@ -203,3 +203,22 @@ def test_rna_task_recognition_json(self):
assert dataset.tasks["contact_regression"] == Task(
type=TaskType.Regression, level=TaskLevel.Contact, num_labels=1
)
+
+
+class TestHuggingFaceDataset:
+
+ pretrained = "multimolecule/rna"
+ root = "multimolecule/"
+
+ def test_bprna_spot(self):
+ file = os.path.join(self.root, "bprna-spot")
+ dataset = Dataset(file, split="test", pretrained=self.pretrained, preprocess=True)
+ elem = dataset[0]
+ assert isinstance(elem["sequence"], dl.PNTensor)
+ assert isinstance(elem["secondary_structure"], torch.LongTensor)
+ batch = dataset[list(range(3))]
+ assert isinstance(batch["sequence"], dl.NestedTensor)
+ assert isinstance(batch["secondary_structure"], dl.NestedTensor)
+ assert dataset.tasks["secondary_structure"] == Task("binary", "contact", 1)
+ assert dataset.discrete_map["structural_annotation"] == {"B": 0, "E": 1, "H": 2, "I": 3, "M": 4, "S": 5, "X": 6}
+ assert dataset.discrete_map["functional_annotation"] == {"K": 0, "N": 1}