Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add runner
Browse files Browse the repository at this point in the history
Signed-off-by: Zhiyuan Chen <[email protected]>
ZhiyuanChen committed Dec 16, 2024
1 parent 303d1d9 commit 8ee697a
Showing 19 changed files with 1,118 additions and 1 deletion.
9 changes: 9 additions & 0 deletions docs/docs/data/multitask.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
authors:
- Zhiyuan Chen
date: 2024-05-04
---

# MultiTask

::: multimolecule.data.multitask
9 changes: 9 additions & 0 deletions docs/docs/runners/config.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
authors:
- Zhiyuan Chen
date: 2024-05-04
---

# MultiMoleculeConfig

::: multimolecule.runners.MultiMoleculeConfig
9 changes: 9 additions & 0 deletions docs/docs/runners/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
authors:
- Zhiyuan Chen
date: 2024-05-04
---

# runners

--8<-- "multimolecule/runners/README.md:8:"
9 changes: 9 additions & 0 deletions docs/docs/runners/runner.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
authors:
- Zhiyuan Chen
date: 2024-05-04
---

# MultiMoleculeRunner

::: multimolecule.runners.base_runner.BaseRunner
5 changes: 5 additions & 0 deletions docs/mkdocs.yml
Original file line number Diff line number Diff line change
@@ -9,9 +9,14 @@ repo_url: https://github.com/DLS5-Omics/multimolecule

nav:
- index.md
- runners:
- runners/index.md
- MultiMoleculeRunner: runners/runner.md
- MultiMoleculeConfig: runners/config.md
- data:
- data/index.md
- Dataset: data/dataset.md
- multitask: data/multitask.md
- datasets:
- datasets/index.md
- DNA:
2 changes: 2 additions & 0 deletions multimolecule/__init__.py
Original file line number Diff line number Diff line change
@@ -20,6 +20,7 @@
# <https://multimolecule.danling.org/about/license-faq>.


from .apis import evaluate, infer, train
from .data import Dataset
from .models import (
AutoModelForContactPrediction,
@@ -130,6 +131,7 @@
TokenKMerHead,
TokenPredictionHead,
)
from .runners import MultiMoleculeConfig, MultiMoleculeRunner
from .tasks import Task, TaskLevel, TaskType
from .tokenisers import Alphabet, DnaTokenizer, DotBracketTokenizer, ProteinTokenizer, RnaTokenizer, Tokenizer
from .utils import count_parameters
19 changes: 19 additions & 0 deletions multimolecule/apis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .run import evaluate, infer, train

__all__ = ["train", "evaluate", "infer"]
115 changes: 115 additions & 0 deletions multimolecule/apis/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# mypy: disable-error-code="attr-defined"

import atexit
import os
import warnings
from typing import Type

import danling as dl
import torch

from multimolecule.runners import MultiMoleculeConfig, MultiMoleculeRunner

try:
import nni
except ImportError:
nni = None


def train(
    config: MultiMoleculeConfig = None,  # type: ignore
    runner_cls: Type[MultiMoleculeRunner] = MultiMoleculeRunner,
):
    """Train a model according to ``config``.

    Parses the config (CLI + default config file), configures PyTorch
    precision backends, optionally merges NNI-tuned hyper-parameters, and
    runs ``runner_cls(config).train()``. Result/checkpoint saving is
    registered via :mod:`atexit` so it also happens on abnormal exit.
    """
    if config is None:
        config = MultiMoleculeConfig()
    config = config.parse(default_config="config", no_default_config_action="warn")
    config.interpolate(unsafe_eval=True)
    config.training = True
    if config.allow_tf32:
        # Enable TensorFloat-32 kernels (Ampere+ GPUs).
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.matmul.allow_tf32 = True
    if config.reduced_precision_reduction:
        # Allow reduced-precision accumulation in matmuls.
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
    if config.get("nni", False):
        if nni is None:
            raise ValueError("Unable to retrieve nni parameters, since nni is not installed.")
        config.merge(nni.get_next_parameter())
    with dl.debug(config.get("debug", False)):
        trainer = runner_cls(config)
        # atexit runs hooks in reverse registration order:
        # save_checkpoint, save_result, then print_result.
        for hook in (trainer.print_result, trainer.save_result, trainer.save_checkpoint):
            atexit.register(hook)
        return trainer.train()


def evaluate(
    config: MultiMoleculeConfig = None,  # type: ignore
    runner_cls: Type[MultiMoleculeRunner] = MultiMoleculeRunner,
):
    """Evaluate a trained checkpoint on the ``evaluation`` split of each dataset.

    Requires ``config.checkpoint`` and, for every entry in ``config.datas``,
    an ``evaluation`` path. Returns the metrics produced by
    ``runner.evaluate_epoch("evaluation")``.
    """
    if config is None:
        config = MultiMoleculeConfig.empty()
    config = config.parse(default_config="config", no_default_config_action="warn")
    config.interpolate(unsafe_eval=True)
    config.training = False
    if config.allow_tf32:
        # Enable TensorFloat-32 kernels (Ampere+ GPUs).
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.matmul.allow_tf32 = True
    if config.reduced_precision_reduction:
        # Allow reduced-precision accumulation in matmuls.
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
    if "checkpoint" not in config or not isinstance(config.checkpoint, str):
        raise RuntimeError("Please specify `checkpoint` to run evaluate")
    for name, data in config.datas.items():
        # BUG FIX: the original read `data.evaluate`, an attribute that is
        # never set; the key being validated here is `evaluation`.
        if "evaluation" not in data or not isinstance(data.evaluation, str):
            raise RuntimeError(f"Please specify `evaluation` to run evaluate in datas.{name}")
    runner = runner_cls(config)
    result = runner.evaluate_epoch("evaluation")
    print(result)
    return result


def infer(
    config: MultiMoleculeConfig = None,  # type: ignore
    runner_cls: Type[MultiMoleculeRunner] = MultiMoleculeRunner,
):
    """Run inference with a trained checkpoint and save the predictions.

    Requires ``config.checkpoint`` and, for every entry in ``config.datas``,
    an ``inference`` path. Predictions are written to ``config.result_path``
    (defaults to ``./result.json`` with a warning) and returned.
    """
    if config is None:
        config = MultiMoleculeConfig.empty()
    config = config.parse(default_config="config", no_default_config_action="warn")
    config.interpolate(unsafe_eval=True)
    config.training = False
    if config.allow_tf32:
        # Enable TensorFloat-32 kernels (Ampere+ GPUs).
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.matmul.allow_tf32 = True
    if config.reduced_precision_reduction:
        # Allow reduced-precision accumulation in matmuls.
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
    if "checkpoint" not in config or not isinstance(config.checkpoint, str):
        raise RuntimeError("Please specify `checkpoint` to run infer.")
    for name, data in config.datas.items():
        if "inference" not in data or not isinstance(data.inference, str):
            raise RuntimeError(f"Please specify `inference` to run infer in datas.{name}")
    if "result_path" not in config or not isinstance(config.result_path, str):
        config.result_path = os.path.join(os.getcwd(), "result.json")
        warnings.warn("`result_path` is not specified, default to `result.json`.", RuntimeWarning, stacklevel=2)
    inference_runner = runner_cls(config)
    predictions = inference_runner.infer()
    inference_runner.save(predictions, config.result_path)
    return predictions
99 changes: 99 additions & 0 deletions multimolecule/apis/stat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import shutil
from statistics import mean
from typing import List

import chanfig
import pandas as pd
from chanfig import NestedDict
from tqdm import tqdm


class Result(NestedDict):
    """Summary of one experiment run, assembled from ``best.json`` and ``trainer.yaml``."""

    pretrained: str  # pretrained model identifier (last path component)
    id: str  # run id taken from best.json
    seed: int  # random seed of the run
    epoch: int  # epoch index of the best checkpoint (best.json `index`)
    validation: NestedDict  # validation metrics, formatted to 8 decimal places
    test: NestedDict  # test metrics, formatted to 8 decimal places


def get_result_stat(experiment_root: str, remove_empty: bool = True) -> List[Result]:
    """Collect per-run results found under ``experiment_root``.

    Walks the tree looking for run directories (identified by ``run.log``),
    reads ``best.json`` and ``trainer.yaml``, and builds a sorted list of
    :class:`Result`. When ``remove_empty`` is true, run directories without
    usable results are deleted, and empty directories are pruned afterwards.
    """

    def _fmt(value):
        # Lists of per-step values are averaged before formatting.
        return format(mean(value) if isinstance(value, list) else value, ".8f")

    collected: List[Result] = []
    for run_dir, _, files in tqdm(os.walk(experiment_root)):
        if "run.log" not in files:
            continue
        if "best.json" not in files:
            if remove_empty:
                shutil.rmtree(run_dir)
            continue
        best = NestedDict.from_json(os.path.join(run_dir, "best.json"))
        if "index" not in best:
            if remove_empty:
                shutil.rmtree(run_dir)
            continue
        trainer_config = NestedDict.from_yaml(os.path.join(run_dir, "trainer.yaml"))
        entry = Result(
            id=best.id,
            pretrained=trainer_config.pretrained.split("/")[-1],
            seed=trainer_config.seed,
        )
        entry.validation = NestedDict({k: _fmt(v) for k, v in best.validation.all_items()})
        entry.test = NestedDict({k: _fmt(v) for k, v in best.test.all_items()})
        entry.epoch = best.index
        # Bookkeeping values are not interesting in the summary table.
        for split in ("validation", "test"):
            for field in ("time", "loss", "lr"):
                entry.pop(f"{split}.{field}", None)
        collected.append(entry)
    if remove_empty:
        # Two passes: removing a leaf directory can leave its parent empty.
        for _ in range(2):
            for run_dir, dirs, files in os.walk(experiment_root):
                if not files and not dirs:
                    os.rmdir(run_dir)
    collected.sort(key=lambda x: (x.pretrained, x.seed, x.id))
    return collected


def write_result_stat(results: List[Result], path: str):
results = [dict(result.all_items()) for result in results] # type: ignore[misc]
df = pd.DataFrame.from_dict(results)
df.insert(len(df.keys()) - 1, "comment", "")
df.fillna("")
df.to_csv(path, index=False)


class Config(chanfig.Config):
    """Command-line options for the result aggregation script."""

    experiment_root: str = "experiments"  # directory tree to scan for runs
    out_path: str = "result.csv"  # destination CSV file
    remove_empty: bool = True  # delete run directories with no usable results


if __name__ == "__main__":
    # Aggregate all experiment results under `experiment_root` into a CSV.
    config = Config().parse()
    stats = get_result_stat(config.experiment_root, config.remove_empty)
    if not stats:
        raise ValueError("No results found")
    write_result_stat(stats, config.out_path)
4 changes: 4 additions & 0 deletions multimolecule/data/__init__.py
Original file line number Diff line number Diff line change
@@ -20,9 +20,13 @@
# https://multimolecule.danling.org/about/license-faq

from .dataset import Dataset
from .multitask import DistributedMultiTaskSampler, MultiTaskDataset, MultiTaskSampler
from .utils import no_collate

__all__ = [
"Dataset",
"MultiTaskDataset",
"MultiTaskSampler",
"DistributedMultiTaskSampler",
"no_collate",
]
246 changes: 246 additions & 0 deletions multimolecule/data/multitask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from bisect import bisect_right
from collections.abc import Iterator, Mapping, Sequence
from copy import deepcopy
from random import choices

import torch
from chanfig import NestedDict
from torch import distributed as dist
from torch.utils import data

from .dataset import Dataset


class MultiTaskDataset(data.ConcatDataset):
    """Concatenation of several named datasets for multi-task training.

    The mapping's keys name each task/dataset; indexing follows
    :class:`torch.utils.data.ConcatDataset` semantics over the values.
    """

    datasets: Mapping  # name -> Dataset, in insertion order
    dataset_keys: Sequence[str]  # dataset names, aligned with dataset_values
    dataset_values: Sequence[Dataset]  # datasets, aligned with dataset_keys

    def __init__(self, datasets: Mapping) -> None:
        # Validate eagerly so a bad entry fails at construction time.
        for key, dataset in datasets.items():
            if not isinstance(dataset, Dataset):
                raise TypeError(f"Dataset {key} should be an instance of Dataset")
        self.datasets = datasets
        if not len(self.datasets) > 0:
            raise ValueError("MultiTaskDataset should contain at least one dataset")
        self.dataset_keys, self.dataset_values = zip(*self.datasets.items())
        # cumsum is inherited from ConcatDataset; cumulative_sizes[i] is the
        # combined length of the first i + 1 datasets.
        self.cumulative_sizes = self.cumsum(self.dataset_values)

    def __getitems__(self, key: Sequence[int]) -> Mapping:
        # Batched fetch used by DataLoader. Assumes every index in `key`
        # belongs to the same sub-dataset (guaranteed by MultiTaskSampler),
        # so only key[0] is used to locate the dataset.
        dataset_idx = bisect_right(self.cumulative_sizes, key[0])
        if dataset_idx == 0:
            sample_idx = key
        else:
            # Translate global indices back to the sub-dataset's index space.
            sample_idx = [i - self.cumulative_sizes[dataset_idx - 1] for i in key]
        batch = self.dataset_values[dataset_idx][sample_idx]
        # Tag the batch with its source dataset so downstream code can route it.
        batch["dataset"] = self.dataset_keys[dataset_idx]
        return batch

    @property
    def tasks(self) -> NestedDict:
        """Union of tasks across all datasets; same-named tasks must agree."""
        tasks = NestedDict()
        for dataset in self.dataset_values:
            for n, t in dataset.tasks.items():
                if n not in tasks:
                    tasks[n] = t
                elif tasks[n] != t:
                    raise ValueError(f"Task {n} has different configurations across datasets")
        return tasks

    @property
    def dataset_tasks(self) -> NestedDict:
        """Tasks of each dataset, keyed by dataset name."""
        return NestedDict({k: v.tasks for k, v in self.datasets.items()})

    def __repr__(self) -> str:
        # NOTE(review): iterating a Mapping yields its keys, so this renders
        # the dataset *names*, not the datasets themselves — confirm intended.
        return f"MultiTaskDataset({', '.join([str(d) for d in self.datasets])})"


class MultiTaskSampler(data.BatchSampler):
    r"""
    Ensure all items in a batch come from the same dataset.

    Arguments:
        dataset (MultiTaskDataset): The multi-task dataset to sample from.
        batch_size (int): Size of mini-batch.
        shuffle (bool): If ``True``, each batch is drawn from a randomly
            chosen dataset (weighted by ``weights``); otherwise datasets are
            consumed sequentially.
        drop_last (bool): If ``True``, the sampler will drop the last batch if
            its size would be less than ``batch_size``
        sampler_cls (type | None): Per-dataset sampler class; defaults to
            ``RandomSampler`` when shuffling, else ``SequentialSampler``.
        weights (list[int] | None): Per-dataset sampling weights; defaults to
            the dataset sizes.
    """

    datasets: Sequence[Dataset]

    def __init__(  # pylint: disable=super-init-not-called
        self,
        dataset: MultiTaskDataset,
        batch_size: int,
        shuffle: bool = True,
        drop_last: bool = False,
        sampler_cls: type[data.Sampler] | None = None,
        weights: list[int] | None = None,
    ) -> None:
        self.datasets = dataset.dataset_values
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.shuffle = shuffle
        if sampler_cls is None:
            sampler_cls = data.RandomSampler if shuffle else data.SequentialSampler
        self.samplers = [sampler_cls(d) for d in self.datasets]  # type: ignore
        self.dataset_sizes = [len(d) for d in self.datasets]  # type: ignore
        self.cumulative_sizes = dataset.cumulative_sizes
        self.num_datasets = len(self.datasets)
        # Default to size-proportional sampling when no weights are given.
        self.weights = weights if weights is not None else self.dataset_sizes

    def __iter__(self):
        # Pair each sampler with its original position so cumulative-size
        # offsets stay valid after exhausted samplers are popped.
        sampler_iters = [(i, iter(s)) for i, s in enumerate(self.samplers)]
        sampler_weights = deepcopy(self.weights)
        sampler_idx = 0
        # Implemented based on the benchmarking in https://github.com/pytorch/pytorch/pull/76951
        if self.drop_last:
            while sampler_iters:
                if self.shuffle:
                    # Choose the dataset for this batch, weighted by remaining weights.
                    sampler_idx = choices(range(len(sampler_iters)), weights=sampler_weights)[0]
                sampler_id, sampler_iter = sampler_iters[sampler_idx]
                # Offset local sample indices into the concatenated index space.
                cumulative_size = self.cumulative_sizes[sampler_id - 1] if sampler_id > 0 else 0
                try:
                    batch = [next(sampler_iter) + cumulative_size for _ in range(self.batch_size)]
                    yield batch
                except StopIteration:
                    # Dataset exhausted: discard the partial batch and retire it.
                    sampler_iters.pop(sampler_idx)
                    sampler_weights.pop(sampler_idx)
        else:
            while sampler_iters:
                if self.shuffle:
                    sampler_idx = choices(range(len(sampler_iters)), weights=sampler_weights)[0]
                sampler_id, sampler_iter = sampler_iters[sampler_idx]
                cumulative_size = self.cumulative_sizes[sampler_id - 1] if sampler_id > 0 else 0
                batch = [0] * self.batch_size
                idx_in_batch = 0
                try:
                    for _ in range(self.batch_size):
                        batch[idx_in_batch] = next(sampler_iter) + cumulative_size
                        idx_in_batch += 1
                    yield batch
                    idx_in_batch = 0  # noqa: SIM113
                    batch = [0] * self.batch_size
                except StopIteration:
                    sampler_iters.pop(sampler_idx)
                    sampler_weights.pop(sampler_idx)
                if idx_in_batch > 0:
                    # Flush the partial batch left behind by an exhausted dataset.
                    yield batch[:idx_in_batch]

    def __len__(self):
        batch_size = self.batch_size
        if self.drop_last:
            return sum(len(d) // batch_size for d in self.datasets)
        # Ceiling division: partial batches count when drop_last is False.
        return sum((len(d) + batch_size - 1) // batch_size for d in self.datasets)


class DistributedMultiTaskSampler(MultiTaskSampler):  # pylint: disable=too-few-public-methods
    r"""
    Distributed version of MultiTaskSampler, which ensures that all GPUs sample data from the
    same sub-dataset in each step without requiring additional communication.
    The dataset selection is based on a random seed mechanism that is synchronized across epochs.

    See Also:
        [MultiTaskSampler][MultiTaskSampler]
    """

    def __init__(
        self,
        dataset: MultiTaskDataset,
        batch_size: int,
        shuffle: bool = True,
        drop_last: bool = False,
        sampler_cls: type[data.Sampler] = data.RandomSampler,
        weights: list[int] | None = None,
        seed: int = 0,
    ) -> None:
        super().__init__(dataset, batch_size, shuffle, drop_last, sampler_cls, weights)
        # Replace the per-dataset samplers with DistributedSamplers so each
        # rank sees a disjoint shard of every sub-dataset.
        self.samplers = [data.DistributedSampler(d, shuffle=shuffle, drop_last=drop_last) for d in self.datasets]
        self.seed = seed
        self.epoch = 0

    def set_epoch(self, epoch: int):
        """
        Sets the epoch for deterministic shuffling.
        """
        self.epoch = epoch
        for sampler in self.samplers:
            sampler.set_epoch(epoch)

    def _get_sampler_idx(self, high: int) -> int:
        """
        Determines which sampler (i.e., sub-dataset) to use based on the seed and epoch.
        """
        # NOTE(review): the generator is reseeded with seed + epoch on every
        # call, so within one epoch this returns the same index until `high`
        # shrinks (a sampler is exhausted). Sub-datasets are therefore drained
        # one at a time per epoch rather than interleaved — confirm intended.
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        sampler_idx = torch.randint(low=0, high=high, size=(1,), generator=g).item()
        return sampler_idx

    def __iter__(self) -> Iterator:
        sampler_iters = [(i, iter(s)) for i, s in enumerate(self.samplers)]
        sampler_weights = deepcopy(self.weights)

        if self.drop_last:
            while sampler_iters:
                # Sample the same sub-dataset across all GPUs using the seeded index
                sampler_idx = self._get_sampler_idx(len(sampler_iters))
                sampler_id, sampler_iter = sampler_iters[sampler_idx]
                # Offset local sample indices into the concatenated index space.
                cumulative_size = self.cumulative_sizes[sampler_id - 1] if sampler_id > 0 else 0
                try:
                    batch = [next(sampler_iter) + cumulative_size for _ in range(self.batch_size)]
                    yield batch
                except StopIteration:
                    # Dataset exhausted: discard the partial batch and retire it.
                    sampler_iters.pop(sampler_idx)
                    sampler_weights.pop(sampler_idx)
        else:
            while sampler_iters:
                # Sample the same sub-dataset across all GPUs using the seeded index
                sampler_idx = self._get_sampler_idx(len(sampler_iters))
                sampler_id, sampler_iter = sampler_iters[sampler_idx]
                cumulative_size = self.cumulative_sizes[sampler_id - 1] if sampler_id > 0 else 0
                batch = [0] * self.batch_size
                idx_in_batch = 0
                try:
                    for _ in range(self.batch_size):
                        batch[idx_in_batch] = next(sampler_iter) + cumulative_size
                        idx_in_batch += 1
                    yield batch
                    idx_in_batch = 0  # noqa: SIM113
                    batch = [0] * self.batch_size
                except StopIteration:
                    sampler_iters.pop(sampler_idx)
                    sampler_weights.pop(sampler_idx)
                if idx_in_batch > 0:
                    # Flush the partial batch left behind by an exhausted dataset.
                    yield batch[:idx_in_batch]

    def __len__(self) -> int:
        # Each rank processes batch_size items per step while all ranks advance
        # through the same sub-dataset, hence the world_size factor.
        batch_size = self.batch_size * self.world_size
        if self.drop_last:
            return sum(len(d) // batch_size for d in self.datasets)
        return sum((len(d) + batch_size - 1) // batch_size for d in self.datasets)

    @property
    def world_size(self) -> int:
        r"""Return the number of processes in the current process group."""
        if dist.is_available() and dist.is_initialized():
            return dist.get_world_size()
        return 1
9 changes: 9 additions & 0 deletions multimolecule/runners/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
authors:
- Zhiyuan Chen
date: 2024-05-04
---

# runners

`runners` provide an easy-to-use interface for running experiments.
20 changes: 20 additions & 0 deletions multimolecule/runners/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .config import MultiMoleculeConfig
from .runner import MultiMoleculeRunner

__all__ = ["MultiMoleculeConfig", "MultiMoleculeRunner"]
357 changes: 357 additions & 0 deletions multimolecule/runners/base_runner.py

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions multimolecule/runners/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import os
from pathlib import Path
from typing import List

from chanfig import Config
from transformers import PretrainedConfig


class DataConfig(Config):
    """Configuration for a single dataset."""

    root: str = "."  # base directory for the split paths below
    train: str | None  # path to the training split, if any
    validation: str | None  # path to the validation split, if any
    test: str | None  # path to the test split, if any
    feature_cols: List | None = None  # input columns; None presumably means auto-detect — confirm
    label_cols: List | None = None  # label columns; None presumably means auto-detect — confirm
    truncation: bool = True  # whether to truncate over-long sequences


class OptimConfig(Config):
    """Optimizer settings."""

    name: str = "AdamW"  # optimizer class name
    lr: float = 1e-3  # base learning rate
    weight_decay: float = 1e-2
    pretrained_ratio: float = 1e-2  # presumably an lr/decay multiplier for pretrained parameters — confirm


class EmaConfig(Config):
    """Exponential moving average of model weights."""

    enabled: bool = False  # EMA is off by default
    beta: float = 0.999  # decay rate
    update_after_step: int = 0  # first step at which EMA updates begin
    update_every: int = 10  # update the EMA every N steps


class MultiMoleculeConfig(Config):
    """Top-level configuration for MultiMolecule experiments.

    Built on chanfig's ``Config``; ``post`` is the chanfig hook that runs
    after parsing to validate and derive fields.
    """

    name: str  # experiment name, derived in post() via get_name()
    seed: int = 1016  # global random seed

    balance: str = "ew"  # multi-task balancing strategy; presumably "ew" = equal weighting — confirm
    platform: str = "torch"  # execution backend: torch / deepspeed / accelerate
    training: bool = True  # toggled off by the evaluate/infer entry points

    pretrained: str | None  # pretrained model name or filesystem path
    use_pretrained: bool = True  # whether to load pretrained weights
    transformers: PretrainedConfig  # pass-through HuggingFace model config
    epoch_end: int = 20  # number of training epochs

    data: DataConfig  # single-dataset config; mutually exclusive with `datas`

    tensorboard: bool = True  # enable TensorBoard logging
    save_interval: int = 10  # checkpoint every N epochs

    art: bool = True
    allow_tf32: bool = True  # enable TensorFloat-32 kernels
    reduced_precision_reduction: bool = False  # allow fp16/bf16 matmul reductions

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # `datas` auto-creates a DataConfig per key for multi-dataset runs.
        self.datas = Config(default_factory=DataConfig)
        self.dataloader.batch_size = 32
        self.optim = OptimConfig()
        self.ema = EmaConfig()
        self.sched.final_lr = 0

    def post(self):
        # Chanfig post-parse hook: validate mutually exclusive fields and
        # propagate derived values into the network config.
        if "pretrained" not in self and "checkpoint" not in self:
            raise ValueError("Either one of `pretrained` or `checkpoint` must be specified")
        if "data" in self:
            if self.datas:
                raise ValueError("Only one of `data` or `datas` can be specified, but not both")
            del self.datas
        if "pretrained" in self:
            self["network.backbone.sequence.name"] = self.get("pretrained")
            self.name = self.get_name()
        self["network.backbone.sequence.use_pretrained"] = self.use_pretrained

    def get_name(self) -> str:
        """Derive a run name from the pretrained model, optimizer and seed."""
        pretrained = self.get("pretrained")
        # NOTE(review): assumes `pretrained` is set — os.path.exists(None)
        # raises TypeError. post() only calls this when the key is present.
        if os.path.exists(pretrained):
            path = Path(pretrained)
            if os.path.isfile(pretrained):
                # e.g. models/foo/config.json -> foo/config
                pretrained = str(path.relative_to(path.parents[1]).with_suffix(""))
            else:
                pretrained = path.stem
        name = pretrained.replace("/", "--")
        if "optim" in self:
            optim_name = self.optim.get("name", "no")
            name += f"-{self.optim.lr}@{optim_name}"
        return name + f"-{self.seed}"
37 changes: 37 additions & 0 deletions multimolecule/runners/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from chanfig import Registry as Registry_
from danling.metrics import binary_metrics, multiclass_metrics, multilabel_metrics, regression_metrics


class Registry(Registry_):
    """Metric-factory registry that maps a task type to its metric builder."""

    # BUG FIX: the annotation is quoted because this module does not use
    # `from __future__ import annotations` (unlike its siblings); a bare
    # `int | None` is evaluated at class-creation time and fails on
    # Python < 3.10.
    def build(self, type, num_labels: "int | None" = None, **kwargs):
        """Instantiate the metric collection registered for ``type``.

        ``num_labels`` is forwarded under the keyword each factory expects
        (num_labels / num_classes / num_outputs); other task types ignore it.
        Note: the ``type`` parameter shadows the builtin but is kept for
        interface compatibility with existing keyword callers.
        """
        if type == "multilabel":
            return self.init(self.lookup(type), num_labels=num_labels, **kwargs)
        if type == "multiclass":
            return self.init(self.lookup(type), num_classes=num_labels, **kwargs)
        if type == "regression":
            return self.init(self.lookup(type), num_outputs=num_labels, **kwargs)
        return self.init(self.lookup(type), **kwargs)


# Global registry mapping a task "type" string to its danling metric factory.
MetricRegistry = Registry(key="type")
MetricRegistry.register(binary_metrics, "binary")
MetricRegistry.register(multiclass_metrics, "multiclass")
MetricRegistry.register(multilabel_metrics, "multilabel")
MetricRegistry.register(regression_metrics, "regression")
42 changes: 42 additions & 0 deletions multimolecule/runners/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import danling as dl

from .base_runner import BaseRunner


class MultiMoleculeRunner(type):
    """Factory that instantiates the runner matching ``config.platform``.

    Calling ``MultiMoleculeRunner(config)`` returns a ``TorchRunner``,
    ``DeepSpeedRunner``, or ``AccelerateRunner`` instance; the platform
    defaults to ``"torch"`` when unset.
    """

    def __new__(cls, config):
        platform = config.get("platform", "torch")
        if platform == "torch":
            return TorchRunner(config)
        if platform == "deepspeed":
            return DeepSpeedRunner(config)
        if platform == "accelerate":
            return AccelerateRunner(config)
        raise ValueError(f"Unsupported platform: {platform}")


class TorchRunner(BaseRunner, dl.TorchRunner):
    """Runner backed by the native PyTorch platform."""

    pass


class DeepSpeedRunner(BaseRunner, dl.DeepSpeedRunner):
    """Runner backed by the DeepSpeed platform."""

    pass


class AccelerateRunner(BaseRunner, dl.AccelerateRunner):
    """Runner backed by the HuggingFace Accelerate platform."""

    pass
20 changes: 20 additions & 0 deletions multimolecule/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# MultiMolecule
# Copyright (C) 2024-Present MultiMolecule

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .apis import train

if __name__ == "__main__":
    # Entry point: run as `python -m multimolecule.train`.
    # NOTE(review): the relative import above means executing this file
    # directly (`python multimolecule/train.py`) raises ImportError — confirm
    # the module-style invocation is the intended usage.
    train()
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -45,8 +45,9 @@ dynamic = [
]
dependencies = [
"accelerate",
"art",
"chanfig>=0.0.105",
"danling[torch]>=0.3.11",
"danling[torch]>=0.4.0b1",
"datasets",
'StrEnum; python_version < "3.11"',
"torch",

0 comments on commit 8ee697a

Please sign in to comment.