Skip to content

Commit

Permalink
Merge pull request #169 from ntalluri/update-dataset-label
Browse files: browse the repository at this point in the history
Add Dataset Label Checking
  • Loading branch information
agitter authored Jul 25, 2024
2 parents 98d7e35 + 657f17a commit b4f8d51
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 1 deletion.
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ algorithms:
# Assume that if a dataset label does not change, the lists of associated input files do not change
datasets:
-
# Labels can only contain letters, numbers, or underscores
label: data0
node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
# DataLoader.py can currently only load a single edge file, which is the primary network
Expand Down
2 changes: 1 addition & 1 deletion config/egfr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ datasets:
data_dir: input
edge_files:
- phosphosite-irefindex13.0-uniprot.txt
label: tps-egfr
label: tps_egfr
node_files:
- tps-egfr-prizes.txt
other_files: []
Expand Down
6 changes: 6 additions & 0 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import copy as copy
import itertools as it
import os
import re

import numpy as np
import yaml
Expand Down Expand Up @@ -140,6 +141,11 @@ def process_config(self, raw_config):
# Convert to dicts to simplify the yaml logging
self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]}

for key in self.datasets:
pattern = r'^\w+$'
if not bool(re.match(pattern, key)):
raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
# Maps from the dataset label to the dataset list index
Expand Down
17 changes: 17 additions & 0 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,20 @@ def test_config_container_registry(self):
test_config["container_registry"]["owner"] = ""
config.init_global(test_config)
assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX)

def test_error_dataset_label(self):
    """Check that invalid dataset labels are rejected by ``config.init_global``.

    Each label below contains at least one character that is not a letter,
    number, or underscore, so loading a config that uses it must raise a
    ``ValueError`` whose message identifies the dataset label as the problem.
    """
    # presumably a baseline config that loads without error -- verify against get_test_config
    test_config = get_test_config()
    invalid_labels = ("test$", "@test'", "[test]", "test-test", "✉")

    for invalid_label in invalid_labels:
        # Replace the datasets with a single entry that carries only the label under test
        test_config["datasets"] = [{"label": invalid_label}]
        # `match` pins the failure to the label check; a bare ValueError check would
        # also pass if init_global raised ValueError for some unrelated reason
        with pytest.raises(ValueError, match="Dataset label"):
            config.init_global(test_config)

def test_correct_dataset_label(self):
    """Check that valid dataset labels are accepted by ``config.init_global``.

    Every label below consists only of letters, numbers, and underscores,
    so loading a config that uses it must complete without raising.
    """
    base_config = get_test_config()
    valid_labels = (
        "test",
        "123",
        "test123",
        "123test",
        "_",
        "test_test",
        "_test",
        "test_",
    )

    for valid_label in valid_labels:
        # Single-entry dataset list carrying only the label under test
        base_config["datasets"] = [{"label": valid_label}]
        # The test passes as long as this call does not raise
        config.init_global(base_config)

0 comments on commit b4f8d51

Please sign in to comment.