diff --git a/BLRunner.py b/BLRunner.py index e958d5be..4d58ff41 100644 --- a/BLRunner.py +++ b/BLRunner.py @@ -22,6 +22,7 @@ from BLRun.runner import Runner import os import pandas as pd +import time import BLRun as br yaml.warnings({'YAMLLoadWarning': False}) @@ -57,8 +58,9 @@ def main(): with open(config_file, 'r') as conf: evaluation = br.ConfigParser.parse(conf) - print(evaluation) - print('Evaluation started') + # print(evaluation) + start_time = time.process_time() + print('Execution of algorithms started') for idx in range(len(evaluation.runners)): @@ -70,7 +72,8 @@ def main(): for idx in range(len(evaluation.runners)): evaluation.runners[idx].parseOutput() - print('Evaluation complete') + end_time = time.process_time() + print(f'Execution of algorithms completed in {end_time-start_time:0.2f} seconds') if __name__ == '__main__': diff --git a/config-files/Quickstart/Curated/GSD-quickstart.yaml b/config-files/Quickstart/Curated/GSD-quickstart.yaml new file mode 100644 index 00000000..c0176c17 --- /dev/null +++ b/config-files/Quickstart/Curated/GSD-quickstart.yaml @@ -0,0 +1,169 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Curated/GSD" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # + # cellData: a file containing pseudotime ordering + # + datasets: + - name: "GSD-2000-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "GSD-2000-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "GSD-2000-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "GSD-2000-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "GSD-2000-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + + - name: "SCODE" + params: + should_run: [True] + z: [2] + nIter: [100] + nRep: [5] + + - name: "SCNS" + params: + should_run: [False] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [6] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.33 + maxLag: [0.1] + + + - name: "GRISLI" + params: + should_run: [False] + L: [10] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [15] + num_lags: [5] + kernel_width: [0.5] + prob_zero_removal: [0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [6] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,25,50,75,100"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "GSD" diff --git a/config-files/Quickstart/Curated/HSC-quickstart.yaml b/config-files/Quickstart/Curated/HSC-quickstart.yaml new file mode 100644 index 00000000..73013680 --- /dev/null +++ b/config-files/Quickstart/Curated/HSC-quickstart.yaml @@ -0,0 +1,170 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Curated/HSC" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # + # cellData: a file containing pseudotime ordering + # + datasets: + + - name: "HSC-2000-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "HSC-2000-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "HSC-2000-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "HSC-2000-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "HSC-2000-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + + - name: "SCODE" + params: + should_run: [True] + z: [2] + nIter: [100] + nRep: [5] + + - name: "SCNS" + params: + should_run: [False] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [20] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.33 + maxLag: [0.05] + + + - name: "GRISLI" + params: + should_run: [False] + L: [5] + R: [1500] + alphaMin: [0.25] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [3] + num_lags: [5] + kernel_width: [1] + prob_zero_removal: [0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "HSC" diff --git a/config-files/Quickstart/Curated/VSC-quickstart.yaml b/config-files/Quickstart/Curated/VSC-quickstart.yaml new file mode 100644 index 00000000..a7b5721d --- /dev/null +++ b/config-files/Quickstart/Curated/VSC-quickstart.yaml @@ -0,0 +1,171 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Curated/VSC" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # + # cellData: a file containing pseudotime ordering + # + datasets: + + - name: "VSC-2000-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "VSC-2000-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "VSC-2000-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "VSC-2000-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "VSC-2000-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + + - name: "SCODE" + params: + should_run: [True] + z: [10] + nIter: [100] + nRep: [5] + + - name: "SCNS" + params: + should_run: [False] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [5] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.33 + maxLag: [0.33] + + + - name: "GRISLI" + params: + should_run: [False] + L: [10] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [15] + num_lags: [5] + kernel_width: [0.5] + prob_zero_removal: [0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,10,15,20,25"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "VSC" diff --git a/config-files/Quickstart/Curated/mCAD-quickstart.yaml b/config-files/Quickstart/Curated/mCAD-quickstart.yaml new file mode 100644 index 00000000..1b163e0b --- /dev/null +++ b/config-files/Quickstart/Curated/mCAD-quickstart.yaml @@ -0,0 +1,170 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Curated/mCAD" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # + # cellData: a file containing pseudotime ordering + # + datasets: + + - name: "mCAD-2000-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "mCAD-2000-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "mCAD-2000-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "mCAD-2000-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "mCAD-2000-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + + - name: "SCODE" + params: + should_run: [True] + z: [6] + nIter: [100] + nRep: [5] + + - name: "SCNS" + params: + should_run: [False] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [10] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.33 + maxLag: [0.3] + + + - name: "GRISLI" + params: + should_run: [False] + L: [100] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.0] + dT: [5] + num_lags: [9] + kernel_width: [0.5] + prob_zero_removal: [0] + prob_remove_samples: [0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,25,50,75,100"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "mCAD" diff --git a/config-files/Quickstart/Synthetic/dyn-BF-quickstart.yaml b/config-files/Quickstart/Synthetic/dyn-BF-quickstart.yaml new file mode 100644 index 00000000..dbf0f35a --- /dev/null +++ b/config-files/Quickstart/Synthetic/dyn-BF-quickstart.yaml @@ -0,0 +1,169 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Synthetic/dyn-BF" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # cellData: a file containing pseudotime ordering + # trueEdges: a file containing reference network for evaluation + datasets: + - name: "dyn-BF-100-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BF-100-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BF-100-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BF-100-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BF-100-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + - name: "SCNS" + params: + should_run: [False] + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + - name: "SCODE" + params: + should_run: [True] + z: [4] + nIter: [100] + nRep: [5] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [15] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.3 + # but it is very slow with 0.3 + # when we have more than 1000 cells. + maxLag: [0.10] + + - name: "GRISLI" + params: + should_run: [False] + L: [5] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [5] + num_lags: [15] + kernel_width: [0.5] + prob_zero_removal: [0.0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,10,20,25"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "dyn-BF" diff --git a/config-files/Quickstart/Synthetic/dyn-BFC-quickstart.yaml b/config-files/Quickstart/Synthetic/dyn-BFC-quickstart.yaml new file mode 100644 index 00000000..b2268fc1 --- /dev/null +++ b/config-files/Quickstart/Synthetic/dyn-BFC-quickstart.yaml @@ -0,0 +1,168 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Synthetic/dyn-BFC" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # cellData: a file containing pseudotime ordering + # trueEdges: a file containing reference network for evaluation + datasets: + - name: "dyn-BFC-100-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BFC-100-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BFC-100-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BFC-100-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-BFC-100-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + - name: "SCNS" + params: + should_run: [False] + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + - name: "SCODE" + params: + should_run: [True] + z: [4] + nIter: [100] + nRep: [5] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [15] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.3 + # but it is very slow with 0.3 + # when we have more than 1000 cells. + maxLag: [0.10] + + - name: "GRISLI" + params: + should_run: [False] + L: [5] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [5] + num_lags: [15] + kernel_width: [0.5] + prob_zero_removal: [0.0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,10,20,25"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "dyn-BFC" diff --git a/config-files/Quickstart/Synthetic/dyn-LI-quickstart.yaml b/config-files/Quickstart/Synthetic/dyn-LI-quickstart.yaml new file mode 100644 index 00000000..9ee0e4cf --- /dev/null +++ b/config-files/Quickstart/Synthetic/dyn-LI-quickstart.yaml @@ -0,0 +1,169 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Synthetic/dyn-LI" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # cellData: a file containing pseudotime ordering + # trueEdges: a file containing reference network for evaluation + datasets: + - name: "dyn-LI-100-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-LI-100-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-LI-100-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-LI-100-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-LI-100-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + + - name: "SCNS" + params: + should_run: [False] + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + + - name: "SCODE" + params: + should_run: [True] + z: [4] + nIter: [100] + nRep: [5] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [15] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.3 + # but it is very slow with 0.3 + # when we have more than 1000 cells. + maxLag: [0.10] + + - name: "GRISLI" + params: + should_run: [False] + L: [5] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [5] + num_lags: [15] + kernel_width: [0.5] + prob_zero_removal: [0.0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,10,20,25"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "dyn-LI" diff --git a/config-files/Quickstart/Synthetic/dyn-TF-quickstart.yaml b/config-files/Quickstart/Synthetic/dyn-TF-quickstart.yaml new file mode 100644 index 00000000..7be64716 --- /dev/null +++ b/config-files/Quickstart/Synthetic/dyn-TF-quickstart.yaml @@ -0,0 +1,169 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "Synthetic/dyn-TF" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. 
May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # cellData: a file containing pseudotime ordering + # trueEdges: a file containing reference network for evaluation + datasets: + - name: "dyn-TF-100-1" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-TF-100-2" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-TF-100-3" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-TF-100-4" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + - name: "dyn-TF-100-5" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm. 
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + - name: "SCNS" + params: + should_run: [False] + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # Used in parsing output + pVal: [0.01] + + - name: "SCODE" + params: + should_run: [True] + z: [4] + nIter: [100] + nRep: [5] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [15] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.3 + # but it is very slow with 0.3 + # when we have more than 1000 cells. + maxLag: [0.10] + + - name: "GRISLI" + params: + should_run: [False] + L: [5] + R: [1500] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [5] + num_lags: [15] + kernel_width: [0.5] + prob_zero_removal: [0.0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [2] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5,10,20,25"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "dyn-TF" diff --git a/config-files/Quickstart/example-quickstart.yaml b/config-files/Quickstart/example-quickstart.yaml new file mode 100644 index 00000000..372c6a01 --- /dev/null +++ b/config-files/Quickstart/example-quickstart.yaml @@ -0,0 +1,143 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "example" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression data file. Cells are along the + # columns and genes are along the rows. + # cellData: a file containing pseudotime ordering, or any other + # information about cells. + # trueEdges: Name of the reference network file in the + # edge list format. Needed for evaluation. + datasets: + - name: "GSD" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm.
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + + + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # p-value cutoff + # Used in parsing output + pVal: [0.01] + + + - name: "SCODE" + params: + should_run: [True] + z: [10] + nIter: [1000] + nRep: [6] + + - name: "SCNS" + params: + should_run: [False] + + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [10] + + + - name: "LEAP" + params: + should_run: [False] + # Default maxLag value is 0.33 + maxLag: [0.33] + + + - name: "GRISLI" + params: + should_run: [False] + L: [10] + R: [3000] + alphaMin: [0.0] + + + - name: "SINGE" + params: + should_run: [False] + lambda: [0.01] + dT: [15] + num_lags: [5] + kernel_width: [0.5] + prob_zero_removal: [0] + prob_remove_samples: [0.0] + family: ["gaussian"] + num_replicates: [6] + + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['ucRDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "GSD" diff --git a/config-files/Quickstart/scRNA-seq/mTuck.yaml b/config-files/Quickstart/scRNA-seq/mTuck.yaml new file mode 100644 index 00000000..949b1b93 --- /dev/null +++ b/config-files/Quickstart/scRNA-seq/mTuck.yaml @@ -0,0 +1,150 @@ +# Input Settings: initialize base input folder names, +# dataset collections, and algorithms to run over +input_settings: + + # Base input directory + input_dir : "inputs" + + # Subdirectory of inputs that datasets are placed in + dataset_dir: "scRNA-Seq" + + # Denotes a list of datasets, each with the following parameters: + # name: Name of the dataset. May be used in logging or other + # messages written during execution + # + # exprData: scRNA-Seq expression file name + # + # cellData: a file containing pseudotime ordering + # + datasets: + - name: "mTuck" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" + + # Denotes a list of algorithms to run. Each has the following parameters: + # name: Name of the algorithm.
Must be recognized by the pipeline, see + # code for acceptable values + # + # should_run: whether or not to run the algorithm + # + # params: any additional, algorithm-specific parameters + # should be specified in the params map for a given algorithm + # + algorithms: + - name: "PIDC" + params: + should_run: [True] + + + - name: "GRNVBEM" + params: + should_run: [False] + + + + - name: "GENIE3" + params: + should_run: [False] + + + + - name: "GRNBOOST2" + params: + should_run: [True] + + + - name: "PPCOR" + params: + should_run: [False] + # p-value cutoff + # Used in parsing output + pVal: [0.01] + + + - name: "SCNS" + params: + should_run: [False] + + + - name: "LEAP" + params: + should_run: [False] + maxLag: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.33] + + - name: "SINCERITIES" + params: + should_run: [True] + nBins: [5,6] + #nBins: [5,6] + + - name: "SCRIBE" + params: + should_run: [False] + ### required parameters + # a list of delay values + delay: ["5","5,10", "5,10,20,25","5,25,50,75,100"] + # any of 'RDI', 'uRDI', 'cRDI', or 'ucRDI' + method: ['RDI'] + # lower detection limit (expression below this + # will be treated as zero. 
+ lowerDetectionLimit: [0] + # expressionFamily: for synthetic data use uninormal + # for mRNA count data use negbinomial.size() + expressionFamily: ['uninormal'] + ### optional but recommended parameters + # log transform expression values or not + log: [False] + # ignore pseudotime values (and use experimental + # time points instead), recommended True for synthetic data + # False for real mRNA data + ignorePT: [True] + + - name: "SCODE" + params: + should_run: [True] + D: [2,4,6,8,10] + nIter: [50] + nRep: [2] + #nIter: [50] + #nRep: [2] + # + + - name: "GRISLI" + params: + should_run: [False] + forced: [False] + L: [5] + R: [1500] + #alphaMin: [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] + alphaMin: [0] + + - name: "SINGE" + # This is a total of 480 parameter sets + # use masterscript.py to run these + params: + should_run: [False] + forced: [False] + qsub: [False] + docker: [False] + cleanup: [True] + lambda: [0.01] + # In their evaluations, they only tested specific sets + # of dT and num_lags to not create too many combinations. + # I made an extra parameter for this + dT: [5] + num_lags: [15] + kernel_width: [0.5] + prob_zero_removal: [0] + prob_remove_samples: [0] + num_replicates: [2] + family: ["gaussian"] + + + +# Output Settings: initialize base output folder names +output_settings: + + # Base output directory + output_dir: "outputs" + output_prefix: "mTuck" diff --git a/initialize.sh b/initialize.sh index d63dd91e..41df539b 100755 --- a/initialize.sh +++ b/initialize.sh @@ -54,7 +54,7 @@ echo "Docker container for SCODE is built and tagged as scode:base" cd $BASEDIR/Algorithms/SCRIBE/ docker build -q -t scribe:base . -echo "Docker container for SCRIBE is built and tagged as sincerities:base" +echo "Docker container for SCRIBE is built and tagged as scribe:base" cd $BASEDIR/Algorithms/SINCERITIES/ docker build -q -t sincerities:base .