-
Notifications
You must be signed in to change notification settings - Fork 14
/
validation.py
168 lines (139 loc) · 6.95 KB
/
validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python3
from __future__ import division, print_function
import pandas
import numpy as np
import os
import json
import sys
from argparse import ArgumentParser
import JSON_templates
# Event-specific settings: adapt these when reusing this script in other
# event workflows. This copy belongs to APAeval:absQuant_2021.
DEFAULT_bench_event_id = "OEBE0070000004"
EVENT = "absQuant_2021"

# Command line interface. Every option is mandatory; the arguments are parsed
# at module level and the resulting namespace is stored in `args`.
parser = ArgumentParser()
parser.add_argument(
    "-i",
    "--participant_data",
    help="Execution workflow output file to be validated",
    required=True,
)
parser.add_argument(
    "-com",
    "--community_name",
    help="name of benchmarking community",
    required=True,
)
parser.add_argument(
    "-c",
    "--challenge_ids",
    nargs='+',
    help="List of challenge ids selected by the user, separated by spaces",
    required=True,
)
parser.add_argument(
    "-p",
    "--participant_name",
    help="name of the tool used for prediction",
    required=True,
)
parser.add_argument(
    "-o",
    "--output",
    help="output path where participant JSON file will be written",
    required=True,
)
parser.add_argument(
    "-gtf",
    "--genome_dir",
    help="genome annotation directory. Used for relative PAS usage calculation. Directory needs to contain genome files with matching organism name from challenge.",
    required=True,
)
args = parser.parse_args()
def select_genome_file(file_name, genome_path):
    """Select the genome annotation file that matches the organism in a name.

    ``file_name`` is split on "." and searched for a component containing
    one of the supported genome strings ("mm" or "hg", tried in that order).
    The first such component is then compared for exact equality against the
    "."-separated components of every entry in ``genome_path``; the first
    entry with an equal component is returned.

    The expected format is ``name.mm10.ext`` or ``name.hg38extension.ext``,
    with matching genome annotations ``gencode.mm10.gtf`` and
    ``gencode.hg38extension.gtf``.
    Note: no check for the extension (e.g. gtf) is done.

    Args:
        file_name (str): Name containing organism information.
            Supported: mm* and hg*.
        genome_path (str): Directory containing genome annotations in gtf
            format.

    Returns:
        str: Path of the matching genome file inside ``genome_path``.

    Raises:
        AssertionError: If ``genome_path`` does not exist.
        ValueError: If ``file_name`` contains no genome component, or if no
            entry of ``genome_path`` (including an empty directory) carries
            that component.
    """
    GENOME_STRINGS = ["mm", "hg"]
    SPLITSTRING = "."

    # Raised explicitly rather than via `assert` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not os.path.exists(genome_path):
        raise AssertionError(
            f"Genome annotation directory not found: {genome_path}"
        )

    file_components = file_name.split(SPLITSTRING)

    # search for genome
    match = []
    for genome_string in GENOME_STRINGS:
        match = [comp for comp in file_components if genome_string in comp]
        if match:
            break
    if not match:
        raise ValueError(f"No genome string: {GENOME_STRINGS} in file_name: {file_name} found.")
    genome_key = match[0]

    # Look through the genome directory for an entry with an exactly equal
    # component. Returning from inside the loop means an empty directory
    # falls through to the ValueError below instead of hitting an unbound
    # variable.
    for entry in os.listdir(genome_path):
        if genome_key in entry.split(SPLITSTRING):
            return os.path.join(genome_path, entry)

    raise ValueError(f"No genome string: {GENOME_STRINGS} in genome_path: {genome_path} found.")
def main(args):
    """Select the challenge(s) for a participant file and start validation.

    The sample name is the second "."-separated field of the participant
    file name. Challenges whose first "."-separated field equals that sample
    name are selected. For every selected challenge the matching genome
    annotation is read to collect its sequence names (first tab-separated
    column of each non-comment line); if several challenges are selected,
    the names of the last one are the ones passed on. Finally the output
    directory is created if needed and ``validate_input_data`` is called.

    Args:
        args: Parsed command line namespace providing ``participant_data``,
            ``community_name``, ``challenge_ids``, ``participant_name``,
            ``output`` and ``genome_dir``.

    Raises:
        SystemExit: If no sample name can be derived from the file name or
            no challenge matches it.
        AssertionError: If an annotation mixes sequence names with and
            without "chr".
    """
    # input parameters
    participant_input = args.participant_data
    community = args.community_name
    challenge_ids = args.challenge_ids
    participant_name = args.participant_name
    out_path = args.output
    genome_path = args.genome_dir

    print(f"INFO: input {participant_input}")
    print(f"INFO: Possible challenges {challenge_ids}")

    # Split the file *name*, not the whole path: dots in the directory part
    # (e.g. "./file" or "v1.2/file") would otherwise shift the field index.
    name_fields = os.path.basename(str(participant_input)).split('.')
    if len(name_fields) < 2:
        sys.exit(
            f"ERROR: Cannot derive a sample name from {participant_input}: "
            "the file name needs at least two '.'-separated fields."
        )
    sample_name = name_fields[1]

    challenges = [c for c in challenge_ids if c.split('.')[0] == sample_name]
    print(f"INFO: Selected challenge(s) {challenges}")
    if not challenges:
        # Without a challenge there is no annotation to validate against;
        # stop here with a clear message instead of failing later on an
        # unbound chr_names.
        sys.exit(
            f"ERROR: None of the challenges {challenge_ids} matches sample "
            f"{sample_name} of input {participant_input}"
        )

    chr_names = []
    for challenge in challenges:
        # Check annotation files for all challenges
        gtf = select_genome_file(challenge, genome_path)
        print(f"INFO: Selected genome file {gtf}")
        with open(gtf, 'r') as f:
            # gtf is always tab-separated and first column is always seqname.
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # blank lines are skipped so they don't count as a seqname.
            chr_names = list(dict.fromkeys(
                row.split('\t', 1)[0]
                for row in f
                if row.strip() and not row.startswith('#')
            ))
        seqnames_wchr = [s for s in chr_names if 'chr' in s]
        # Either all names contain 'chr' or none does. Raised explicitly so
        # the check is not stripped under -O.
        if seqnames_wchr and len(seqnames_wchr) != len(chr_names):
            raise AssertionError(
                f"WARNING: {genome_path} has a mix of chromosome name formats!"
            )

    # Assuring the output path for validation.json does exist.
    # A bare file name has an empty dirname; there is nothing to create then.
    out_dir = os.path.dirname(out_path)
    if out_dir and not os.path.exists(out_dir):
        try:
            print(out_dir)
            os.makedirs(out_dir)
            with open(out_path, mode="a"):
                pass
        except OSError as exc:
            print("OS error: {0}".format(exc) + "\nCould not create output path: " + out_path)

    validate_input_data(participant_input, sample_name, community, challenges, participant_name, out_path, chr_names)
def validate_input_data(infile, sample_name, community, challenges, participant_name, out_path, chr_names):
    """Check a participant file for bed6 format and write the result as JSON.

    The file is read as a tab-separated table without header ("#" starts a
    comment). Four checks are run and reported on stdout:

    * exactly six columns,
    * columns 2 and 3 (start, end) are of dtype int64,
    * column 6 (strand) contains both "+" and "-" and nothing else,
    * every value of column 1, as a string, is contained in ``chr_names``.

    The participant dataset JSON built by
    ``JSON_templates.write_participant_dataset`` is written to ``out_path``
    regardless of the outcome. The function never returns normally: it ends
    the process via ``sys.exit``.

    Args:
        infile: Path of the participant file to validate.
        sample_name: Sample name, used as part of the dataset id.
        community: Benchmarking community name.
        challenges: Challenge ids the file takes part in.
        participant_name: Name of the participating tool.
        out_path: Path of the JSON file to write.
        chr_names: Accepted sequence names.

    Raises:
        SystemExit: With code 0 if all checks pass; with an error message if
            the file cannot be read or any check fails.
    """
    # get participant output (= input to be validated)
    try:
        participant_data = pandas.read_csv(infile, sep='\t',
                                           comment="#", header=None)
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not turned into a "could not be read" message.
        sys.exit("ERROR: Submitted data file {} could not be read!".format(infile))

    #---------------------------------------------------
    # INPUT FILE VALIDATION
    # FOR APAeval QUANTIFICATION:
    # Check for valid bed6 format
    # Columns are addressed by position below. Each check first makes sure
    # the column exists, so a file with too few columns is reported as
    # invalid (and still gets its JSON) instead of raising.
    n_cols = len(participant_data.columns)

    ## check number of columns
    n_col_check = n_cols == 6
    print(f"INFO: Columns check returned {n_col_check}")

    ## check start and end coordinates
    coord_check = bool(
        n_cols >= 3
        and participant_data.iloc[:, 1].dtype == np.int64
        and participant_data.iloc[:, 2].dtype == np.int64
    )
    print(f"INFO: Coordinate check returned {coord_check}")

    ## check strands
    # Both strands must be present and no other value may occur.
    if n_cols >= 6:
        strand_check = set(participant_data.iloc[:, 5].values) == {'+', '-'}
    else:
        strand_check = False
    print(f"INFO: Strand check returned {strand_check}")

    ## check ref seq format of chromosomes
    accepted_chr = set(chr_names)
    data_chr = set(participant_data.iloc[:, 0].values)
    chr_check = all(str(name) in accepted_chr for name in data_chr)
    print(f"INFO: Chromosome check returned {chr_check}")

    ## All checks true?
    validated = n_col_check and coord_check and strand_check and chr_check
    if not validated:
        print(f"WARNING: Submitted file {infile} does not comply with required bed format.")
    #----------------------------------------------------

    data_id = ":".join([community, EVENT, sample_name, participant_name])
    output_json = JSON_templates.write_participant_dataset(data_id, community, challenges, participant_name, validated)

    # print validated participant file
    with open(out_path, 'w') as f:
        json.dump(output_json, f, sort_keys=True, indent=4, separators=(',', ': '))

    # Only pass if all input files are valid
    if validated:
        sys.exit(0)
    else:
        sys.exit("ERROR: One or more of the submitted files don't comply with APAeval specified format! Please check " + out_path)
# Run the validation only when executed as a script; `args` is the namespace
# parsed at module level.
if __name__ == "__main__":
    main(args)