Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Net datatype #18080

Merged
merged 2 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,7 @@
<!-- Flexible Image Transport System (FITS) used in Astronomy https://fits.gsfc.nasa.gov/ https://fits.gsfc.nasa.gov/rfc4047.txt -->
<datatype extension="fits" type="galaxy.datatypes.binary:FITS" mimetype="application/octet-stream" display_in_upload="true" description="Flexible Image Transport System (FITS) used in Astronomy"/>
<datatype extension="chain" type="galaxy.datatypes.chain:Chain" display_in_upload="true"/>
<datatype extension="net" type="galaxy.datatypes.chain:Net" display_in_upload="true"/>
martenson marked this conversation as resolved.
Show resolved Hide resolved
</registration>
<sniffers>
<!--
Expand Down Expand Up @@ -1084,7 +1085,7 @@
<sniffer type="galaxy.datatypes.binary:Edr"/>
<sniffer type="galaxy.datatypes.binary:Vel"/>
<sniffer type="galaxy.datatypes.binary:Xlsx"/>
<sniffer type="galaxy.datatypes.binary:Numpy"/>
<sniffer type="galaxy.datatypes.binary:Numpy"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Metadata"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Artifact"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Visualization"/>
Expand All @@ -1093,6 +1094,7 @@
<sniffer type="galaxy.datatypes.annotation:Augustus"/>
<sniffer type="galaxy.datatypes.xml:Owl"/>
<sniffer type="galaxy.datatypes.chain:Chain"/>
<sniffer type="galaxy.datatypes.chain:Net"/>
<sniffer type="galaxy.datatypes.triples:Rdf"/>
<sniffer type="galaxy.datatypes.blast:BlastXml"/>
<sniffer type="galaxy.datatypes.images:Gifti" />
Expand Down
69 changes: 67 additions & 2 deletions lib/galaxy/datatypes/chain.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Genome browser chain format class
Genome browser alignment formats
"""

import logging
Expand Down Expand Up @@ -98,7 +98,6 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
line = line.strip()
if line: # first non-empty line
if line.startswith("chain"):
# The next line.strip() must not be '', nor startwith '>'
tokens = line.split()
if not (
len(tokens) in [12, 13]
Expand Down Expand Up @@ -127,3 +126,69 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
else:
return False
return False


@build_sniff_from_prefix
class Net(data.Text):
    """Class describing a net format alignment file"""

    edam_format = "format_3983"
    file_ext = "net"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is in net format

        For details see https://genome.ucsc.edu/goldenPath/help/net.html

        Rules for sniffing as True:

            Leading empty lines are skipped.

            The first non-empty line must consist of exactly three fields: the keyword 'net',
            followed by chromName (str) and chromSize (int).

            The line that follows must be indented by a space and hold a 'fill' or 'gap' record:
            seven fixed fields (class, start, size, name, strand, start, size) optionally
            followed by name/value pairs, for at most 41 fields in total.

            We will only check that the first "net" line and the first data line are formatted correctly.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( '1.chain' )
        >>> Net().sniff( fname )
        False
        >>> fname = get_test_fname( '1.net' )
        >>> Net().sniff( fname )
        True
        >>>
        """
        allowed_classes = ("fill", "gap")
        strands = ("+", "-")

        fh = file_prefix.string_io()
        for line in fh:
            line = line.strip()
            if not line:
                # Still looking for the first non-empty line.
                continue

            header = line.split()
            # Compare the keyword itself rather than a line prefix, so that a first
            # token such as "network" is not mistaken for a net header.
            if not (len(header) == 3 and header[0] == "net" and header[2].isdigit()):
                return False

            # Both loops draw from the same handle, so this picks up right after the
            # header. Every path below returns or breaks, so at most one line is
            # examined here.
            for child in fh:
                if child[0] != " ":  # children are indented one space
                    return False
                child = child.strip()
                if child == "":
                    # Indented but otherwise blank line: leave the child scan; the
                    # outer loop then treats the next non-empty line as a header.
                    break
                tokens = child.split()
                return (
                    7 <= len(tokens) <= 41  # seven fixed fields plus seventeen optional name/value pairs
                    and tokens[0] in allowed_classes
                    and tokens[1].isdigit()
                    and tokens[2].isdigit()
                    and tokens[4] in strands
                    and tokens[5].isdigit()
                    and tokens[6].isdigit()
                )
        # Ran out of lines without finding a header followed by a data line.
        return False
25 changes: 25 additions & 0 deletions lib/galaxy/datatypes/test/1.net
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
net chr2L 23011544
fill 6004 3278 chrXR_group3a - 1396397 2164 id 25606 score 23114 ali 782 qDup 576 type top tN 0 qN 0 tR 36 qR 0 tTrf 0 qTrf 0
gap 6065 2 chrXR_group3a - 1398498 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6096 1485 chrXR_group3a - 1397572 897 tN 0 qN 0 tR 36 qR 0 tTrf 0 qTrf 0
fill 6096 513 chrU - 5570675 533 id 48675 score 4435 ali 465 qDup 533 type nonSyn tN 0 qN 0 tR 0 qR 13 tTrf 0 qTrf 0
gap 6116 8 chrU - 5571188 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6156 5 chrU - 5571156 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6184 3 chrU - 5571133 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6212 18 chrU - 5571106 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6244 9 chrU - 5571092 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6340 2 chrU - 5570996 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6515 3 chrU - 5570771 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7623 1 chrXR_group3a - 1397530 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7664 1007 chrXR_group3a - 1397008 482 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
fill 7664 382 chrXL_group1e - 8262003 506 id 25608 score 10609 ali 364 qDup 506 type nonSyn tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7784 4 chrXL_group1e - 8262361 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7792 3 chrXL_group1e - 8262357 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7921 2 chrXL_group1e - 8262126 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7949 9 chrXL_group1e - 8262092 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 8693 1 chrXR_group3a - 1396985 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
fill 9833 1251 chrU - 5562980 1239 id 48675 score 10720 ali 1124 qDup 1094 type top tN 0 qN 0 tR 22 qR 88 tTrf 0 qTrf 0
gap 9966 7 chrU - 5564075 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 10015 3 chrU - 5564030 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 10088 2 chrU - 5563957 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 10101 8 chrU - 5563946 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
Loading