Skip to content

Commit

Permalink
Merge pull request #18080 from martenson/net-data
Browse files Browse the repository at this point in the history
add Net datatype
  • Loading branch information
mvdbeek authored May 3, 2024
2 parents 65559a8 + d4aa9c4 commit 72e00b2
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 4 deletions.
6 changes: 4 additions & 2 deletions lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,8 @@
<datatype extension="shp" type="galaxy.datatypes.gis:Shapefile" mimetype="application/octet-stream" display_in_upload="true" description="geospatial vector data format for geographic information system"/>
<!-- Flexible Image Transport System (FITS) used in Astronomy https://fits.gsfc.nasa.gov/ https://fits.gsfc.nasa.gov/rfc4047.txt -->
<datatype extension="fits" type="galaxy.datatypes.binary:FITS" mimetype="application/octet-stream" display_in_upload="true" description="Flexible Image Transport System (FITS) used in Astronomy"/>
<datatype extension="chain" type="galaxy.datatypes.chain:Chain" display_in_upload="true"/>
<datatype extension="chain" type="galaxy.datatypes.chain:Chain" display_in_upload="true" description_url="https://genome.ucsc.edu/goldenPath/help/chain.html"/>
<datatype extension="ucsc.net" type="galaxy.datatypes.chain:Net" display_in_upload="true" description_url="https://genome.ucsc.edu/goldenPath/help/net.html"/>
</registration>
<sniffers>
<!--
Expand Down Expand Up @@ -1084,7 +1085,7 @@
<sniffer type="galaxy.datatypes.binary:Edr"/>
<sniffer type="galaxy.datatypes.binary:Vel"/>
<sniffer type="galaxy.datatypes.binary:Xlsx"/>
<sniffer type="galaxy.datatypes.binary:Numpy"/>
<sniffer type="galaxy.datatypes.binary:Numpy"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Metadata"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Artifact"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Visualization"/>
Expand All @@ -1093,6 +1094,7 @@
<sniffer type="galaxy.datatypes.annotation:Augustus"/>
<sniffer type="galaxy.datatypes.xml:Owl"/>
<sniffer type="galaxy.datatypes.chain:Chain"/>
<sniffer type="galaxy.datatypes.chain:Net"/>
<sniffer type="galaxy.datatypes.triples:Rdf"/>
<sniffer type="galaxy.datatypes.blast:BlastXml"/>
<sniffer type="galaxy.datatypes.images:Gifti" />
Expand Down
69 changes: 67 additions & 2 deletions lib/galaxy/datatypes/chain.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Genome browser chain format class
Genome browser alignment formats
"""

import logging
Expand Down Expand Up @@ -98,7 +98,6 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
line = line.strip()
if line: # first non-empty line
if line.startswith("chain"):
# The next line.strip() must not be '', nor startwith '>'
tokens = line.split()
if not (
len(tokens) in [12, 13]
Expand Down Expand Up @@ -127,3 +126,69 @@ def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
else:
return False
return False


@build_sniff_from_prefix
class Net(data.Text):
    """Class describing a net format alignment file"""

    edam_format = "format_3983"
    file_ext = "ucsc.net"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is in net format

        For details see https://genome.ucsc.edu/goldenPath/help/net.html

        Rules for sniffing as True:

            Blank lines before the header are skipped.
            The first non-empty line must consist of exactly three
            whitespace-separated fields: the literal keyword 'net',
            chromName (str) and chromSize (int).
            The line right after the header must be indented with a space.
            If it holds nothing but whitespace, the search for a 'net'
            header starts over on the following lines; otherwise it must be
            a well-formed 'fill' or 'gap' record.
            Only the first "net" line and the first data line are checked.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( '1.chain' )
        >>> Net().sniff( fname )
        False
        >>> fname = get_test_fname( '1.ucsc.net' )
        >>> Net().sniff( fname )
        True
        >>>
        """
        allowed_classes = ("fill", "gap")
        strands = ("+", "-")

        fh = file_prefix.string_io()
        for line in fh:
            line = line.strip()
            if not line:
                # Skip blank lines until the first line with content.
                continue
            tokens = line.split()
            # Compare the whole first field: a prefix test would also accept
            # words that merely begin with "net" (e.g. "network").
            if not (len(tokens) == 3 and tokens[0] == "net" and tokens[2].isdigit()):
                return False
            # Keep reading from the same handle: the header's children follow.
            for child in fh:
                if not child.startswith(" "):  # children are indented one space
                    return False
                tokens = child.split()
                if not tokens:
                    # Whitespace-only line: go back to looking for a header.
                    break
                # The first data line alone decides the outcome.
                return (
                    7 <= len(tokens) <= 41  # seven fixed fields plus seventeen optional name/value pairs
                    and tokens[0] in allowed_classes
                    and tokens[1].isdigit()
                    and tokens[2].isdigit()
                    and tokens[4] in strands
                    and tokens[5].isdigit()
                    and tokens[6].isdigit()
                )
        # No header followed by a data line was found in the prefix.
        return False
25 changes: 25 additions & 0 deletions lib/galaxy/datatypes/test/1.ucsc.net
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
net chr2L 23011544
fill 6004 3278 chrXR_group3a - 1396397 2164 id 25606 score 23114 ali 782 qDup 576 type top tN 0 qN 0 tR 36 qR 0 tTrf 0 qTrf 0
gap 6065 2 chrXR_group3a - 1398498 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6096 1485 chrXR_group3a - 1397572 897 tN 0 qN 0 tR 36 qR 0 tTrf 0 qTrf 0
fill 6096 513 chrU - 5570675 533 id 48675 score 4435 ali 465 qDup 533 type nonSyn tN 0 qN 0 tR 0 qR 13 tTrf 0 qTrf 0
gap 6116 8 chrU - 5571188 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6156 5 chrU - 5571156 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6184 3 chrU - 5571133 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6212 18 chrU - 5571106 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6244 9 chrU - 5571092 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6340 2 chrU - 5570996 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 6515 3 chrU - 5570771 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7623 1 chrXR_group3a - 1397530 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7664 1007 chrXR_group3a - 1397008 482 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
fill 7664 382 chrXL_group1e - 8262003 506 id 25608 score 10609 ali 364 qDup 506 type nonSyn tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7784 4 chrXL_group1e - 8262361 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7792 3 chrXL_group1e - 8262357 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7921 2 chrXL_group1e - 8262126 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 7949 9 chrXL_group1e - 8262092 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 8693 1 chrXR_group3a - 1396985 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
fill 9833 1251 chrU - 5562980 1239 id 48675 score 10720 ali 1124 qDup 1094 type top tN 0 qN 0 tR 22 qR 88 tTrf 0 qTrf 0
gap 9966 7 chrU - 5564075 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 10015 3 chrU - 5564030 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 10088 2 chrU - 5563957 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0
gap 10101 8 chrU - 5563946 0 tN 0 qN 0 tR 0 qR 0 tTrf 0 qTrf 0

0 comments on commit 72e00b2

Please sign in to comment.