diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..ec750ad --- /dev/null +++ b/.travis.yml @@ -0,0 +1,9 @@ +language: python +python: + - "2.7" + - "3.5" + - "3.6" +install: + travis_retry python setup.py install +script: + travis_retry python setup.py test diff --git a/README.md b/README.md index 02d6ae9..ddbfff9 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +[![Build Status](https://travis-ci.org/morrislab/rnascan.svg?branch=master)](https://travis-ci.org/morrislab/rnascan) +![Version](https://img.shields.io/badge/version-0.10.2-brightgreen.svg) +[![GitHub license](https://img.shields.io/github/license/morrislab/rnascan.svg)](https://github.com/morrislab/rnascan/blob/master/LICENSE) + # rnascan rnascan is a (mostly) Python suite to scan RNA sequences and secondary structures with sequence and secondary structure PFMs. Secondary structure is represented as weights in different secondary structure contexts, similar to how a PFM represents weights of different nucleotides or amino acids. This allows representation and use of secondary structures in a way that is similar to how PFMs are used to scan nucleotide sequences, and also allows for some flexibility in the structure, as you might find in the boltzmann distribution of secondary structures. @@ -44,7 +48,7 @@ g++ -o ~/bin/parse_secondary_structure scripts/parse_secondary_structure.cpp ### 4. Install `rnascan` Python components -This package was written for Python 2.7.x or later (_not compatible with Python 3 yet_). To install the package, run the following: +This package requires Python 2.7+ or Python 3.5+. To install the package, run the following: ``` python setup.py install @@ -80,7 +84,7 @@ Here are some example commands using minimal options: ``` # To run a test sequence -rnascan -p pfm_seq.txt -s AGTTCCGGTCCGGCAGAGATCGCG > hits.tab +rnascan -p pfm_seq.txt -t AGTTCCGGTCCGGCAGAGATCGCG > hits.tab # Sequence-only (use -p) rnascan -p pfm_seq.txt sequences.fasta > hits.tab diff --git a/rnascan/BioAddons/motifs/matrix.py b/rnascan/BioAddons/motifs/matrix.py index cbb9137..03bfde2 100644 --- a/rnascan/BioAddons/motifs/matrix.py +++ b/rnascan/BioAddons/motifs/matrix.py @@ -6,12 +6,13 @@ # - from Bio.motifs import matrix from Bio.Alphabet import NucleotideAlphabet import platform -class ExtendedPositionSpecificScoringMatrix(matrix.PositionSpecificScoringMatrix): + +class ExtendedPositionSpecificScoringMatrix( + matrix.PositionSpecificScoringMatrix): """This new class inherits Bio.motifs.matrix.PositionSpecificScoringMatrix. It has been modified to support any kind of Alphabet. This allows us to perform motif scans on RNA sequence as well as RNA secondary structure. @@ -45,6 +46,7 @@ def _py_calculate(self, sequence, m, n): # Fall back to the slower Python implementation if Jython or IronPython. try: from . import _pwm + def _calculate(self, sequence, m, n): # Only RNA and DNA is supported right now. If sequence is @@ -54,7 +56,7 @@ def _calculate(self, sequence, m, n): letters = ''.join(sorted(self.alphabet.letters)) logodds = [[self[letter][i] for letter in letters] - for i in range(m)] + for i in range(m)] return self._pwm.calculate(sequence, logodds) except ImportError: if platform.python_implementation() == 'CPython': @@ -63,7 +65,6 @@ def _calculate(self, sequence, m, n): def _calculate(self, sequence, m, n): return self._py_calculate(sequence, m, n) - def calculate(self, sequence): # TODO - Force uppercase here and optimise switch statement in C diff --git a/rnascan/average_structure.py b/rnascan/average_structure.py index 9058dba..6d122d5 100644 --- a/rnascan/average_structure.py +++ b/rnascan/average_structure.py @@ -20,7 +20,7 @@ from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq -from pfmutil import norm_pfm +from .pfmutil import norm_pfm import subprocess import numpy as np @@ -44,7 +44,7 @@ def struct_pfm_from_aligned(sequences): def get_structure_probability_matrix_for_sequence(id,seq,frag_length,overlap): aligned_annotated_sequences = [] - for i in xrange(-frag_length/2,len(seq)-frag_length/2,frag_length-overlap): + for i in range(-frag_length/2,len(seq)-frag_length/2,frag_length-overlap): temphandle = tempfile.NamedTemporaryFile(delete=False,mode='r+b') # for centroid structure temphandle2 = tempfile.NamedTemporaryFile(delete=False,mode='r+b') # for output of structure parser @@ -108,7 +108,7 @@ def get_structure_probability_matrix_from_probabilities(id,seq,frag_length): probabilities = {} - for alph,p in programs.iteritems(): + for alph,p in programs.items(): args = [p] + plfold_args plfold_proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None) #plfold_output = plfold_proc.communicate(input_record.format("fasta"))[0] diff --git a/rnascan/pfmutil.py b/rnascan/pfmutil.py index d0745b1..3598885 100644 --- a/rnascan/pfmutil.py +++ b/rnascan/pfmutil.py @@ -45,7 +45,7 @@ def read_pfm(pfmfile): pfm = {} with open(pfmfile) as f: reader = csv.reader(f, delimiter='\t') - headerline = reader.next() + headerline = next(reader) alphabet = headerline[1:] for base in alphabet: pfm[base] = [] @@ -244,9 +244,9 @@ def reduce_pfm_alphabet(pfm): pfm = read_pfm(file) - print(format_pfm(pfm)) + print((format_pfm(pfm))) reduced_pfm = reduce_pfm_alphabet(pfm) - print(format_pfm(reduced_pfm)) + print((format_pfm(reduced_pfm))) diff --git a/rnascan/rnascan.py b/rnascan/rnascan.py index 7f2189c..cb38e0a 100644 --- a/rnascan/rnascan.py +++ b/rnascan/rnascan.py @@ -15,6 +15,8 @@ # You should have received a copy of the GNU Affero General Public License # along with rnascan. If not, see . + +from __future__ import print_function import sys import time import glob @@ -29,20 +31,19 @@ import multiprocessing import pandas as pd import numpy as np -from itertools import izip, repeat -from BioAddons.Alphabet import ContextualSecondaryStructure -from BioAddons.motifs import matrix +from itertools import repeat +from .BioAddons.Alphabet import ContextualSecondaryStructure +from .BioAddons.motifs import matrix from Bio import motifs, SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import RNAAlphabet, IUPAC - -__version__ = 'v0.9.1' +from .version import __version__ def getoptions(): desc = "Scan sequence for motif binding sites. Results sent to STDOUT." - parser = argparse.ArgumentParser(description=desc, version=__version__) + parser = argparse.ArgumentParser(description=desc) parser.add_argument('fastafiles', metavar='FASTA', nargs='*', help="Input sequence and structure FASTA files") pfm_grp = parser.add_argument_group("PFM options") @@ -86,6 +87,8 @@ def getoptions(): bg_grp.add_argument('-B', '--bg_struct', default=None, dest="bg_struct", help=("Load file of pre-computed background " "probabilities for nucleotide sequences")) + parser.add_argument('-v', '--version', action='version', + version='%(prog)s ' + __version__) parser.add_argument('-x', '--debug', action="store_true", default=False, dest="debug", help=("Turn on debug mode " @@ -102,6 +105,9 @@ def getoptions(): return args +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + ############################################################################### # Sequence functions ############################################################################### @@ -112,12 +118,12 @@ def _guess_seq_type(args): if nfiles == 2: if not (args.pfm_seq or args.pfm_struct): - print >> sys.stderr, "Missing PFMs" + eprint("Missing PFMs") sys.exit(1) seq_type = "RNASS" else: # nfiles == 1 if args.pfm_seq and args.pfm_struct and not args.testseq: - print >> sys.stderr, "Can't specify two PFMs with one input file" + eprint("Can't specify two PFMs with one input file") sys.exit(1) elif args.pfm_seq and args.pfm_struct and args.testseq: seq_type = "RNASS" @@ -126,7 +132,7 @@ def _guess_seq_type(args): elif args.pfm_struct: seq_type = "SS" else: - print >> sys.stderr, "Must specify PFMs with -p and/or -q" + eprint("Must specify PFMs with -p and/or -q") sys.exit(1) return seq_type @@ -150,7 +156,7 @@ def batch_iterator(iterator, batch_size): batch = [] while len(batch) < batch_size: try: - entry = iterator.next() + entry = next(iterator) except StopIteration: entry = None if entry is None: @@ -205,25 +211,27 @@ def load_motif(pfm_file, *args): """ Load PFM """ motifs_set = {} - print >> sys.stderr, "Loading PFM %s" % pfm_file, + eprint("Loading PFM %s" % pfm_file, end="") tic = time.time() try: motif_id = os.path.splitext(os.path.basename(pfm_file))[0] motifs_set[motif_id] = pfm2pssm(pfm_file, *args) except ValueError: - print >> sys.stderr, "\nFailed to load motif %s" % pfm_file + eprint("\nFailed to load motif %s" % pfm_file) except KeyError: - print >> sys.stderr, "\nFailed to load motif %s" % pfm_file - print >> sys.stderr, "Check that you are using the correct --type" + eprint("\nFailed to load motif %s" % pfm_file) + eprint("Check that you are using the correct --type") raise except: - print "Unexpected error:", sys.exc_info()[0] + eprint("Unexpected error: %s" % sys.exc_info()[0]) raise - print >> sys.stderr, "\b.", + eprint("\b.", end="") sys.stderr.flush() toc = time.time() - print >> sys.stderr, "done in %0.2f seconds!" % (float(toc - tic)) - print >> sys.stderr, "Found %d motifs" % len(motifs_set) + eprint("done in %0.2f seconds!" % (float(toc - tic))) + eprint("Found %d motifs" % len(motifs_set)) + if len(motifs_set) == 0: + raise ValueError("No motifs found.") return motifs_set @@ -251,7 +259,7 @@ def scan(pssm, seq, alphabet, minscore): """ Core scanning function """ results = [] - (motif_id, pm) = pssm.items()[0] + (motif_id, pm) = list(pssm.items())[0] for position, score in pm.search(seq, threshold=minscore, both=False): end_position = position + len(pm.consensus) @@ -287,13 +295,13 @@ def scan_averaged_structure(struct_file, pssm, minscore): """ struct = pd.read_table(struct_file) del struct['PO'] - (motif_id, pm) = pssm.items()[0] + (motif_id, pm) = list(pssm.items())[0] motif_scores = [] pm = pd.DataFrame(pm) # Convert dict back to data frame N = len(pm.index) - for i in xrange(0, len(struct.index) - N + 1): + for i in range(0, len(struct.index) - N + 1): score = 0 - for j in xrange(0, N): + for j in range(0, N): # Multiply by SSM score += np.nan_to_num(np.dot(struct.iloc[i + j, :], pm.iloc[j, :])) @@ -318,6 +326,12 @@ def _add_sequence_id(df, seq_id, description): df['Description'] = description +def _add_match_id(df): + """Insert a unique identifier between 1...n + """ + df['Match_ID'] = list(range(1, df.shape[0] + 1)) + + def scan_main(fasta_file, pssm, alphabet, bg, args): """ Main function for handling scanning of PSSM and a sequence/structure """ @@ -332,7 +346,7 @@ def scan_main(fasta_file, pssm, alphabet, bg, args): results = [] if os.path.isdir(fasta_file): - print >> sys.stderr, "Scanning averaged secondary structures " + eprint("Scanning averaged secondary structures ") structures = glob.glob(fasta_file + "/structure.*.txt") if len(structures) == 0: @@ -348,7 +362,7 @@ def scan_main(fasta_file, pssm, alphabet, bg, args): else: p = multiprocessing.Pool(args.cores) batch_results = p.map(_scan_averaged_structure, - izip(structures, repeat(pssm), + zip(structures, repeat(pssm), repeat(args.minscore))) for j, hits in enumerate(batch_results): if hits is None: @@ -360,7 +374,7 @@ def scan_main(fasta_file, pssm, alphabet, bg, args): results.append(hits) p.close() else: - print >> sys.stderr, "Scanning sequences " + eprint("Scanning sequences ") seq_iter = parse_sequences(fasta_file) @@ -373,7 +387,7 @@ def scan_main(fasta_file, pssm, alphabet, bg, args): else: p = multiprocessing.Pool(args.cores) for i, batch in enumerate(batch_iterator(seq_iter, 2000)): - batch_results = p.map(_scan_all, izip(batch, + batch_results = p.map(_scan_all, zip(batch, repeat(pssm), repeat(alphabet), repeat(args.minscore) @@ -393,7 +407,7 @@ def scan_main(fasta_file, pssm, alphabet, bg, args): if len(results) != 0: final = pd.concat(results) - print >> sys.stderr, "Processed %d sequences" % count + eprint("Processed %d sequences" % count) cols = final.columns.tolist() cols = cols[-2:] + cols[:-2] return final[cols] @@ -426,7 +440,7 @@ def combine(seq_results, struct_results): def compute_background(fastas, alphabet, verbose=True): """ Compute background probabiilities from all input sequences """ - print >> sys.stderr, "Calculating background probabilities..." + eprint("Calculating background probabilities...") content = defaultdict(int) total = len(alphabet.letters) # add psuedocount for each letter seq_iter = parse_sequences(fastas) @@ -439,14 +453,14 @@ def compute_background(fastas, alphabet, verbose=True): total += amount pct_sum = 0 - for letter, count in content.iteritems(): + for letter, count in content.items(): content[letter] = (float(count) + 1) / total # add pseudocount if content[letter] <= 0.05: warnings.warn("Letter %s has low content: %0.2f" % (letter, content[letter]), Warning) pct_sum += content[letter] - if verbose: print >> sys.stderr, dict(content) + if verbose: eprint(dict(content)) assert abs(1.0 - pct_sum) < 0.0001, "Background sums to %f" % pct_sum return content @@ -456,14 +470,13 @@ def load_background(bg_file, uniform, *args): input files or use uniform """ if bg_file: - print >> sys.stderr, ("Reading custom background probabilities " - "from %s" % file) + eprint("Reading custom background probabilities from %s" % bg_file) # load custom background # http://stackoverflow.com/a/11027069 with open(bg_file, 'r') as fin: bg = fin.read() bg = ast.literal_eval(bg) - print >> sys.stderr, dict(bg) + eprint(dict(bg)) elif not uniform: bg = compute_background(*args) else: @@ -507,7 +520,7 @@ def main(): IUPAC.IUPACUnambiguousRNA(), bg, args) else: - print dict(bg) + print(dict(bg)) sys.exit() ## Structure @@ -536,24 +549,30 @@ def main(): ContextualSecondaryStructure(), bg, args) else: - print dict(bg) + print(dict(bg)) sys.exit() if seq_type == 'RNASS': combined_results = combine(seq_results, struct_results) + combined_results.reset_index(drop=True) + _add_match_id(combined_results) combined_results.to_csv(sys.stdout, sep="\t", index=False) elif seq_type == 'RNA': + seq_results.reset_index(drop=True) + _add_match_id(seq_results) seq_results.to_csv(sys.stdout, sep="\t", index=False) else: + struct_results.reset_index(drop=True) + _add_match_id(struct_results) struct_results.to_csv(sys.stdout, sep="\t", index=False) toc = time.time() runtime = float(toc - tic) if runtime > 60: - print >> sys.stderr, "Done in %0.4f minutes!" % (runtime / 60) + eprint("Done in %0.4f minutes!" % (runtime / 60)) else: - print >> sys.stderr, "Done in %0.4f seconds!" % (runtime) + eprint("Done in %0.4f seconds!" % (runtime)) if __name__ == '__main__': diff --git a/rnascan/version.py b/rnascan/version.py new file mode 100644 index 0000000..85b551d --- /dev/null +++ b/rnascan/version.py @@ -0,0 +1 @@ +__version__ = '0.10.2' diff --git a/scripts/rnascan b/scripts/rnascan deleted file mode 100755 index 368260e..0000000 --- a/scripts/rnascan +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -# Copyright (C) 2016-2017 Kevin Ha -# -# This file is part of rnascan. -# -# rnascan is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# rnascan is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with rnascan. If not, see . -from rnascan import rnascan -rnascan.main() diff --git a/setup.py b/setup.py index ab3ce78..fd8ea05 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,16 @@ +import os.path import sys from setuptools import find_packages from distutils.core import setup, Extension if sys.version_info < (2, 7): - sys.stderr.write("At least Python 2.7 is required\n") - sys.exit(1) + sys.stderr.write("rnascan requires Python 2.7, or Python 3.5 or later. " + "Python %d.%d detected.\n" % sys.version_info[:2]) + sys.exit(1) +elif sys.version_info[0] == 3 and sys.version_info[:2] < (3, 5): + sys.stderr.write("rnascan requires Python 2.7, or Python 3.5 or later. " + "Python %d.%d detected.\n" % sys.version_info[:2]) + sys.exit(1) # Borrowing setup.py code from Biopython @@ -43,19 +49,30 @@ def is_Numpy_installed(): include_dirs=[numpy_include_dir], )) +here = os.path.abspath(os.path.dirname(__file__)) +exec(open(os.path.join(here, 'rnascan/version.py')).read()) + + setup(name='rnascan', - version='0.9.1', + version=__version__, description='Scan RBP motifs and secondary structure from SSMs', url='http://github.com/morrislab/rnascan', author='Kevin Ha, Kate Cook, Kaitlin Laverty', author_email='k.ha@mail.utoronto.ca, kate.cook@gmail.com, kaitlin.laverty@mail.utoronto.ca', license='AGPLv3', packages=find_packages(), - scripts=['scripts/rnascan', 'scripts/run_folding'], + scripts=['scripts/run_folding'], install_requires=['setuptools', 'pandas >= 0.17', 'numpy >= 1.10.0', 'biopython >= 1.66'], + entry_points={ + 'console_scripts': [ + 'rnascan = rnascan.rnascan:main' + ] + }, ext_modules=EXTENSIONS, - zip_safe=False + zip_safe=False, + test_suite='nose.collector', + tests_require=['nose'] ) diff --git a/tests/motif_scan_test.py b/tests/motif_scan_test.py index b96d15e..4e6e5f9 100644 --- a/tests/motif_scan_test.py +++ b/tests/motif_scan_test.py @@ -1,9 +1,10 @@ +from __future__ import print_function import unittest import sys import types -sys.path.append("..") -import motif_scan.motif_scan as ms -from motif_scan.BioAddons.Alphabet import * +import os.path +import rnascan.rnascan as ms +from rnascan.BioAddons.Alphabet import * from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC @@ -12,12 +13,12 @@ class ParseSequencesTestCase(unittest.TestCase): def setUp(self): - self.fastas = ['test.fa'] + self.fastas = [os.path.join(os.path.dirname(__file__), 'test.fa')] def test_parse_sequences_1(self): '''Test parsing of test sequences ''' - target = ms.parse_sequences(self.fastas, IUPAC.IUPACUnambiguousRNA()) + target = ms.parse_sequences(self.fastas) self.assertIsInstance(target, types.GeneratorType) for item in target: @@ -27,7 +28,7 @@ def test_parse_sequences_1(self): class ComputeBackgroundTestCase(unittest.TestCase): def setUp(self): - self.fastas = ['test.fa'] + self.fastas = [os.path.join(os.path.dirname(__file__), 'test.fa')] def test_compute_background_1(self): target = ms.compute_background(self.fastas, @@ -38,10 +39,10 @@ def test_compute_background_1(self): 'U': 0.5277, 'G': 0.1388} - for key,value in expected.iteritems(): + for key,value in expected.items(): self.assertAlmostEqual(target[key], value, 3) if __name__ == '__main__': - print sys.argv[0] + print(sys.argv[0]) unittest.main() diff --git a/tests/preprocess_seq_test.py b/tests/preprocess_seq_test.py index 450e021..0946c53 100644 --- a/tests/preprocess_seq_test.py +++ b/tests/preprocess_seq_test.py @@ -1,8 +1,8 @@ +from __future__ import print_function import unittest import sys -sys.path.append("..") -import motif_scan.motif_scan as ms -from motif_scan.BioAddons.Alphabet import * +import rnascan.rnascan as ms +from rnascan.BioAddons.Alphabet import * from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC, SingleLetterAlphabet @@ -47,11 +47,11 @@ def test_preprocessSeq_5(self): def test_preprocessSeq_6(self): '''Test preprocess_seq() on RNA alphabet''' - seqrec = SeqRecord(Seq('KHIL', ContextualSequenceSecondaryStructure())) + seqrec = SeqRecord(Seq('KHIL', ContextualSecondaryStructure())) target = ms.preprocess_seq(seqrec, IUPAC.IUPACUnambiguousRNA()) expected = 'KHIL' self.assertEqual(str(target), expected) if __name__ == '__main__': - print sys.argv[0] + print(sys.argv[0]) unittest.main()