Skip to content

Commit

Permalink
Merge pull request #4 from apcamargo/rust-kmers
Browse files Browse the repository at this point in the history
Version 0.2.0
  • Loading branch information
apcamargo authored Oct 21, 2019
2 parents 7769985 + bfc9850 commit 6631c37
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 24 deletions.
13 changes: 13 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "rnasamba"
version = "0.2.0"
authors = ["Antonio Camargo <[email protected]>"]
edition = "2018"

[lib]
name = "rnasamba"
crate-type = ["cdylib"]

[dependencies.pyo3]
version = "0.8.1"
features = ["extension-module"]
4 changes: 3 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
include LICENSE
include LICENSE
include Cargo.toml
recursive-include src *
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
--version show program's version number and exit
-s EARLY_STOPPING, --early_stopping EARLY_STOPPING
number of epochs after lowest validation loss before
stopping training (a fraction of 0.1 of the training
Expand Down Expand Up @@ -78,6 +79,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
--version show program's version number and exit
-p PROTEIN_FASTA, --protein_fasta PROTEIN_FASTA
output FASTA file containing translated sequences for
the predicted coding ORFs. (default: None)
Expand Down
2 changes: 1 addition & 1 deletion rnasamba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
#
# Contact: [email protected]

from rnasamba.core.model import RNAsambaClassificationModel, RNAsambaTrainModel
from rnasamba.core.model import RNAsambaClassificationModel, RNAsambaTrainModel
3 changes: 3 additions & 0 deletions rnasamba/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def train(args):


def classify_cli(parser):
parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
parser.set_defaults(func=classify)
parser.add_argument(
'output_file',
Expand Down Expand Up @@ -78,6 +79,7 @@ def classify_cli(parser):

def train_cli(parser):
parser.set_defaults(func=train)
parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
parser.add_argument(
'output_file',
help='output HDF5 file containing weights of the newly trained RNAsamba network.',
Expand Down Expand Up @@ -126,6 +128,7 @@ def cli():
description='Coding potential calculation using deep learning.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
subparsers = parser.add_subparsers()
classify_parser = subparsers.add_parser(
'classify',
Expand Down
30 changes: 10 additions & 20 deletions rnasamba/core/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from rnasamba.core.kmer import count_kmers


def read_fasta(filename, tokenize=False):
seqs = []
Expand All @@ -44,20 +46,20 @@ def read_fasta(filename, tokenize=False):
return seqs


def tokenize_dna(seq):
def tokenize_dna(sequence):
lookup = dict(zip('NATCG', range(5)))
if not seq:
if not sequence:
token = [0]
else:
token = [lookup[c] for c in seq if c in lookup]
token = [lookup[c] for c in sequence if c in lookup]
return token


def longest_orf(input_seq):
def longest_orf(sequence):
start_codon = re.compile('ATG')
longest = (0, 0, '')
for m in start_codon.finditer(input_seq):
putative_orf = input_seq[m.start() :]
for m in start_codon.finditer(sequence):
putative_orf = sequence[m.start() :]
# Add trailing Ns to make the sequence length a multiple of three:
putative_orf = putative_orf + 'N' * (3 - len(putative_orf) % 3)
protein = Seq.Seq(putative_orf).translate(to_stop=True)
Expand All @@ -79,22 +81,10 @@ def orf_indicator(orfs, maxlen):
return orf_indicator


def count_kmers(read, k):
counts = {}
num_kmers = len(read) - k + 1
for i in range(num_kmers):
kmer = read[i : i + k]
if kmer not in counts:
counts[kmer] = 0
counts[kmer] += 1
return counts


def kmer_frequency(nucleotide_sequences):
def kmer_frequency(sequence_tuple, kmer_lengths=[2, 3, 4]):
kmer_frequency = []
bases = ['A', 'T', 'C', 'G']
kmer_lengths = [2, 3, 4]
for nucleotide_seq in nucleotide_sequences:
for nucleotide_seq in sequence_tuple:
matches = [bases, bases]
sequence_kmer_frequency = []
for current_length in kmer_lengths:
Expand Down
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@
# Contact: [email protected]

from setuptools import find_packages, setup
from setuptools_rust import RustExtension

setup(
name='rnasamba',
version='0.1.6',
version='0.2.0',
packages=find_packages(),
rust_extensions=[RustExtension('rnasamba.core.kmer', debug=False)],
zip_safe=False,
license='GNU General Public License v3.0',
description='A tool for computing the coding potential of RNA transcript sequences using deep learning.',
long_description=open('README.md').read(),
Expand All @@ -45,7 +48,7 @@
'machine learning',
'neural networks',
],
author='Antonio Pedro Camargo, Vsevolod Sourkov',
author='Antonio Camargo, Vsevolod Sourkov',
author_email='[email protected]',
classifiers=[
'Development Status :: 3 - Alpha',
Expand Down
21 changes: 21 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use std::collections::HashMap;
use std::str;

/// Counts the overlapping k-mers (substrings of `k` bytes) found in `sequence`.
///
/// Returns a map from each distinct k-mer to its number of occurrences; on the
/// Python side this is a `dict` of `str` to `int`.
///
/// Edge cases:
/// * If `k` is larger than the sequence length there are no k-mers, so an empty
///   map is returned (same result as the pure-Python implementation this
///   function replaces, instead of an integer underflow).
/// * Windows are taken by byte offset. A window that would split a multi-byte
///   UTF-8 character is skipped rather than causing a panic; for plain ASCII
///   nucleotide sequences every window is counted.
#[pyfunction]
fn count_kmers(sequence: &str, k: usize) -> PyResult<HashMap<&str, usize>> {
    // Counters are `usize`: a count can never exceed the number of windows,
    // which itself fits in `usize`, so the increment below cannot overflow.
    let mut counts = HashMap::new();
    // `checked_sub` yields `None` when `k > sequence.len()`, avoiding the
    // underflow of `len - k + 1` and leaving the map empty.
    if let Some(last_start) = sequence.len().checked_sub(k) {
        for start in 0..=last_start {
            // `str::get` returns `None` (instead of panicking) when the range
            // does not fall on character boundaries.
            if let Some(kmer) = sequence.get(start..start + k) {
                *counts.entry(kmer).or_insert(0) += 1;
            }
        }
    }
    Ok(counts)
}

/// Initialiser for the `kmer` Python extension module; registers `count_kmers`.
#[pymodule]
fn kmer(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
    // Registering the wrapped function is the only setup step, so its result
    // is returned directly as the outcome of module initialisation.
    m.add_wrapped(wrap_pyfunction!(count_kmers))
}

0 comments on commit 6631c37

Please sign in to comment.