Merge pull request #4 from apcamargo/rust-kmers

Version 0.2.0
apcamargo · Oct 21, 2019 · 6631c37 · 6631c37
2 parents 7769985 + bfc9850
commit 6631c37
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 24 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "rnasamba"
+version = "0.2.0"
+authors = ["Antonio Camargo <[email protected]>"]
+edition = "2018"
+
+[lib]
+name = "rnasamba"
+crate-type = ["cdylib"]
+
+[dependencies.pyo3]
+version = "0.8.1"
+features = ["extension-module"]
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1,3 @@
-include LICENSE
+include LICENSE
+include Cargo.toml
+recursive-include src *
diff --git a/docs/usage.md b/docs/usage.md
@@ -43,6 +43,7 @@ positional arguments:
 
 optional arguments:
   -h, --help            show this help message and exit
+  --version             show program's version number and exit
   -s EARLY_STOPPING, --early_stopping EARLY_STOPPING
                         number of epochs after lowest validation loss before
                         stopping training (a fraction of 0.1 of the training
@@ -78,6 +79,7 @@ positional arguments:
 
 optional arguments:
   -h, --help            show this help message and exit
+  --version             show program's version number and exit
   -p PROTEIN_FASTA, --protein_fasta PROTEIN_FASTA
                         output FASTA file containing translated sequences for
                         the predicted coding ORFs. (default: None)

diff --git a/rnasamba/__init__.py b/rnasamba/__init__.py
@@ -18,4 +18,4 @@
 #
 #   Contact: [email protected]
 
-from rnasamba.core.model import RNAsambaClassificationModel, RNAsambaTrainModel
+from rnasamba.core.model import RNAsambaClassificationModel, RNAsambaTrainModel
diff --git a/rnasamba/cli.py b/rnasamba/cli.py
@@ -48,6 +48,7 @@ def train(args):
 
 
 def classify_cli(parser):
+    parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
     parser.set_defaults(func=classify)
     parser.add_argument(
         'output_file',
@@ -78,6 +79,7 @@ def classify_cli(parser):
 
 def train_cli(parser):
     parser.set_defaults(func=train)
+    parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
     parser.add_argument(
         'output_file',
         help='output HDF5 file containing weights of the newly trained RNAsamba network.',
@@ -126,6 +128,7 @@ def cli():
         description='Coding potential calculation using deep learning.',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
+    parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
     subparsers = parser.add_subparsers()
     classify_parser = subparsers.add_parser(
         'classify',

diff --git a/rnasamba/core/sequences.py b/rnasamba/core/sequences.py
@@ -27,6 +27,8 @@
 from keras.preprocessing.sequence import pad_sequences
 from keras.utils import to_categorical
 
+from rnasamba.core.kmer import count_kmers
+
 
 def read_fasta(filename, tokenize=False):
     seqs = []
@@ -44,20 +46,20 @@ def read_fasta(filename, tokenize=False):
     return seqs
 
 
-def tokenize_dna(seq):
+def tokenize_dna(sequence):
     lookup = dict(zip('NATCG', range(5)))
-    if not seq:
+    if not sequence:
         token = [0]
     else:
-        token = [lookup[c] for c in seq if c in lookup]
+        token = [lookup[c] for c in sequence if c in lookup]
     return token
 
 
-def longest_orf(input_seq):
+def longest_orf(sequence):
     start_codon = re.compile('ATG')
     longest = (0, 0, '')
-    for m in start_codon.finditer(input_seq):
-        putative_orf = input_seq[m.start() :]
+    for m in start_codon.finditer(sequence):
+        putative_orf = sequence[m.start() :]
         # Add trailing Ns to make the sequence length a multiple of three:
         putative_orf = putative_orf + 'N' * (3 - len(putative_orf) % 3)
         protein = Seq.Seq(putative_orf).translate(to_stop=True)
@@ -79,22 +81,10 @@ def orf_indicator(orfs, maxlen):
     return orf_indicator
 
 
-def count_kmers(read, k):
-    counts = {}
-    num_kmers = len(read) - k + 1
-    for i in range(num_kmers):
-        kmer = read[i : i + k]
-        if kmer not in counts:
-            counts[kmer] = 0
-        counts[kmer] += 1
-    return counts
-
-
-def kmer_frequency(nucleotide_sequences):
+def kmer_frequency(sequence_tuple, kmer_lengths=[2, 3, 4]):
     kmer_frequency = []
     bases = ['A', 'T', 'C', 'G']
-    kmer_lengths = [2, 3, 4]
-    for nucleotide_seq in nucleotide_sequences:
+    for nucleotide_seq in sequence_tuple:
         matches = [bases, bases]
         sequence_kmer_frequency = []
         for current_length in kmer_lengths:

diff --git a/setup.py b/setup.py
@@ -20,11 +20,14 @@
 #   Contact: [email protected]
 
 from setuptools import find_packages, setup
+from setuptools_rust import RustExtension
 
 setup(
     name='rnasamba',
-    version='0.1.6',
+    version='0.2.0',
     packages=find_packages(),
+    rust_extensions=[RustExtension('rnasamba.core.kmer', debug=False)],
+    zip_safe=False,
     license='GNU General Public License v3.0',
     description='A tool for computing the coding potential of RNA transcript sequences using deep learning.',
     long_description=open('README.md').read(),
@@ -45,7 +48,7 @@
         'machine learning',
         'neural networks',
     ],
-    author='Antonio Pedro Camargo, Vsevolod Sourkov',
+    author='Antonio Camargo, Vsevolod Sourkov',
     author_email='[email protected]',
     classifiers=[
         'Development Status :: 3 - Alpha',

diff --git a/src/lib.rs b/src/lib.rs
@@ -0,0 +1,21 @@
+use pyo3::prelude::*;
+use pyo3::wrap_pyfunction;
+use std::collections::HashMap;
+use std::str;
+
+/// Counts the overlapping k-mers (substrings of `k` bytes) found in `sequence`.
+///
+/// Returns a map from each distinct k-mer to its number of occurrences. When
+/// `k` is larger than the sequence length the map is empty, which matches the
+/// pure-Python implementation this function replaces.
+///
+/// # Errors
+///
+/// Returns a Python `ValueError` if a k-mer window would start or end inside a
+/// multi-byte UTF-8 character, which can only happen when `sequence` contains
+/// non-ASCII characters.
+#[pyfunction]
+fn count_kmers(sequence: &str, k: usize) -> PyResult<HashMap<&str, usize>> {
+    // Counts are `usize`: a k-mer cannot occur more often than there are
+    // window positions, so the counter can never overflow.
+    let mut counts = HashMap::new();
+    // `sequence.len() - k + 1` underflows `usize` when `k` is larger than the
+    // sequence; checked subtraction turns that case into "no k-mers".
+    let last_start = match sequence.len().checked_sub(k) {
+        Some(last_start) => last_start,
+        None => return Ok(counts),
+    };
+    for start in 0..=last_start {
+        // `str::get` yields `None` instead of panicking when the byte range
+        // does not fall on character boundaries.
+        let kmer = sequence.get(start..start + k).ok_or_else(|| {
+            pyo3::PyErr::new::<pyo3::exceptions::ValueError, _>(
+                "k-mer boundary falls inside a multi-byte character; sequence must be ASCII",
+            )
+        })?;
+        *counts.entry(kmer).or_insert(0) += 1;
+    }
+    Ok(counts)
+}
+
+// Initializer for the `kmer` extension module: registers `count_kmers` on it.
+#[pymodule]
+fn kmer(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
+    // `add_wrapped` already produces the `PyResult<()>` this initializer returns,
+    // so its result is handed back directly.
+    m.add_wrapped(wrap_pyfunction!(count_kmers))
+}