From 208fd7feb05712c774e4f955d4dca7f3b50d5033 Mon Sep 17 00:00:00 2001 From: Stefan Date: Mon, 30 Sep 2019 17:14:34 +0200 Subject: [PATCH 1/4] Update sequence.py Based on BED file columns ordering: use strand information to reverse the motif automatically, and add the possibility to take just score as a target --- kipoiseq/dataloaders/sequence.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kipoiseq/dataloaders/sequence.py b/kipoiseq/dataloaders/sequence.py index f705221..f4f337f 100644 --- a/kipoiseq/dataloaders/sequence.py +++ b/kipoiseq/dataloaders/sequence.py @@ -194,7 +194,8 @@ def __init__(self, # max_seq_len=None, # use_strand=False, force_upper=True, - ignore_targets=False): + ignore_targets=False, + target_only_score=target_only_score): self.num_chr_fasta = num_chr_fasta self.intervals_file = intervals_file @@ -226,7 +227,7 @@ def __getitem__(self, idx): force_upper=self.force_upper) interval, labels = self.bed[idx] - + if self.auto_resize_len: # automatically resize the sequence to cerat interval = resize_interval(interval, self.auto_resize_len, anchor='center') @@ -237,7 +238,13 @@ def __getitem__(self, idx): # Run the fasta extractor and transform if necessary seq = self.fasta_extractors.extract(interval) - + + if labels[2]=="-": #reverse strand + seq = seq[::-1] + + if target_only_score: # remove "strand" and "name" columns for straightforward ML + labels = labels[1] + return { "inputs": np.array(seq), "targets": labels, @@ -332,12 +339,14 @@ def __init__(self, dummy_axis=None, alphabet="ACGT", ignore_targets=False, + target_only_score=True, dtype=None): # core dataset, not using the one-hot encoding params self.seq_dl = StringSeqIntervalDl(intervals_file, fasta_file, num_chr_fasta=num_chr_fasta, label_dtype=label_dtype, auto_resize_len=auto_resize_len, # use_strand=use_strand, - ignore_targets=ignore_targets) + ignore_targets=ignore_targets, + target_only_score=target_only_score) self.input_transform = ReorderedOneHot(alphabet=alphabet, dtype=dtype, From 69f33d6d2e81b71ff94df8f36483561c246823a2 Mon Sep 17 00:00:00 2001 From: Stefan Dvoretskii Date: Wed, 2 Oct 2019 12:14:48 +0200 Subject: [PATCH 2/4] Fix discrepancies/default no target score specificity additional: remove unused imports --- kipoiseq/dataloaders/sequence.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kipoiseq/dataloaders/sequence.py b/kipoiseq/dataloaders/sequence.py index f4f337f..957c608 100644 --- a/kipoiseq/dataloaders/sequence.py +++ b/kipoiseq/dataloaders/sequence.py @@ -1,23 +1,19 @@ -from collections import OrderedDict import pandas as pd import numpy as np from copy import deepcopy from kipoi.metadata import GenomicRanges -from kipoi.specs import DataLoaderArgument, ArraySpecialType -from kipoi.plugin import is_installed from kipoi.data import Dataset, kipoi_dataloader from kipoi_conda.dependencies import Dependencies from kipoi.specs import Author from kipoi_utils.utils import default_kwargs from kipoiseq.extractors import FastaStringExtractor -from kipoiseq.transforms import SwapAxes, DummyAxis, Compose, OneHot, ReorderedOneHot +from kipoiseq.transforms import ReorderedOneHot from kipoiseq.transforms.functional import resize_interval from kipoiseq.utils import to_scalar, parse_dtype import pybedtools -from pybedtools import BedTool, Interval # general dependencies # bioconda::genomelake', TODO - add genomelake again once it gets released with pyfaidx to bioconda @@ -195,7 +191,7 @@ def __init__(self, # use_strand=False, force_upper=True, ignore_targets=False, - target_only_score=target_only_score): + target_only_score=False): self.num_chr_fasta = num_chr_fasta self.intervals_file = intervals_file @@ -210,6 +206,7 @@ def __init__(self, # bed_columns = 6 # else: # bed_columns = 3 + self.target_only_score = target_only_score self.bed = BedDataset(self.intervals_file, num_chr=self.num_chr_fasta, @@ -242,7 +239,7 @@ def __getitem__(self, idx): if labels[2]=="-": #reverse strand seq = seq[::-1] - if target_only_score: # remove "strand" and "name" columns for straightforward ML + if self.target_only_score: # remove "strand" and "name" columns for straightforward ML labels = labels[1] return { @@ -339,7 +336,7 @@ def __init__(self, dummy_axis=None, alphabet="ACGT", ignore_targets=False, - target_only_score=True, + target_only_score=False, dtype=None): # core dataset, not using the one-hot encoding params self.seq_dl = StringSeqIntervalDl(intervals_file, fasta_file, num_chr_fasta=num_chr_fasta, From 60a9bfb03484c05fb69160570572b06023b2d53a Mon Sep 17 00:00:00 2001 From: Stefan Dvoretskii Date: Wed, 2 Oct 2019 12:41:49 +0200 Subject: [PATCH 3/4] fixup! Fix discrepancies/default no target score specificity additional: remove unused imports --- kipoiseq/dataloaders/sequence.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kipoiseq/dataloaders/sequence.py b/kipoiseq/dataloaders/sequence.py index 957c608..28fc4b4 100644 --- a/kipoiseq/dataloaders/sequence.py +++ b/kipoiseq/dataloaders/sequence.py @@ -161,6 +161,8 @@ class StringSeqIntervalDl(Dataset): doc: Force uppercase output of sequences ignore_targets: doc: if True, don't return any target variables + target_only_score: + doc: if True, only 'score' column of BED file is returned in targets array output_schema: inputs: name: seq @@ -303,7 +305,8 @@ class SeqIntervalDl(Dataset): doc: 'defines the numpy dtype of the returned array. Example: int, np.int32, np.float32, float' ignore_targets: doc: if True, don't return any target variables - + target_only_score: + doc: if True, only 'score' column of BED file is returned in targets array output_schema: inputs: name: seq From 0e7cc23c78817f21d15685e3eec13457de7dc3c5 Mon Sep 17 00:00:00 2001 From: Stefan Dvoretskii Date: Wed, 2 Oct 2019 12:58:06 +0200 Subject: [PATCH 4/4] add logical constraints on labels length --- kipoiseq/dataloaders/sequence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kipoiseq/dataloaders/sequence.py b/kipoiseq/dataloaders/sequence.py index 28fc4b4..65e7a24 100644 --- a/kipoiseq/dataloaders/sequence.py +++ b/kipoiseq/dataloaders/sequence.py @@ -238,10 +238,10 @@ def __getitem__(self, idx): # Run the fasta extractor and transform if necessary seq = self.fasta_extractors.extract(interval) - if labels[2]=="-": #reverse strand + if len(labels) >= 2 and labels[2]=="-": #reverse strand seq = seq[::-1] - if self.target_only_score: # remove "strand" and "name" columns for straightforward ML + if self.target_only_score and len(labels) > 1: # remove "strand" and "name" columns for straightforward ML labels = labels[1] return {