-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimised NonReferenceContigAssembler.graphByKmerNode
High coverage RP kmers had different quals in each position, meaning an O(n^2) array traversal. Replaced using a SortedSet to reduce the cost to O(n log n). Optimisation for n=1 to avoid creating a SortedSet at all (directly storing the KmerNode in the top-level Long2ObjectMap lookup).
- Loading branch information
Daniel Cameron
committed
Mar 17, 2021
1 parent
3010c03
commit a34993a
Showing
4 changed files
with
254 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
121 changes: 121 additions & 0 deletions
121
...java/au/edu/wehi/idsv/debruijn/positional/optimiseddatastructures/KmerIntervalLookup.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
package au.edu.wehi.idsv.debruijn.positional.optimiseddatastructures; | ||
|
||
import au.edu.wehi.idsv.debruijn.KmerEncodingHelper; | ||
import au.edu.wehi.idsv.debruijn.positional.KmerNode; | ||
import au.edu.wehi.idsv.util.IntervalUtil; | ||
import com.google.common.collect.ImmutableList; | ||
import com.google.common.collect.Iterables; | ||
import it.unimi.dsi.fastutil.ints.Int2ObjectRBTreeMap; | ||
import it.unimi.dsi.fastutil.ints.Int2ObjectSortedMap; | ||
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
/** | ||
* Kmer interval lookup in which no records overlap | ||
*/ | ||
public abstract class KmerIntervalLookup<T> { | ||
/** | ||
* Lookup of node starts. Secondary key is end() | ||
* Values are the KmerNode itself (when there is only 1) or a Int2ObjectSortedMap | ||
*/ | ||
private final Long2ObjectOpenHashMap<Object> kmerLookup = new Long2ObjectOpenHashMap<>(); | ||
private int size = 0; | ||
protected abstract int getStart(T node); | ||
protected abstract int getEnd(T node); | ||
protected abstract long getKmer(T node); | ||
public void add(T node) { | ||
long kmer = getKmer(node); | ||
Object x = kmerLookup.get(kmer); | ||
Int2ObjectSortedMap<T> positionLookup; | ||
if (x == null) { | ||
kmerLookup.put(kmer, node); | ||
return; | ||
} else if (x instanceof Int2ObjectSortedMap) { | ||
positionLookup = (Int2ObjectSortedMap<T>) x; | ||
} else { | ||
T existing = (T)x; | ||
positionLookup = new Int2ObjectRBTreeMap<>(); | ||
kmerLookup.put(kmer, positionLookup); | ||
positionLookup.put(getEnd(existing), existing); | ||
} | ||
positionLookup.put(getEnd(node), node); | ||
} | ||
public void remove(T node) { | ||
long kmer = getKmer(node); | ||
Object x = kmerLookup.get(kmer); | ||
if (x instanceof Int2ObjectSortedMap) { | ||
Int2ObjectSortedMap<T> positionLookup = (Int2ObjectSortedMap<T>) x; | ||
T found = positionLookup.remove(getEnd(node)); | ||
assert (found != null); | ||
if (positionLookup.isEmpty()) { | ||
kmerLookup.remove(kmer); | ||
} | ||
} else { | ||
Object found = kmerLookup.remove(kmer); | ||
assert(found != null); | ||
} | ||
} | ||
/** | ||
* Gets the KmerNode that overlaps exactly | ||
* @param kmer | ||
* @param start | ||
* @param end | ||
* @return | ||
*/ | ||
public T get(long kmer, int start, int end) { | ||
Object x = kmerLookup.get(kmer); | ||
T node = null; | ||
if (x instanceof Int2ObjectSortedMap) { | ||
Int2ObjectSortedMap<T> positionLookup = (Int2ObjectSortedMap<T>) x; | ||
node = positionLookup.get(end); | ||
} else { | ||
node = (T)x; | ||
} | ||
if (node != null && (getStart(node) != start || getEnd(node) != end)) { | ||
// doesn't overlap exactly | ||
return null; | ||
} | ||
return node; | ||
} | ||
public List<T> getOverlapping(long kmer, int start, int end) { | ||
Object x = kmerLookup.get(kmer); | ||
T node = null; | ||
if (x == null) { | ||
return Collections.EMPTY_LIST; | ||
} if (x instanceof Int2ObjectSortedMap) { | ||
Int2ObjectSortedMap<T> positionLookup = (Int2ObjectSortedMap<T>) x; | ||
positionLookup = positionLookup.tailMap(start); | ||
ArrayList result = new ArrayList(); | ||
Iterator<T> it = positionLookup.values().stream().iterator(); | ||
while (it.hasNext()) { | ||
node = it.next(); | ||
if (IntervalUtil.overlapsClosed(start, end, getStart(node), getEnd(node))) { | ||
result.add(node); | ||
} else { | ||
break; | ||
} | ||
} | ||
return result; | ||
} else { | ||
node = (T)x; | ||
if (IntervalUtil.overlapsClosed(start, end, getStart(node), getEnd(node))) { | ||
return ImmutableList.of(node); | ||
} | ||
return Collections.EMPTY_LIST; | ||
} | ||
} | ||
public Stream<T> stream() { | ||
return kmerLookup.values().stream().flatMap(o -> stream(o)); | ||
} | ||
private Stream<T> stream(Object x) { | ||
if (x == null) return Stream.empty(); | ||
if (x instanceof Int2ObjectSortedMap) return ((Int2ObjectSortedMap<T>)x).values().stream(); | ||
return Stream.of((T)x); | ||
} | ||
} |
18 changes: 18 additions & 0 deletions
18
...hi/idsv/debruijn/positional/optimiseddatastructures/KmerNodeByLastKmerIntervalLookup.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package au.edu.wehi.idsv.debruijn.positional.optimiseddatastructures; | ||
|
||
import au.edu.wehi.idsv.debruijn.positional.KmerNode; | ||
|
||
public class KmerNodeByLastKmerIntervalLookup<T extends KmerNode> extends KmerIntervalLookup<T> { | ||
@Override | ||
protected int getStart(T node) { return node.lastStart(); } | ||
|
||
@Override | ||
protected int getEnd(T node) { | ||
return node.lastEnd(); | ||
} | ||
|
||
@Override | ||
protected long getKmer(T node) { | ||
return node.lastKmer(); | ||
} | ||
} |
106 changes: 106 additions & 0 deletions
106
...sv/debruijn/positional/optimiseddatastructures/KmerNodeByFirstKmerIntervalLookupTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package au.edu.wehi.idsv.debruijn.positional.optimiseddatastructures; | ||
|
||
import au.edu.wehi.idsv.TestHelper; | ||
import au.edu.wehi.idsv.debruijn.KmerEncodingHelper; | ||
import au.edu.wehi.idsv.debruijn.positional.ImmutableKmerNode; | ||
import au.edu.wehi.idsv.debruijn.positional.KmerNode; | ||
import au.edu.wehi.idsv.util.IntervalUtil; | ||
import com.google.common.collect.ImmutableList; | ||
import com.google.common.collect.Lists; | ||
import org.junit.Assert; | ||
import org.junit.Test; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
|
||
public class KmerNodeByFirstKmerIntervalLookupTest extends TestHelper { | ||
private static final int k = 4; | ||
private KmerNode kn(String kmer, int start, int end) { | ||
return kn(KmerEncodingHelper.picardBaseToEncoded(k, kmer.getBytes()), start, end); | ||
} | ||
private KmerNode kn(long kmer, int start, int end) { | ||
return new ImmutableKmerNode(kmer, start, end, false, 1); | ||
} | ||
@Test | ||
public void get_should_match_overlap_logic() { | ||
for (List<KmerNode> kns : ImmutableList.of( | ||
ImmutableList.of( | ||
kn(0, 1, 2), | ||
kn(0, 4, 5), | ||
kn(0, 6, 6), | ||
kn(0, 7, 7), | ||
kn(0, 9, 9)), | ||
ImmutableList.of( | ||
kn(0, 1, 3), | ||
kn(0, 7, 7)), | ||
ImmutableList.of( | ||
kn(0, 1, 2)), | ||
ImmutableList.of( | ||
kn(0, 1, 2), | ||
kn(1, 1, 2)) | ||
)) { | ||
KmerNodeByLastKmerIntervalLookup<KmerNode> lookup = new KmerNodeByLastKmerIntervalLookup<>(); | ||
kns.stream().forEach(n -> lookup.add(n)); | ||
validate_against_direct_comparison(lookup, kns); | ||
} | ||
} | ||
|
||
private void validate_against_direct_comparison(KmerNodeByLastKmerIntervalLookup<KmerNode> lookup, List<KmerNode> kns) { | ||
for (long kmer : new long[] { 0, 1, 2}) { | ||
for (int i = -1; i < 11; i++) { | ||
for (int j = i; j < 12; j++) { | ||
int start = i; | ||
int end = j; | ||
List<KmerNode> expected = kns.stream() | ||
.filter(n -> n.firstKmer() == kmer && IntervalUtil.overlapsClosed(start, end, n.firstStart(), n.firstEnd())) | ||
.collect(Collectors.toList()); | ||
Assert.assertEquals(expected, lookup.getOverlapping(kmer, i, j)); | ||
KmerNode exactMatch = kns.stream() | ||
.filter(n -> n.firstKmer() == kmer && n.firstStart() == start && n.firstEnd() == end) | ||
.findFirst().orElse(null); | ||
Assert.assertEquals(exactMatch, lookup.get(kmer, i, j)); | ||
} | ||
} | ||
} | ||
} | ||
|
||
@Test | ||
public void remove_should_match_overlap_logic() { | ||
for (List<KmerNode> full : ImmutableList.of( | ||
ImmutableList.of( | ||
kn(0, 1, 2), | ||
kn(0, 4, 5), | ||
kn(0, 6, 6), | ||
kn(0, 7, 7), | ||
kn(0, 9, 9)), | ||
ImmutableList.of( | ||
kn(0, 1, 3), | ||
kn(0, 7, 7)), | ||
ImmutableList.of( | ||
kn(0, 1, 2)), | ||
ImmutableList.of( | ||
kn(0, 1, 2), | ||
kn(1, 1, 2)) | ||
)) { | ||
for (int offset = 0; offset < full.size(); offset++) { | ||
ArrayList<KmerNode> kns = Lists.newArrayList(full); | ||
KmerNodeByLastKmerIntervalLookup<KmerNode> lookup = new KmerNodeByLastKmerIntervalLookup<>(); | ||
full.stream().forEach(n -> lookup.add(n)); | ||
// remove each element | ||
KmerNode removed = kns.remove(offset); | ||
lookup.remove(removed); | ||
validate_against_direct_comparison(lookup, kns); | ||
for (int offset2 = 0; offset2 < kns.size(); offset2++) { | ||
KmerNodeByLastKmerIntervalLookup lookup2 = new KmerNodeByLastKmerIntervalLookup<>(); | ||
full.stream().forEach(n -> lookup2.add(n)); | ||
ArrayList<KmerNode> kns2 = Lists.newArrayList(kns); | ||
KmerNode removed2 = kns2.remove(offset2); | ||
lookup2.remove(removed); | ||
lookup2.remove(removed2); | ||
validate_against_direct_comparison(lookup2, kns2); | ||
} | ||
} | ||
} | ||
} | ||
} |