Skip to content

Commit a9ac0b0

Browse files
committed
Encapsulate barcode correction so it can be shared between programs
1 parent 6726a82 commit a9ac0b0

File tree

5 files changed

+315
-188
lines changed

5 files changed

+315
-188
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* MIT License
3+
*
4+
* Copyright 2025 Broad Institute
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
package org.broadinstitute.dropseqrna.beadsynthesis;
25+
26+
import htsjdk.samtools.metrics.MetricBase;
27+
28+
public class BarcodeCorrectionMetrics extends MetricBase {
29+
public long NUM_READS_EXACT_MATCH;
30+
public long NUM_READS_CORRECTED_SINGLE_ED1;
31+
public long NUM_READS_CORRECTED_MULTI_ED1;
32+
public long NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES;
33+
public long NUM_READS_UNCORRECTED_AMBIGUOUS;
34+
35+
public void merge(final BarcodeCorrectionMetrics that) {
36+
this.NUM_READS_EXACT_MATCH += that.NUM_READS_EXACT_MATCH;
37+
this.NUM_READS_CORRECTED_SINGLE_ED1 += that.NUM_READS_CORRECTED_SINGLE_ED1;
38+
this.NUM_READS_CORRECTED_MULTI_ED1 += that.NUM_READS_CORRECTED_MULTI_ED1;
39+
this.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES += that.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES;
40+
this.NUM_READS_UNCORRECTED_AMBIGUOUS += that.NUM_READS_UNCORRECTED_AMBIGUOUS;
41+
}
42+
}
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
/*
2+
* MIT License
3+
*
4+
* Copyright 2025 Broad Institute
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
package org.broadinstitute.dropseqrna.beadsynthesis;
25+
26+
import htsjdk.samtools.SAMRecord;
27+
import htsjdk.samtools.metrics.MetricsFile;
28+
import htsjdk.samtools.util.*;
29+
import org.broadinstitute.dropseqrna.utils.BaseRange;
30+
import org.broadinstitute.dropseqrna.utils.StringInterner;
31+
import org.broadinstitute.dropseqrna.utils.readpairs.ReadPair;
32+
33+
import java.io.File;
34+
import java.util.ArrayList;
35+
import java.util.HashMap;
36+
import java.util.List;
37+
import java.util.Map;
38+
39+
/**
40+
* Encapsulation of barcode correction part of CorrectAndSplitScrnaReadPairs, so it can be shared with other CLPs.
41+
*/
42+
public class BarcodeCorrector {
43+
private static final Log log = Log.getInstance(BarcodeCorrector.class);
44+
45+
// Normalized counts of exact-match barcodes loaded from ALLOWED_BARCODE_COUNTS file
46+
final private Map<String, Double> allowedBarcodeNormalizedOccurences;
47+
// Don't store many copies of the same allowed barcode in ed1MatchCache
48+
private final StringInterner allowedBarcodeInterner = new StringInterner();
49+
50+
private final ResourceLimitedMap<String, List<String>> ed1MatchCache =
51+
new ResourceLimitedMap<>(
52+
1_000_000,
53+
new ResourceLimitedMapFunctor<>() {
54+
@Override
55+
public List<String> makeValue(final String key) {
56+
return getEd1Matches(key);
57+
}
58+
59+
@Override
60+
public void finalizeValue(final String key, final List<String> strings) {
61+
}
62+
}
63+
);
64+
65+
private final List<BaseRange> baseRanges;
66+
private final int BARCODED_READ;
67+
private final String BARCODE_TAG;
68+
private final boolean TAG_BOTH_READS;
69+
private final double LIKELIHOOD_RATIO;
70+
private final String RAW_BARCODE_TAG;
71+
private final String BARCODE_QUALS_TAG;
72+
private Log.LogLevel VERBOSITY = Log.LogLevel.INFO;
73+
private final BarcodeCorrectionMetrics metrics = new BarcodeCorrectionMetrics();
74+
private final Histogram<Integer> numCandidatesHist = new Histogram<>("NUM_ED1_CANDIDATES", "NUM_READS");
75+
76+
/**
77+
* See CorrectAndSplitScRnaReadPairs CLP documentation for parameter documentation.
78+
*/
79+
public BarcodeCorrector(final File ALLOWED_BARCODE_COUNTS,
80+
final int BARCODED_READ,
81+
final String BASE_RANGE,
82+
final String BARCODE_TAG,
83+
final boolean TAG_BOTH_READS,
84+
final double LIKELIHOOD_RATIO,
85+
final String RAW_BARCODE_TAG,
86+
final String BARCODE_QUALS_TAG) {
87+
this.BARCODED_READ = BARCODED_READ;
88+
this.BARCODE_TAG = BARCODE_TAG;
89+
this.TAG_BOTH_READS = TAG_BOTH_READS;
90+
this.LIKELIHOOD_RATIO = LIKELIHOOD_RATIO;
91+
this.RAW_BARCODE_TAG = RAW_BARCODE_TAG;
92+
this.BARCODE_QUALS_TAG = BARCODE_QUALS_TAG;
93+
baseRanges = BaseRange.parseBaseRange(BASE_RANGE);
94+
String baseRangeStr = StringUtil.join(",", baseRanges);
95+
log.info(String.format("Splitting BAM files based on cell barcode on read %d in range %s",
96+
BARCODED_READ, baseRangeStr));
97+
final MetricsFile<CountBarcodeSequences.CountBarcodeSequenceMetrics, String> metricsFile = new MetricsFile<>();
98+
metricsFile.read(IOUtil.openFileForBufferedReading(ALLOWED_BARCODE_COUNTS));
99+
Histogram<String> allowedBarcodeHistogram = metricsFile.getHistogram();
100+
double allowedBarcodeOccurenceCount = allowedBarcodeHistogram.getSumOfValues();
101+
allowedBarcodeNormalizedOccurences = new HashMap<>(allowedBarcodeHistogram.size());
102+
for (final String allowedBarcode: allowedBarcodeHistogram.keySet()) {
103+
allowedBarcodeNormalizedOccurences.put(allowedBarcodeInterner.intern(allowedBarcode), allowedBarcodeHistogram.get(allowedBarcode).getValue()/allowedBarcodeOccurenceCount);
104+
}
105+
}
106+
107+
/**
108+
* Tag the read pair according to ctor parameters, and accumulate metrics.
109+
* @return final CBC, corrected if possible.
110+
*/
111+
public String correctReadPair(final ReadPair pair) {
112+
final SAMRecord readWithBarcode = BARCODED_READ == 1? pair.getFirstRead(): pair.getSecondRead();
113+
final SAMRecord otherRead = BARCODED_READ == 2? pair.getFirstRead(): pair.getSecondRead();
114+
if (RAW_BARCODE_TAG != null) {
115+
final String rawBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString());
116+
otherRead.setAttribute(RAW_BARCODE_TAG, rawBarcode);
117+
if (TAG_BOTH_READS) {
118+
readWithBarcode.setAttribute(RAW_BARCODE_TAG, rawBarcode);
119+
}
120+
}
121+
if (BARCODE_QUALS_TAG != null) {
122+
final String barcodeQuals = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getBaseQualityString());
123+
otherRead.setAttribute(BARCODE_QUALS_TAG, barcodeQuals);
124+
if (TAG_BOTH_READS) {
125+
readWithBarcode.setAttribute(BARCODE_QUALS_TAG, barcodeQuals);
126+
}
127+
}
128+
final String cellBarcode = getCorrectedCellBarcode(readWithBarcode);
129+
otherRead.setAttribute(BARCODE_TAG, cellBarcode);
130+
if (TAG_BOTH_READS) {
131+
readWithBarcode.setAttribute(BARCODE_TAG, cellBarcode);
132+
}
133+
return cellBarcode;
134+
}
135+
136+
private List<String> getEd1Matches(final String cellBarcode) {
137+
final List<String> ret = new ArrayList<>();
138+
final byte[] cellBarcodeBytes = StringUtil.stringToBytes(cellBarcode);
139+
for (int i = 0; i < cellBarcode.length(); ++i) {
140+
final byte original = cellBarcodeBytes[i];
141+
for (byte b: getOtherBases(original)) {
142+
cellBarcodeBytes[i] = b;
143+
final String candidate = StringUtil.bytesToString(cellBarcodeBytes);
144+
if (allowedBarcodeNormalizedOccurences.containsKey(candidate)) {
145+
ret.add(allowedBarcodeInterner.intern(candidate));
146+
}
147+
}
148+
cellBarcodeBytes[i] = original;
149+
}
150+
return ret;
151+
}
152+
153+
private String getCorrectedCellBarcode(final SAMRecord readWithBarcode) {
154+
final String cellBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString());
155+
if (allowedBarcodeNormalizedOccurences.containsKey(cellBarcode)) {
156+
// exact match -- no correction needed.
157+
if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_EXACT_MATCH == 0) {
158+
log.debug("EXACT_MATCH " + readWithBarcode);
159+
}
160+
++metrics.NUM_READS_EXACT_MATCH;
161+
return cellBarcode;
162+
} else {
163+
List<String> ed1Matches = ed1MatchCache.get(cellBarcode);
164+
if (ed1Matches.isEmpty()) {
165+
if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES == 0) {
166+
log.debug("UNCORRECTABLE_NO_ED1 " + readWithBarcode);
167+
}
168+
++metrics.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES;
169+
return cellBarcode;
170+
} else if (ed1Matches.size() == 1) {
171+
if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_CORRECTED_SINGLE_ED1 == 0) {
172+
log.debug("CORRECTED_SINGLE_ED1 " + readWithBarcode);
173+
}
174+
++metrics.NUM_READS_CORRECTED_SINGLE_ED1;
175+
numCandidatesHist.increment(1);
176+
return ed1Matches.getFirst();
177+
} else {
178+
String bestBarcode = null;
179+
double bestBarcodeLikelihood = 0;
180+
double sumLikelihoods = 0;
181+
final byte[] baseQualities = BaseRange.getBytesForBaseRange(baseRanges, readWithBarcode.getBaseQualities());
182+
for (final String candidateBarcode : ed1Matches) {
183+
double thisLikelihood = computeCandidateBarcodeLikelihood(cellBarcode, candidateBarcode, baseQualities);
184+
sumLikelihoods += thisLikelihood;
185+
if (thisLikelihood > bestBarcodeLikelihood) {
186+
bestBarcodeLikelihood = thisLikelihood;
187+
bestBarcode = candidateBarcode;
188+
}
189+
}
190+
if (bestBarcodeLikelihood / sumLikelihoods >= LIKELIHOOD_RATIO) {
191+
if (VERBOSITY == Log.LogLevel.DEBUG &&
192+
(metrics.NUM_READS_CORRECTED_MULTI_ED1 == 0 || ed1Matches.size() >= 4)) {
193+
log.debug("CORRECTED_MULTI_ED1 " + readWithBarcode);
194+
}
195+
++metrics.NUM_READS_CORRECTED_MULTI_ED1;
196+
numCandidatesHist.increment(ed1Matches.size());
197+
return bestBarcode;
198+
} else {
199+
if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_UNCORRECTED_AMBIGUOUS == 0) {
200+
log.debug("UNCORRECTED_AMBIGUOUS " + readWithBarcode);
201+
}
202+
++metrics.NUM_READS_UNCORRECTED_AMBIGUOUS;
203+
return cellBarcode;
204+
}
205+
}
206+
}
207+
}
208+
209+
private double computeCandidateBarcodeLikelihood(final String uncorrectedBarcode, final String candidateBarcode, final byte[] baseQualities) {
210+
int i;
211+
for (i = 0; i < uncorrectedBarcode.length(); ++i) {
212+
if (uncorrectedBarcode.charAt(i) != candidateBarcode.charAt(i) ) {
213+
break;
214+
}
215+
}
216+
return QualityUtil.getErrorProbabilityFromPhredScore(baseQualities[i]) * allowedBarcodeNormalizedOccurences.get(candidateBarcode);
217+
}
218+
219+
private final byte[] gct = {SequenceUtil.G, SequenceUtil.C, SequenceUtil.T};
220+
private final byte[] act = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.T};
221+
private final byte[] agt = {SequenceUtil.A, SequenceUtil.G, SequenceUtil.T};
222+
private final byte[] acg = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.G};
223+
private final byte[] acgt = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.G, SequenceUtil.T};
224+
private byte[] getOtherBases(byte original) {
225+
return switch (original) {
226+
case SequenceUtil.A, SequenceUtil.a -> gct;
227+
case SequenceUtil.C, SequenceUtil.c -> agt;
228+
case SequenceUtil.G, SequenceUtil.g -> act;
229+
case SequenceUtil.T, SequenceUtil.t -> acg;
230+
case SequenceUtil.N, SequenceUtil.n -> acgt;
231+
default -> throw new IllegalArgumentException(String.format("Unexpected base %d", original));
232+
};
233+
}
234+
235+
public void setVERBOSITY(Log.LogLevel VERBOSITY) {
236+
this.VERBOSITY = VERBOSITY;
237+
}
238+
239+
/**
240+
* @return Accumulated metrics of barcodes processed by this object.
241+
*/
242+
public BarcodeCorrectionMetrics getMetrics() {
243+
return metrics;
244+
}
245+
246+
/**
247+
* @return histogram with bucket key: number of ED1 matches for a non-exact match barcode;
248+
* bucket value: number of reads with that number of ED1 matches.
249+
*/
250+
public Histogram<Integer> getNumCandidatesHist() {
251+
return numCandidatesHist;
252+
}
253+
}

0 commit comments

Comments
 (0)