Skip to content

Commit 6754c4b

Browse files
committed
New program CorrectScrnaReadPairs
1 parent d9b6160 commit 6754c4b

File tree

5 files changed

+278
-12
lines changed

5 files changed

+278
-12
lines changed

src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrector.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ public BarcodeCorrector(final File ALLOWED_BARCODE_COUNTS,
9292
this.BARCODE_QUALS_TAG = BARCODE_QUALS_TAG;
9393
baseRanges = BaseRange.parseBaseRange(BASE_RANGE);
9494
String baseRangeStr = StringUtil.join(",", baseRanges);
95-
log.info(String.format("Splitting BAM files based on cell barcode on read %d in range %s",
95+
log.info(String.format("Correcting cell barcode on read %d in range %s",
9696
BARCODED_READ, baseRangeStr));
9797
final MetricsFile<CountBarcodeSequences.CountBarcodeSequenceMetrics, String> metricsFile = new MetricsFile<>();
9898
metricsFile.read(IOUtil.openFileForBufferedReading(ALLOWED_BARCODE_COUNTS));
@@ -232,6 +232,13 @@ private byte[] getOtherBases(byte original) {
232232
};
233233
}
234234

235+
public void writeMetrics(final File METRICS,
236+
final MetricsFile<BarcodeCorrectionMetrics, Integer> metricsFile) {
237+
metricsFile.addMetric(this.getMetrics());
238+
metricsFile.addHistogram(this.getNumCandidatesHist());
239+
metricsFile.write(METRICS);
240+
}
241+
235242
public void setVERBOSITY(Log.LogLevel VERBOSITY) {
236243
this.VERBOSITY = VERBOSITY;
237244
}

src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairs.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@
3939
import java.util.ArrayList;
4040

4141
@CommandLineProgramProperties(
42-
summary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs.",
42+
summary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs, and split into BAMs such " +
43+
"that all the reads for a cell barcode are in the same BAM.",
4344
oneLineSummary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs.",
4445
programGroup = DropSeq.class
4546
)
@@ -103,10 +104,7 @@ protected void splitBAMs() {
103104
}
104105

105106
if (METRICS != null) {
106-
final MetricsFile<org.broadinstitute.dropseqrna.beadsynthesis.BarcodeCorrectionMetrics, Integer> metricsFile = getMetricsFile();
107-
metricsFile.addMetric(barcodeCorrector.getMetrics());
108-
metricsFile.addHistogram(barcodeCorrector.getNumCandidatesHist());
109-
metricsFile.write(METRICS);
107+
barcodeCorrector.writeMetrics(METRICS, getMetricsFile());
110108
}
111109
}
112110

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/*
2+
* MIT License
3+
*
4+
* Copyright 2025 Broad Institute
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
package org.broadinstitute.dropseqrna.beadsynthesis;
25+
26+
import htsjdk.samtools.SAMFileWriter;
27+
import htsjdk.samtools.SAMFileWriterFactory;
28+
import htsjdk.samtools.SamFiles;
29+
import htsjdk.samtools.util.*;
30+
import org.broadinstitute.barclay.argparser.Argument;
31+
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
32+
import org.broadinstitute.dropseqrna.cmdline.DropSeq;
33+
import org.broadinstitute.dropseqrna.utils.AbstractSplitBamClp;
34+
import org.broadinstitute.dropseqrna.utils.FileListParsingUtils;
35+
import org.broadinstitute.dropseqrna.utils.PairedSamRecordIterator;
36+
import org.broadinstitute.dropseqrna.utils.SamHeaderUtil;
37+
import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil;
38+
import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator;
39+
import org.broadinstitute.dropseqrna.utils.readpairs.ReadPair;
40+
import picard.cmdline.CommandLineProgram;
41+
import picard.cmdline.StandardOptionDefinitions;
42+
import picard.nio.PicardHtsPath;
43+
44+
import java.io.File;
45+
import java.io.IOException;
46+
import java.nio.file.Files;
47+
import java.nio.file.Path;
48+
import java.util.List;
49+
50+
@CommandLineProgramProperties(
51+
summary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs, in which a region of " +
52+
"one read of the pair contains the raw cell barcode. The corrected cell barcode is assigned to the " +
53+
"read in a tag. The reads are not altered beyond the addition of tags.",
54+
oneLineSummary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs.",
55+
programGroup = DropSeq.class
56+
)
57+
public class CorrectScrnaReadPairs
58+
extends CommandLineProgram {
59+
60+
protected static final Log log = Log.getInstance(CorrectScrnaReadPairs.class);
61+
protected ProgressLogger progressLogger = new ProgressLogger(log, 10000000);
62+
63+
64+
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input paired-end SAM or BAM files to " +
65+
"correct. They must all have the same sort order", minElements = 1)
66+
public List<PicardHtsPath> INPUT;
67+
68+
@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
69+
doc="Output SAM or BAM, tagged with corrected cell barcodes.")
70+
public File OUTPUT;
71+
72+
@Argument(optional=true, shortName="D", doc="Delete input BAM(s) after splitting them. Default: do not delete input BAM(s).")
73+
public boolean DELETE_INPUTS = false;
74+
75+
@Argument(optional = true, shortName = "DI", doc="Delete BAM indices corresponding to input BAMs. Default: DELETE_INPUTS setting.")
76+
public Boolean DELETE_INPUT_INDICES;
77+
78+
@Argument(doc = "Which read of each read pair contains the cell barcode [1/2].")
79+
public int BARCODED_READ = 1;
80+
81+
@Argument(doc="The region of the barcoded read containing the cell barcode, seperated by a dash. " +
82+
"E.g. 1-4. Can extract multiple ranges by separating them by a colon. " +
83+
"For example 1-4:17-22 extracts the first 4 bases, then the 17-22 bases, and glues the sequence together " +
84+
"into a single cell barcode.")
85+
public String BASE_RANGE;
86+
87+
@Argument(doc="Metrics file produced by CountBarcodeSequences that has counts for all the expected cell barcodes " +
88+
"that are found as exact matches in the input data.")
89+
public File ALLOWED_BARCODE_COUNTS;
90+
91+
@Argument(doc="Tag to store the corrected barcode on the non-barcode read.")
92+
public String BARCODE_TAG = "XC";
93+
94+
@Argument(doc="If true, assign BARCODE_TAG (also RAW_BARCODE_TAG and BARCODE_QUALS_TAG, if set) to both reads.")
95+
public boolean TAG_BOTH_READS = false;
96+
97+
@Argument(shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME, optional = true,
98+
doc="Various matching and correction metrics")
99+
public File METRICS;
100+
101+
@Argument(doc="If more than on allowed barcode matches, (best likelihood)/sum(all likelihoods) " +
102+
"must be >= this value.")
103+
public double LIKELIHOOD_RATIO = 0.95;
104+
105+
@Argument(doc="Store the original barcode sequence in this tag on the non-barcode read. Default: do not assign this tag.",
106+
optional = true)
107+
public String RAW_BARCODE_TAG;
108+
@Argument(doc="Store the barcode base qualities in this tag on the non-barcode read. Default: do not assign this tag.",
109+
optional = true)
110+
public String BARCODE_QUALS_TAG;
111+
112+
113+
@Override
114+
protected int doWork() {
115+
INPUT = FileListParsingUtils.expandPicardHtsPathList(INPUT);
116+
final List<Path> inputPaths = PicardHtsPath.toPaths(INPUT);
117+
inputPaths.stream().forEach(p -> IOUtil.assertFileIsReadable(p));
118+
IOUtil.assertFileIsWritable(OUTPUT);
119+
if (DELETE_INPUT_INDICES == null) {
120+
DELETE_INPUT_INDICES = DELETE_INPUTS;
121+
}
122+
// Check that input BAM files can be deleted
123+
if (DELETE_INPUTS) {
124+
for (final Path bamFile : inputPaths) {
125+
IOUtil.assertFileIsWritable(bamFile);
126+
}
127+
}
128+
129+
if (DELETE_INPUT_INDICES) {
130+
for (final Path bamFile : inputPaths) {
131+
final Path index = SamFiles.findIndex(bamFile);
132+
if (index != null && Files.exists(index)) {
133+
IOUtil.assertFileIsWritable(index);
134+
}
135+
}
136+
}
137+
final SamHeaderAndIterator headerAndIterator = SamFileMergeUtil.mergeInputPaths(inputPaths, true);
138+
final SAMFileWriter samFileWriter = new SAMFileWriterFactory().setCreateIndex(CREATE_INDEX).
139+
makeSAMOrBAMWriter(headerAndIterator.header, true, OUTPUT);
140+
141+
SamHeaderUtil.addPgRecord(headerAndIterator.header, this);
142+
final BarcodeCorrector barcodeCorrector = new BarcodeCorrector(
143+
ALLOWED_BARCODE_COUNTS,
144+
BARCODED_READ,
145+
BASE_RANGE,
146+
BARCODE_TAG,
147+
TAG_BOTH_READS,
148+
LIKELIHOOD_RATIO,
149+
RAW_BARCODE_TAG,
150+
BARCODE_QUALS_TAG
151+
);
152+
barcodeCorrector.setVERBOSITY(VERBOSITY);
153+
final PairedSamRecordIterator iterator = new PairedSamRecordIterator(headerAndIterator.iterator);
154+
for (ReadPair pair: new IterableAdapter<>(iterator)) {
155+
progressLogger.record(pair.getFirstRead());
156+
barcodeCorrector.correctReadPair(pair);
157+
samFileWriter.addAlignment(pair.getFirstRead());
158+
samFileWriter.addAlignment(pair.getSecondRead());
159+
}
160+
CloserUtil.close(headerAndIterator.iterator);
161+
samFileWriter.close();
162+
if (METRICS != null) {
163+
barcodeCorrector.writeMetrics(METRICS, getMetricsFile());
164+
}
165+
166+
try {
167+
if (DELETE_INPUTS) {
168+
inputPaths.stream().forEach(p -> {
169+
try {
170+
Files.delete(p);
171+
} catch (IOException e) {
172+
throw new RuntimeIOException(e);
173+
}
174+
});
175+
}
176+
if (DELETE_INPUT_INDICES) {
177+
for (final Path inputBam : inputPaths) {
178+
final Path index = SamFiles.findIndex(inputBam);
179+
if (index != null && index.toFile().exists()) {
180+
Files.delete(index);
181+
182+
}
183+
}
184+
}
185+
} catch (IOException e) {
186+
throw new RuntimeIOException(e);
187+
}
188+
189+
return 0;
190+
}
191+
/** Stock main method, for testing. */
192+
public static void main(final String[] args) {
193+
System.exit(new CorrectScrnaReadPairs().instanceMain(args));
194+
}
195+
}

src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairsTest.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@
3939

4040
public class CorrectAndSplitScrnaReadPairsTest {
4141

42-
private static final File TEST_DATA_DIR = new File("testdata/org/broadinstitute/dropseq/beadsynthesis/CorrectAndSplitScrnaReadPairsTest");
43-
private static final File INPUT_SAM = new File(TEST_DATA_DIR, "correct_barcodes_test.sam");
44-
private static final File EXPECTED_BARCODES_HIST = new File(TEST_DATA_DIR, "correct_barcodes_test.expected_barcode_metrics.gz");
45-
private static final String BARCODE_QUALS_TAG = "CY";
46-
private static final String RAW_BARCODE_TAG = "CR";
47-
private static final String BASE_RANGE = "1-16";
42+
static final File TEST_DATA_DIR = new File("testdata/org/broadinstitute/dropseq/beadsynthesis/CorrectAndSplitScrnaReadPairsTest");
43+
static final File INPUT_SAM = new File(TEST_DATA_DIR, "correct_barcodes_test.sam");
44+
static final File EXPECTED_BARCODES_HIST = new File(TEST_DATA_DIR, "correct_barcodes_test.expected_barcode_metrics.gz");
45+
static final String BARCODE_QUALS_TAG = "CY";
46+
static final String RAW_BARCODE_TAG = "CR";
47+
static final String BASE_RANGE = "1-16";
4848

4949
@Test(dataProvider = "testBasicDataProvider")
5050
public void testBasic(boolean tagBothReads) {
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* MIT License
3+
*
4+
* Copyright 2025 Broad Institute
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
package org.broadinstitute.dropseqrna.beadsynthesis;
25+
26+
import htsjdk.samtools.SAMRecord;
27+
import htsjdk.samtools.SamReader;
28+
import htsjdk.samtools.SamReaderFactory;
29+
import org.broadinstitute.dropseqrna.utils.TestUtils;
30+
import org.testng.Assert;
31+
import org.testng.annotations.DataProvider;
32+
import org.testng.annotations.Test;
33+
import picard.nio.PicardHtsPath;
34+
35+
import java.util.Collections;
36+
37+
public class CorrectScrnaReadPairsTest {
38+
@Test(dataProvider = "testBasicDataProvider")
39+
public void testBasic(boolean tagBothReads) {
40+
CorrectScrnaReadPairs clp = new CorrectScrnaReadPairs();
41+
clp.INPUT = Collections.singletonList(new PicardHtsPath(CorrectAndSplitScrnaReadPairsTest.INPUT_SAM));
42+
clp.ALLOWED_BARCODE_COUNTS = CorrectAndSplitScrnaReadPairsTest.EXPECTED_BARCODES_HIST;
43+
clp.BARCODED_READ = 1;
44+
clp.BASE_RANGE = CorrectAndSplitScrnaReadPairsTest.BASE_RANGE;
45+
clp.METRICS = TestUtils.getTempReportFile("test.", ".corrected_barcode_metrics");
46+
clp.OUTPUT = TestUtils.getTempReportFile("CorrectScrnaReadPairsTest.", ".sam");
47+
clp.TAG_BOTH_READS = tagBothReads;
48+
Assert.assertEquals(clp.doWork(), 0);
49+
final SamReader reader = SamReaderFactory.makeDefault().open(clp.OUTPUT);
50+
for (final SAMRecord rec : reader) {
51+
if (rec.getSecondOfPairFlag() || tagBothReads) {
52+
final String cellBarcode = rec.getStringAttribute(clp.BARCODE_TAG);
53+
final String expectedCellBarcode = rec.getReadName().split(":")[1];
54+
Assert.assertEquals(cellBarcode, expectedCellBarcode, rec.getSAMString());
55+
}
56+
}
57+
}
58+
59+
@DataProvider(name = "testBasicDataProvider")
60+
public Object[][] testBasicDataProvider() {
61+
return new Object[][]{
62+
{false},
63+
{true}
64+
};
65+
}
66+
}

0 commit comments

Comments
 (0)