diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrectionMetrics.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrectionMetrics.java new file mode 100644 index 00000000..4d59371e --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrectionMetrics.java @@ -0,0 +1,42 @@ +/* + * MIT License + * + * Copyright 2025 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.beadsynthesis; + +import htsjdk.samtools.metrics.MetricBase; + +public class BarcodeCorrectionMetrics extends MetricBase { + public long NUM_READS_EXACT_MATCH; + public long NUM_READS_CORRECTED_SINGLE_ED1; + public long NUM_READS_CORRECTED_MULTI_ED1; + public long NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES; + public long NUM_READS_UNCORRECTED_AMBIGUOUS; + + public void merge(final BarcodeCorrectionMetrics that) { + this.NUM_READS_EXACT_MATCH += that.NUM_READS_EXACT_MATCH; + this.NUM_READS_CORRECTED_SINGLE_ED1 += that.NUM_READS_CORRECTED_SINGLE_ED1; + this.NUM_READS_CORRECTED_MULTI_ED1 += that.NUM_READS_CORRECTED_MULTI_ED1; + this.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES += that.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES; + this.NUM_READS_UNCORRECTED_AMBIGUOUS += that.NUM_READS_UNCORRECTED_AMBIGUOUS; + } +} diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrector.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrector.java new file mode 100644 index 00000000..d4099bc1 --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BarcodeCorrector.java @@ -0,0 +1,253 @@ +/* + * MIT License + * + * Copyright 2025 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.beadsynthesis; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.*; +import org.broadinstitute.dropseqrna.utils.BaseRange; +import org.broadinstitute.dropseqrna.utils.StringInterner; +import org.broadinstitute.dropseqrna.utils.readpairs.ReadPair; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Encapsulation of barcode correction part of CorrectAndSplitScrnaReadPairs, so it can be shared with other CLPs. + */ +public class BarcodeCorrector { + private static final Log log = Log.getInstance(BarcodeCorrector.class); + + // Normalized counts of exact-match barcodes loaded from ALLOWED_BARCODE_COUNTS file + final private Map allowedBarcodeNormalizedOccurences; + // Don't store many copies of the same allowed barcode in ed1MatchCache + private final StringInterner allowedBarcodeInterner = new StringInterner(); + + private final ResourceLimitedMap> ed1MatchCache = + new ResourceLimitedMap<>( + 1_000_000, + new ResourceLimitedMapFunctor<>() { + @Override + public List makeValue(final String key) { + return getEd1Matches(key); + } + + @Override + public void finalizeValue(final String key, final List strings) { + } + } + ); + + private final List baseRanges; + private final int BARCODED_READ; + private final String BARCODE_TAG; + private final boolean TAG_BOTH_READS; + private final double LIKELIHOOD_RATIO; + private final String RAW_BARCODE_TAG; + private final String BARCODE_QUALS_TAG; + private Log.LogLevel VERBOSITY = Log.LogLevel.INFO; + private final BarcodeCorrectionMetrics metrics = new BarcodeCorrectionMetrics(); + private final Histogram numCandidatesHist = new Histogram<>("NUM_ED1_CANDIDATES", "NUM_READS"); + + /** + * See CorrectAndSplitScRnaReadPairs CLP documentation for parameter documentation. + */ + public BarcodeCorrector(final CorrectScrnaReadPairsArgumentCollection args) { + this.BARCODED_READ = args.BARCODED_READ; + this.BARCODE_TAG = args.BARCODE_TAG; + this.TAG_BOTH_READS = args.TAG_BOTH_READS; + this.LIKELIHOOD_RATIO = args.LIKELIHOOD_RATIO; + this.RAW_BARCODE_TAG = args.RAW_BARCODE_TAG; + this.BARCODE_QUALS_TAG = args.BARCODE_QUALS_TAG; + baseRanges = BaseRange.parseBaseRange(args.BASE_RANGE); + String baseRangeStr = StringUtil.join(",", baseRanges); + log.info(String.format("Correcting cell barcode on read %d in range %s", + BARCODED_READ, baseRangeStr)); + final MetricsFile metricsFile = new MetricsFile<>(); + metricsFile.read(IOUtil.openFileForBufferedReading(args.ALLOWED_BARCODE_COUNTS)); + Histogram allowedBarcodeHistogram = metricsFile.getHistogram(); + double allowedBarcodeOccurenceCount = allowedBarcodeHistogram.getSumOfValues(); + allowedBarcodeNormalizedOccurences = new HashMap<>(allowedBarcodeHistogram.size()); + for (final String allowedBarcode: allowedBarcodeHistogram.keySet()) { + allowedBarcodeNormalizedOccurences.put(allowedBarcodeInterner.intern(allowedBarcode), allowedBarcodeHistogram.get(allowedBarcode).getValue()/allowedBarcodeOccurenceCount); + } + } + + /** + * Tag the read pair according to ctor parameters, and accumulate metrics. + * @return final CBC, corrected if possible. + */ + public String correctReadPair(final ReadPair pair) { + final SAMRecord readWithBarcode = BARCODED_READ == 1? pair.getFirstRead(): pair.getSecondRead(); + final SAMRecord otherRead = BARCODED_READ == 2? pair.getFirstRead(): pair.getSecondRead(); + if (RAW_BARCODE_TAG != null) { + final String rawBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString()); + otherRead.setAttribute(RAW_BARCODE_TAG, rawBarcode); + if (TAG_BOTH_READS) { + readWithBarcode.setAttribute(RAW_BARCODE_TAG, rawBarcode); + } + } + if (BARCODE_QUALS_TAG != null) { + final String barcodeQuals = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getBaseQualityString()); + otherRead.setAttribute(BARCODE_QUALS_TAG, barcodeQuals); + if (TAG_BOTH_READS) { + readWithBarcode.setAttribute(BARCODE_QUALS_TAG, barcodeQuals); + } + } + final String cellBarcode = getCorrectedCellBarcode(readWithBarcode); + otherRead.setAttribute(BARCODE_TAG, cellBarcode); + if (TAG_BOTH_READS) { + readWithBarcode.setAttribute(BARCODE_TAG, cellBarcode); + } + return cellBarcode; + } + + private List getEd1Matches(final String cellBarcode) { + final List ret = new ArrayList<>(); + final byte[] cellBarcodeBytes = StringUtil.stringToBytes(cellBarcode); + for (int i = 0; i < cellBarcode.length(); ++i) { + final byte original = cellBarcodeBytes[i]; + for (byte b: getOtherBases(original)) { + cellBarcodeBytes[i] = b; + final String candidate = StringUtil.bytesToString(cellBarcodeBytes); + if (allowedBarcodeNormalizedOccurences.containsKey(candidate)) { + ret.add(allowedBarcodeInterner.intern(candidate)); + } + } + cellBarcodeBytes[i] = original; + } + return ret; + } + + private String getCorrectedCellBarcode(final SAMRecord readWithBarcode) { + final String cellBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString()); + if (allowedBarcodeNormalizedOccurences.containsKey(cellBarcode)) { + // exact match -- no correction needed. + if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_EXACT_MATCH == 0) { + log.debug("EXACT_MATCH " + readWithBarcode); + } + ++metrics.NUM_READS_EXACT_MATCH; + return cellBarcode; + } else { + List ed1Matches = ed1MatchCache.get(cellBarcode); + if (ed1Matches.isEmpty()) { + if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES == 0) { + log.debug("UNCORRECTABLE_NO_ED1 " + readWithBarcode); + } + ++metrics.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES; + return cellBarcode; + } else if (ed1Matches.size() == 1) { + if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_CORRECTED_SINGLE_ED1 == 0) { + log.debug("CORRECTED_SINGLE_ED1 " + readWithBarcode); + } + ++metrics.NUM_READS_CORRECTED_SINGLE_ED1; + numCandidatesHist.increment(1); + return ed1Matches.getFirst(); + } else { + String bestBarcode = null; + double bestBarcodeLikelihood = 0; + double sumLikelihoods = 0; + final byte[] baseQualities = BaseRange.getBytesForBaseRange(baseRanges, readWithBarcode.getBaseQualities()); + for (final String candidateBarcode : ed1Matches) { + double thisLikelihood = computeCandidateBarcodeLikelihood(cellBarcode, candidateBarcode, baseQualities); + sumLikelihoods += thisLikelihood; + if (thisLikelihood > bestBarcodeLikelihood) { + bestBarcodeLikelihood = thisLikelihood; + bestBarcode = candidateBarcode; + } + } + if (bestBarcodeLikelihood / sumLikelihoods >= LIKELIHOOD_RATIO) { + if (VERBOSITY == Log.LogLevel.DEBUG && + (metrics.NUM_READS_CORRECTED_MULTI_ED1 == 0 || ed1Matches.size() >= 4)) { + log.debug("CORRECTED_MULTI_ED1 " + readWithBarcode); + } + ++metrics.NUM_READS_CORRECTED_MULTI_ED1; + numCandidatesHist.increment(ed1Matches.size()); + return bestBarcode; + } else { + if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_UNCORRECTED_AMBIGUOUS == 0) { + log.debug("UNCORRECTED_AMBIGUOUS " + readWithBarcode); + } + ++metrics.NUM_READS_UNCORRECTED_AMBIGUOUS; + return cellBarcode; + } + } + } + } + + private double computeCandidateBarcodeLikelihood(final String uncorrectedBarcode, final String candidateBarcode, final byte[] baseQualities) { + int i; + for (i = 0; i < uncorrectedBarcode.length(); ++i) { + if (uncorrectedBarcode.charAt(i) != candidateBarcode.charAt(i) ) { + break; + } + } + return QualityUtil.getErrorProbabilityFromPhredScore(baseQualities[i]) * allowedBarcodeNormalizedOccurences.get(candidateBarcode); + } + + private final byte[] gct = {SequenceUtil.G, SequenceUtil.C, SequenceUtil.T}; + private final byte[] act = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.T}; + private final byte[] agt = {SequenceUtil.A, SequenceUtil.G, SequenceUtil.T}; + private final byte[] acg = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.G}; + private final byte[] acgt = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.G, SequenceUtil.T}; + private byte[] getOtherBases(byte original) { + return switch (original) { + case SequenceUtil.A, SequenceUtil.a -> gct; + case SequenceUtil.C, SequenceUtil.c -> agt; + case SequenceUtil.G, SequenceUtil.g -> act; + case SequenceUtil.T, SequenceUtil.t -> acg; + case SequenceUtil.N, SequenceUtil.n -> acgt; + default -> throw new IllegalArgumentException(String.format("Unexpected base %d", original)); + }; + } + + public void writeMetrics(final File METRICS, + final MetricsFile metricsFile) { + metricsFile.addMetric(this.getMetrics()); + metricsFile.addHistogram(this.getNumCandidatesHist()); + metricsFile.write(METRICS); + } + + public void setVERBOSITY(Log.LogLevel VERBOSITY) { + this.VERBOSITY = VERBOSITY; + } + + /** + * @return Accumulated metrics of barcodes processed by this object. + */ + public BarcodeCorrectionMetrics getMetrics() { + return metrics; + } + + /** + * @return histogram with bucket key: number of ED1 matches for a non-exact match barcode; + * bucket value: number of reads with that number of ED1 matches. + */ + public Histogram getNumCandidatesHist() { + return numCandidatesHist; + } +} diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairs.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairs.java index f57e15e2..73875b3b 100644 --- a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairs.java +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairs.java @@ -23,272 +23,70 @@ */ package org.broadinstitute.dropseqrna.beadsynthesis; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; -import htsjdk.samtools.util.*; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.IterableAdapter; import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.dropseqrna.cmdline.CustomCommandLineValidationHelper; import org.broadinstitute.dropseqrna.cmdline.DropSeq; import org.broadinstitute.dropseqrna.utils.AbstractSplitBamClp; -import org.broadinstitute.dropseqrna.utils.BaseRange; import org.broadinstitute.dropseqrna.utils.PairedSamRecordIterator; -import org.broadinstitute.dropseqrna.utils.StringInterner; import org.broadinstitute.dropseqrna.utils.readpairs.ReadPair; import picard.cmdline.StandardOptionDefinitions; import java.io.File; -import java.util.*; +import java.util.ArrayList; @CommandLineProgramProperties( - summary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs.", + summary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs, and split into BAMs such " + + "that all the reads for a cell barcode are in the same BAM.", oneLineSummary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs.", programGroup = DropSeq.class ) public class CorrectAndSplitScrnaReadPairs extends AbstractSplitBamClp { - @Argument(doc = "Which read of each read pair contains the cell barcode [1/2].") - public int BARCODED_READ = 1; + @ArgumentCollection + public CorrectScrnaReadPairsArgumentCollection ARGUMENT_COLLECTION = new CorrectScrnaReadPairsArgumentCollection(); - @Argument(doc="The region of the barcoded read containing the cell barcode, seperated by a dash. " + - "E.g. 1-4. Can extract multiple ranges by separating them by a colon. " + - "For example 1-4:17-22 extracts the first 4 bases, then the 17-22 bases, and glues the sequence together " + - "into a single cell barcode.") - public String BASE_RANGE; - @Argument(doc="Metrics file produced by CountBarcodeSequences that has counts for all the expected cell barcodes " + - "that are found as exact matches in the input data.") - public File ALLOWED_BARCODE_COUNTS; - - @Argument(doc="Tag to store the corrected barcode on the non-barcode read.") - public String BARCODE_TAG = "XC"; - - @Argument(doc="If true, assign BARCODE_TAG (also RAW_BARCODE_TAG and BARCODE_QUALS_TAG, if set) to both reads.") - public boolean TAG_BOTH_READS = false; - - @Argument(shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME, optional = true, - doc="Various matching and correction metrics") - public File METRICS; - - @Argument(doc="If more than on allowed barcode matches, (best likelihood)/sum(all likelihoods) " + - "must be >= this value.") - public double LIKELIHOOD_RATIO = 0.95; - - @Argument(doc="Store the original barcode sequence in this tag on the non-barcode read. Default: do not assign this tag.", - optional = true) - public String RAW_BARCODE_TAG; - @Argument(doc="Store the barcode base qualities in this tag on the non-barcode read. Default: do not assign this tag.", - optional = true) - public String BARCODE_QUALS_TAG; - - private Map allowedBarcodeNormalizedOccurences; - // Don't store many copies of the same allowed barcode in ed1MatchCache - private final StringInterner allowedBarcodeInterner = new StringInterner(); - - private final ResourceLimitedMap> ed1MatchCache = - new ResourceLimitedMap<>( - 1_000_000, - new ResourceLimitedMapFunctor<>() { - @Override - public List makeValue(final String key) { - return CorrectAndSplitScrnaReadPairs.this.getEd1Matches(key); - } - - @Override - public void finalizeValue(final String key, final List strings) { - } - } - ); - - private List baseRanges; - - private final BarcodeCorrectionMetrics metrics = new BarcodeCorrectionMetrics(); - private final Histogram numCandidatesHist = new Histogram<>("NUM_ED1_CANDIDATES", "NUM_READS"); @Override protected void splitBAMs() { - baseRanges = BaseRange.parseBaseRange(this.BASE_RANGE); - final String baseRangeStr = StringUtil.join(",", baseRanges); - log.info(String.format("Splitting BAM files based on cell barcode on read %d in range %s", - BARCODED_READ, baseRangeStr)); - allowedBarcodeNormalizedOccurences = getNormalizedAllowedBarcodes(); + final BarcodeCorrector barcodeCorrector = new BarcodeCorrector(ARGUMENT_COLLECTION); + barcodeCorrector.setVERBOSITY(VERBOSITY); final PairedSamRecordIterator iterator = new PairedSamRecordIterator(headerAndIterator.iterator); for (ReadPair pair: new IterableAdapter<>(iterator)) { progressLogger.record(pair.getFirstRead()); - final SAMRecord readWithBarcode = BARCODED_READ == 1? pair.getFirstRead(): pair.getSecondRead(); - final SAMRecord otherRead = BARCODED_READ == 2? pair.getFirstRead(): pair.getSecondRead(); - if (RAW_BARCODE_TAG != null) { - final String rawBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString()); - otherRead.setAttribute(RAW_BARCODE_TAG, rawBarcode); - if (TAG_BOTH_READS) { - readWithBarcode.setAttribute(RAW_BARCODE_TAG, rawBarcode); - } - } - if (BARCODE_QUALS_TAG != null) { - final String barcodeQuals = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getBaseQualityString()); - otherRead.setAttribute(BARCODE_QUALS_TAG, barcodeQuals); - if (TAG_BOTH_READS) { - readWithBarcode.setAttribute(BARCODE_QUALS_TAG, barcodeQuals); - } - } - final String cellBarcode = getCorrectedCellBarcode(readWithBarcode); - otherRead.setAttribute(BARCODE_TAG, cellBarcode); - if (TAG_BOTH_READS) { - readWithBarcode.setAttribute(BARCODE_TAG, cellBarcode); - } + final String cellBarcode = barcodeCorrector.correctReadPair(pair); final int writerIdx = getWriterIdxForCellBarcode(cellBarcode); writeRecord(writerIdx, pair.getFirstRead()); writeRecord(writerIdx, pair.getSecondRead()); } - if (METRICS != null) { - final MetricsFile metricsFile = getMetricsFile(); - metricsFile.addMetric(metrics); - metricsFile.addHistogram(numCandidatesHist); - metricsFile.write(METRICS); + if (ARGUMENT_COLLECTION.METRICS != null) { + barcodeCorrector.writeMetrics(ARGUMENT_COLLECTION.METRICS, getMetricsFile()); } } - private Map getNormalizedAllowedBarcodes() { - final MetricsFile metricsFile = new MetricsFile<>(); - metricsFile.read(IOUtil.openFileForBufferedReading(ALLOWED_BARCODE_COUNTS)); - Histogram allowedBarcodeHistogram = metricsFile.getHistogram(); - double allowedBarcodeOccurenceCount = allowedBarcodeHistogram.getSumOfValues(); - final Map ret = new HashMap<>(allowedBarcodeHistogram.size()); - for (final String allowedBarcode: allowedBarcodeHistogram.keySet()) { - ret.put(allowedBarcodeInterner.intern(allowedBarcode), allowedBarcodeHistogram.get(allowedBarcode).getValue()/allowedBarcodeOccurenceCount); - } - return ret; - } - - private String getCorrectedCellBarcode(final SAMRecord readWithBarcode) { - final String cellBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString()); - if (allowedBarcodeNormalizedOccurences.containsKey(cellBarcode)) { - // exact match -- no correction needed. - if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_EXACT_MATCH == 0) { - log.debug("EXACT_MATCH " + readWithBarcode); - } - ++metrics.NUM_READS_EXACT_MATCH; - return cellBarcode; - } else { - List ed1Matches = ed1MatchCache.get(cellBarcode); - if (ed1Matches.isEmpty()) { - if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES == 0) { - log.debug("UNCORRECTABLE_NO_ED1 " + readWithBarcode); - } - ++metrics.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES; - return cellBarcode; - } else if (ed1Matches.size() == 1) { - if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_CORRECTED_SINGLE_ED1 == 0) { - log.debug("CORRECTED_SINGLE_ED1 " + readWithBarcode); - } - ++metrics.NUM_READS_CORRECTED_SINGLE_ED1; - numCandidatesHist.increment(1); - return ed1Matches.getFirst(); - } else { - String bestBarcode = null; - double bestBarcodeLikelihood = 0; - double sumLikelihoods = 0; - final byte[] baseQualities = BaseRange.getBytesForBaseRange(baseRanges, readWithBarcode.getBaseQualities()); - for (final String candidateBarcode: ed1Matches) { - double thisLikelihood = computeCandidateBarcodeLikelihood(cellBarcode, candidateBarcode, baseQualities); - sumLikelihoods += thisLikelihood; - if (thisLikelihood > bestBarcodeLikelihood) { - bestBarcodeLikelihood = thisLikelihood; - bestBarcode = candidateBarcode; - } - } - if (bestBarcodeLikelihood/sumLikelihoods >= LIKELIHOOD_RATIO) { - if (VERBOSITY == Log.LogLevel.DEBUG && - (metrics.NUM_READS_CORRECTED_MULTI_ED1 == 0 || ed1Matches.size() >= 4)) { - log.debug("CORRECTED_MULTI_ED1 " + readWithBarcode); - } - ++metrics.NUM_READS_CORRECTED_MULTI_ED1; - numCandidatesHist.increment(ed1Matches.size()); - return bestBarcode; - } else { - if (VERBOSITY == Log.LogLevel.DEBUG && metrics.NUM_READS_UNCORRECTED_AMBIGUOUS == 0) { - log.debug("UNCORRECTED_AMBIGUOUS " + readWithBarcode); - } - ++metrics.NUM_READS_UNCORRECTED_AMBIGUOUS; - return cellBarcode; - } - } - } - } - - private double computeCandidateBarcodeLikelihood(final String uncorrectedBarcode, final String candidateBarcode, final byte[] baseQualities) { - int i; - for (i = 0; i < uncorrectedBarcode.length(); ++i) { - if (uncorrectedBarcode.charAt(i) != candidateBarcode.charAt(i) ) { - break; - } - } - return QualityUtil.getErrorProbabilityFromPhredScore(baseQualities[i]) * allowedBarcodeNormalizedOccurences.get(candidateBarcode); - } - - private List getEd1Matches(final String cellBarcode) { - final List ret = new ArrayList<>(); - final byte[] cellBarcodeBytes = StringUtil.stringToBytes(cellBarcode); - for (int i = 0; i < cellBarcode.length(); ++i) { - final byte original = cellBarcodeBytes[i]; - for (byte b: getOtherBases(original)) { - cellBarcodeBytes[i] = b; - final String candidate = StringUtil.bytesToString(cellBarcodeBytes); - if (allowedBarcodeNormalizedOccurences.containsKey(candidate)) { - ret.add(allowedBarcodeInterner.intern(candidate)); - } - } - cellBarcodeBytes[i] = original; - } - return ret; - } - - private final byte[] gct = {SequenceUtil.G, SequenceUtil.C, SequenceUtil.T}; - private final byte[] act = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.T}; - private final byte[] agt = {SequenceUtil.A, SequenceUtil.G, SequenceUtil.T}; - private final byte[] acg = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.G}; - private final byte[] acgt = {SequenceUtil.A, SequenceUtil.C, SequenceUtil.G, SequenceUtil.T}; - private byte[] getOtherBases(byte original) { - return switch (original) { - case SequenceUtil.A, SequenceUtil.a -> gct; - case SequenceUtil.C, SequenceUtil.c -> agt; - case SequenceUtil.G, SequenceUtil.g -> act; - case SequenceUtil.T, SequenceUtil.t -> acg; - case SequenceUtil.N, SequenceUtil.n -> acgt; - default -> throw new IllegalArgumentException(String.format("Unexpected base %d", original)); - }; - } - @Override protected String[] customCommandLineValidation() { - IOUtil.assertFileIsReadable(ALLOWED_BARCODE_COUNTS); + IOUtil.assertFileIsReadable(ARGUMENT_COLLECTION.ALLOWED_BARCODE_COUNTS); final ArrayList list = new ArrayList<>(); - if (BARCODED_READ < 1 || BARCODED_READ > 2) { + if (ARGUMENT_COLLECTION.BARCODED_READ < 1 || ARGUMENT_COLLECTION.BARCODED_READ > 2) { list.add("BARCODE_READ must be 1 or 2."); } return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list); } - public static class BarcodeCorrectionMetrics extends MetricBase { - public long NUM_READS_EXACT_MATCH; - public long NUM_READS_CORRECTED_SINGLE_ED1; - public long NUM_READS_CORRECTED_MULTI_ED1; - public long NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES; - public long NUM_READS_UNCORRECTED_AMBIGUOUS; - - public void merge(final BarcodeCorrectionMetrics that) { - this.NUM_READS_EXACT_MATCH += that.NUM_READS_EXACT_MATCH; - this.NUM_READS_CORRECTED_SINGLE_ED1 += that.NUM_READS_CORRECTED_SINGLE_ED1; - this.NUM_READS_CORRECTED_MULTI_ED1 += that.NUM_READS_CORRECTED_MULTI_ED1; - this.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES += that.NUM_READS_UNCORRECTABLE_NO_ED1_BARCODES; - this.NUM_READS_UNCORRECTED_AMBIGUOUS += that.NUM_READS_UNCORRECTED_AMBIGUOUS; - } - } - - /** Stock main method. */ + /** Stock main method. */ public static void main(final String[] args) { System.exit(new CorrectAndSplitScrnaReadPairs().instanceMain(args)); } + + // To support loading metrics files before this class became top-level + public static class BarcodeCorrectionMetrics + extends org.broadinstitute.dropseqrna.beadsynthesis.BarcodeCorrectionMetrics + {} } diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairs.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairs.java new file mode 100644 index 00000000..df04b094 --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairs.java @@ -0,0 +1,155 @@ +/* + * MIT License + * + * Copyright 2025 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.beadsynthesis; + +import htsjdk.samtools.SAMFileWriter; +import htsjdk.samtools.SAMFileWriterFactory; +import htsjdk.samtools.SamFiles; +import htsjdk.samtools.util.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.dropseqrna.cmdline.DropSeq; +import org.broadinstitute.dropseqrna.utils.AbstractSplitBamClp; +import org.broadinstitute.dropseqrna.utils.FileListParsingUtils; +import org.broadinstitute.dropseqrna.utils.PairedSamRecordIterator; +import org.broadinstitute.dropseqrna.utils.SamHeaderUtil; +import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil; +import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator; +import org.broadinstitute.dropseqrna.utils.readpairs.ReadPair; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.nio.PicardHtsPath; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +@CommandLineProgramProperties( + summary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs, in which a region of " + + "one read of the pair contains the raw cell barcode. The corrected cell barcode is assigned to the " + + "read in a tag. The reads are not altered beyond the addition of tags.", + oneLineSummary = "Correct edit-distance 1 errors in cell barcodes in scRNA-seq read pairs.", + programGroup = DropSeq.class +) +public class CorrectScrnaReadPairs +extends CommandLineProgram { + + protected static final Log log = Log.getInstance(CorrectScrnaReadPairs.class); + protected ProgressLogger progressLogger = new ProgressLogger(log, 10000000); + + + @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input paired-end SAM or BAM files to " + + "correct. They must all have the same sort order", minElements = 1) + public List INPUT; + + @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, + doc="Output SAM or BAM, tagged with corrected cell barcodes.") + public File OUTPUT; + + @Argument(optional=true, shortName="D", doc="Delete input BAM(s) after splitting them. Default: do not delete input BAM(s).") + public boolean DELETE_INPUTS = false; + + @Argument(optional = true, shortName = "DI", doc="Delete BAM indices corresponding to input BAMs. Default: DELETE_INPUTS setting.") + public Boolean DELETE_INPUT_INDICES; + + @ArgumentCollection + public CorrectScrnaReadPairsArgumentCollection ARGUMENT_COLLECTION = new CorrectScrnaReadPairsArgumentCollection(); + + @Override + protected int doWork() { + INPUT = FileListParsingUtils.expandPicardHtsPathList(INPUT); + final List inputPaths = PicardHtsPath.toPaths(INPUT); + inputPaths.stream().forEach(p -> IOUtil.assertFileIsReadable(p)); + IOUtil.assertFileIsWritable(OUTPUT); + if (DELETE_INPUT_INDICES == null) { + DELETE_INPUT_INDICES = DELETE_INPUTS; + } + // Check that input BAM files can be deleted + if (DELETE_INPUTS) { + for (final Path bamFile : inputPaths) { + IOUtil.assertFileIsWritable(bamFile); + } + } + + if (DELETE_INPUT_INDICES) { + for (final Path bamFile : inputPaths) { + final Path index = SamFiles.findIndex(bamFile); + if (index != null && Files.exists(index)) { + IOUtil.assertFileIsWritable(index); + } + } + } + final SamHeaderAndIterator headerAndIterator = SamFileMergeUtil.mergeInputPaths(inputPaths, true); + final SAMFileWriter samFileWriter = new SAMFileWriterFactory().setCreateIndex(CREATE_INDEX). + makeSAMOrBAMWriter(headerAndIterator.header, true, OUTPUT); + + SamHeaderUtil.addPgRecord(headerAndIterator.header, this); + final BarcodeCorrector barcodeCorrector = new BarcodeCorrector(ARGUMENT_COLLECTION); + barcodeCorrector.setVERBOSITY(VERBOSITY); + final PairedSamRecordIterator iterator = new PairedSamRecordIterator(headerAndIterator.iterator); + for (ReadPair pair: new IterableAdapter<>(iterator)) { + progressLogger.record(pair.getFirstRead()); + barcodeCorrector.correctReadPair(pair); + samFileWriter.addAlignment(pair.getFirstRead()); + samFileWriter.addAlignment(pair.getSecondRead()); + } + CloserUtil.close(headerAndIterator.iterator); + samFileWriter.close(); + if (ARGUMENT_COLLECTION.METRICS != null) { + barcodeCorrector.writeMetrics(ARGUMENT_COLLECTION.METRICS, getMetricsFile()); + } + + try { + if (DELETE_INPUTS) { + inputPaths.stream().forEach(p -> { + try { + Files.delete(p); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + }); + } + if (DELETE_INPUT_INDICES) { + for (final Path inputBam : inputPaths) { + final Path index = SamFiles.findIndex(inputBam); + if (index != null && index.toFile().exists()) { + Files.delete(index); + + } + } + } + } catch (IOException e) { + throw new RuntimeIOException(e); + } + + return 0; + } + /** Stock main method, for testing. */ + public static void main(final String[] args) { + System.exit(new CorrectScrnaReadPairs().instanceMain(args)); + } +} diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairsArgumentCollection.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairsArgumentCollection.java new file mode 100644 index 00000000..e6528642 --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairsArgumentCollection.java @@ -0,0 +1,68 @@ +/* + * MIT License + * + * Copyright 2025 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.beadsynthesis; + +import org.broadinstitute.barclay.argparser.Argument; +import picard.cmdline.StandardOptionDefinitions; + +import java.io.File; + +/** + * Shared CLP argument definitions for multiple CLPs that correct cell barcodes. + */ +public class CorrectScrnaReadPairsArgumentCollection { + @Argument(doc = "Which read of each read pair contains the cell barcode [1/2].") + public int BARCODED_READ = 1; + + @Argument(doc="The region of the barcoded read containing the cell barcode, seperated by a dash. " + + "E.g. 1-4. Can extract multiple ranges by separating them by a colon. " + + "For example 1-4:17-22 extracts the first 4 bases, then the 17-22 bases, and glues the sequence together " + + "into a single cell barcode.") + public String BASE_RANGE; + + @Argument(doc="Metrics file produced by CountBarcodeSequences that has counts for all the expected cell barcodes " + + "that are found as exact matches in the input data.") + public File ALLOWED_BARCODE_COUNTS; + + @Argument(doc="Tag to store the corrected barcode on the non-barcode read.") + public String BARCODE_TAG = "XC"; + + @Argument(doc="If true, assign BARCODE_TAG (also RAW_BARCODE_TAG and BARCODE_QUALS_TAG, if set) to both reads.") + public boolean TAG_BOTH_READS = false; + + @Argument(shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME, optional = true, + doc="Various matching and correction metrics") + public File METRICS; + + @Argument(doc="If more than on allowed barcode matches, (best likelihood)/sum(all likelihoods) " + + "must be >= this value.") + public double LIKELIHOOD_RATIO = 0.95; + + @Argument(doc="Store the original barcode sequence in this tag on the non-barcode read. Default: do not assign this tag.", + optional = true) + public String RAW_BARCODE_TAG; + @Argument(doc="Store the barcode base qualities in this tag on the non-barcode read. Default: do not assign this tag.", + optional = true) + public String BARCODE_QUALS_TAG; +} diff --git a/src/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetrics.java b/src/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetrics.java index 7ad3211e..fb8c50cd 100644 --- a/src/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetrics.java +++ b/src/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetrics.java @@ -25,9 +25,8 @@ import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.dropseqrna.beadsynthesis.CorrectAndSplitScrnaReadPairs.BarcodeCorrectionMetrics; +import org.broadinstitute.dropseqrna.beadsynthesis.BarcodeCorrectionMetrics; import org.broadinstitute.dropseqrna.cmdline.DropSeq; -import org.broadinstitute.dropseqrna.utils.FilteredReadsMetric; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; diff --git a/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairsTest.java b/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairsTest.java index 845404db..862d6ade 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairsTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectAndSplitScrnaReadPairsTest.java @@ -39,12 +39,12 @@ public class CorrectAndSplitScrnaReadPairsTest { - private static final File TEST_DATA_DIR = new File("testdata/org/broadinstitute/dropseq/beadsynthesis/CorrectAndSplitScrnaReadPairsTest"); - private static final File INPUT_SAM = new File(TEST_DATA_DIR, "correct_barcodes_test.sam"); - private static final File EXPECTED_BARCODES_HIST = new File(TEST_DATA_DIR, "correct_barcodes_test.expected_barcode_metrics.gz"); - private static final String BARCODE_QUALS_TAG = "CY"; - private static final String RAW_BARCODE_TAG = "CR"; - private static final String BASE_RANGE = "1-16"; + static final File TEST_DATA_DIR = new File("testdata/org/broadinstitute/dropseq/beadsynthesis/CorrectAndSplitScrnaReadPairsTest"); + static final File INPUT_SAM = new File(TEST_DATA_DIR, "correct_barcodes_test.sam"); + static final File EXPECTED_BARCODES_HIST = new File(TEST_DATA_DIR, "correct_barcodes_test.expected_barcode_metrics.gz"); + static final String BARCODE_QUALS_TAG = "CY"; + static final String RAW_BARCODE_TAG = "CR"; + static final String BASE_RANGE = "1-16"; @Test(dataProvider = "testBasicDataProvider") public void testBasic(boolean tagBothReads) { @@ -55,7 +55,7 @@ public void testBasic(boolean tagBothReads) { final SamReader reader = SamReaderFactory.makeDefault().open(outputSam); for (final SAMRecord rec : reader) { if (rec.getSecondOfPairFlag() || tagBothReads) { - final String cellBarcode = rec.getStringAttribute(clp.BARCODE_TAG); + final String cellBarcode = rec.getStringAttribute(clp.ARGUMENT_COLLECTION.BARCODE_TAG); final String expectedCellBarcode = rec.getReadName().split(":")[1]; Assert.assertEquals(cellBarcode, expectedCellBarcode, rec.getSAMString()); } @@ -73,17 +73,17 @@ public Object[][] testBasicDataProvider() { private CorrectAndSplitScrnaReadPairs initClp(boolean tagBothReads) { final CorrectAndSplitScrnaReadPairs clp = new CorrectAndSplitScrnaReadPairs(); clp.INPUT = Collections.singletonList(new PicardHtsPath(INPUT_SAM)); - clp.ALLOWED_BARCODE_COUNTS = EXPECTED_BARCODES_HIST; - clp.BARCODED_READ = 1; - clp.BASE_RANGE = BASE_RANGE; - clp.METRICS = TestUtils.getTempReportFile("test.", ".corrected_barcode_metrics"); + clp.ARGUMENT_COLLECTION.ALLOWED_BARCODE_COUNTS = EXPECTED_BARCODES_HIST; + clp.ARGUMENT_COLLECTION.BARCODED_READ = 1; + clp.ARGUMENT_COLLECTION.BASE_RANGE = BASE_RANGE; + clp.ARGUMENT_COLLECTION.METRICS = TestUtils.getTempReportFile("test.", ".corrected_barcode_metrics"); clp.NUM_OUTPUTS = 1; clp.OUTPUT_LIST = TestUtils.getTempReportFile("CorrectAndSplitScrnaReadPairsTest.", ".bam_list"); clp.OUTPUT = new File("test." + clp.OUTPUT_SLUG + ".sam"); clp.OVERWRITE_EXISTING = true; clp.OUTPUT_MANIFEST = TestUtils.getTempReportFile("CorrectAndSplitScrnaReadPairsTest.", ".split_bam_manifest.gz"); clp.REPORT = TestUtils.getTempReportFile("CorrectAndSplitScrnaReadPairsTest.", ".split_bam_report"); - clp.TAG_BOTH_READS = tagBothReads; + clp.ARGUMENT_COLLECTION.TAG_BOTH_READS = tagBothReads; return clp; } @@ -92,8 +92,8 @@ public void testOptionalTags(boolean tagBothReads) { CorrectAndSplitScrnaReadPairs clp = initClp(tagBothReads); final File outputSam = new File(clp.OUTPUT_LIST.getParentFile(), "test.0.sam"); outputSam.deleteOnExit(); - clp.BARCODE_QUALS_TAG = BARCODE_QUALS_TAG; - clp.RAW_BARCODE_TAG = RAW_BARCODE_TAG; + clp.ARGUMENT_COLLECTION.BARCODE_QUALS_TAG = BARCODE_QUALS_TAG; + clp.ARGUMENT_COLLECTION.RAW_BARCODE_TAG = RAW_BARCODE_TAG; Assert.assertEquals(clp.doWork(), 0); final SamReader reader = SamReaderFactory.makeDefault().open(outputSam); final List baseRanges = org.broadinstitute.dropseqrna.utils.BaseRange.parseBaseRange(BASE_RANGE); diff --git a/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairsTest.java b/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairsTest.java new file mode 100644 index 00000000..e76bc6a8 --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/CorrectScrnaReadPairsTest.java @@ -0,0 +1,66 @@ +/* + * MIT License + * + * Copyright 2025 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.beadsynthesis; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import org.broadinstitute.dropseqrna.utils.TestUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import picard.nio.PicardHtsPath; + +import java.util.Collections; + +public class CorrectScrnaReadPairsTest { + @Test(dataProvider = "testBasicDataProvider") + public void testBasic(boolean tagBothReads) { + CorrectScrnaReadPairs clp = new CorrectScrnaReadPairs(); + clp.INPUT = Collections.singletonList(new PicardHtsPath(CorrectAndSplitScrnaReadPairsTest.INPUT_SAM)); + clp.ARGUMENT_COLLECTION.ALLOWED_BARCODE_COUNTS = CorrectAndSplitScrnaReadPairsTest.EXPECTED_BARCODES_HIST; + clp.ARGUMENT_COLLECTION.BARCODED_READ = 1; + clp.ARGUMENT_COLLECTION.BASE_RANGE = CorrectAndSplitScrnaReadPairsTest.BASE_RANGE; + clp.ARGUMENT_COLLECTION.METRICS = TestUtils.getTempReportFile("test.", ".corrected_barcode_metrics"); + clp.OUTPUT = TestUtils.getTempReportFile("CorrectScrnaReadPairsTest.", ".sam"); + clp.ARGUMENT_COLLECTION.TAG_BOTH_READS = tagBothReads; + Assert.assertEquals(clp.doWork(), 0); + final SamReader reader = SamReaderFactory.makeDefault().open(clp.OUTPUT); + for (final SAMRecord rec : reader) { + if (rec.getSecondOfPairFlag() || tagBothReads) { + final String cellBarcode = rec.getStringAttribute(clp.ARGUMENT_COLLECTION.BARCODE_TAG); + final String expectedCellBarcode = rec.getReadName().split(":")[1]; + Assert.assertEquals(cellBarcode, expectedCellBarcode, rec.getSAMString()); + } + } + } + + @DataProvider(name = "testBasicDataProvider") + public Object[][] testBasicDataProvider() { + return new Object[][]{ + {false}, + {true} + }; + } +} diff --git a/src/tests/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetricsTest.java b/src/tests/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetricsTest.java index dfbd7975..5fc87f91 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetricsTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/metrics/MergeBarcodeCorrectionMetricsTest.java @@ -26,7 +26,7 @@ import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; -import org.broadinstitute.dropseqrna.beadsynthesis.CorrectAndSplitScrnaReadPairs.BarcodeCorrectionMetrics; +import org.broadinstitute.dropseqrna.beadsynthesis.BarcodeCorrectionMetrics; import org.testng.Assert; import org.testng.annotations.Test;