diff --git a/README.md b/README.md index 1981fd1823..c2af305925 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.samtools/htsjdk/badge.svg)](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.github.samtools%22%20AND%20a%3A%22htsjdk%22) [![License](http://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/samtools/htsjdk) [![Language](http://img.shields.io/badge/language-java-brightgreen.svg)](https://www.java.com/) +[![Join the chat at https://gitter.im/samtools/htsjdk](https://badges.gitter.im/samtools/htsjdk.svg)](https://gitter.im/samtools/htsjdk?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) Status of downstream projects automatically built on top of the current htsjdk master branch. See [gatk-jenkins](https://gatk-jenkins.broadinstitute.org/view/HTSJDK%20Release%20Tests/) for detailed logs. Failure may indicate problems in htsjdk, but may also be due to expected incompatibilities between versions, or unrelated failures in downstream projects. - [Picard](https://github.com/broadinstitute/picard): [![Build Status](https://gatk-jenkins.broadinstitute.org/buildStatus/icon?job=picard-on-htsjdk-master)](https://gatk-jenkins.broadinstitute.org/job/picard-on-htsjdk-master/) diff --git a/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java b/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java index 7078bf1dcb..0c3d484203 100644 --- a/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java +++ b/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java @@ -59,8 +59,6 @@ public void setAttribute(final String key, final Object value) { /** * Set the given value for the attribute named 'key'. Replaces an existing value, if any. * If value is null, the attribute is removed. - * Supported types are Character, Integer, Float and String. Byte and Short may also be - * passed in but they will be converted to Integer. 
* @param key attribute name * @param value attribute value */ @@ -71,6 +69,7 @@ public void setAttribute(final String key, final String value) { mAttributes.put(key, value); } } + /** * Returns the Set of attributes. */ diff --git a/src/main/java/htsjdk/samtools/BAMFileWriter.java b/src/main/java/htsjdk/samtools/BAMFileWriter.java index fc766ae7d3..cf2fdbf86c 100644 --- a/src/main/java/htsjdk/samtools/BAMFileWriter.java +++ b/src/main/java/htsjdk/samtools/BAMFileWriter.java @@ -23,8 +23,10 @@ */ package htsjdk.samtools; +import htsjdk.samtools.util.AbstractBlockCompressedOutputStream; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.BlockCompressedOutputStream; +import htsjdk.samtools.util.ParallelBlockCompressedOutputStream; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.samtools.util.zip.DeflaterFactory; @@ -42,35 +44,50 @@ class BAMFileWriter extends SAMFileWriterImpl { private final BinaryCodec outputBinaryCodec; private BAMRecordCodec bamRecordCodec = null; - private final BlockCompressedOutputStream blockCompressedOutputStream; + private final AbstractBlockCompressedOutputStream blockCompressedOutputStream; private BAMIndexer bamIndexer = null; - protected BAMFileWriter(final File path) { - blockCompressedOutputStream = new BlockCompressedOutputStream(path); + protected BAMFileWriter(final File path, boolean createIndex) { + blockCompressedOutputStream = Defaults.ZIP_THREADS > 0 && !createIndex ? 
+ new ParallelBlockCompressedOutputStream(path) : + new BlockCompressedOutputStream(path); + outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(path.getAbsolutePath()); } - protected BAMFileWriter(final File path, final int compressionLevel) { - blockCompressedOutputStream = new BlockCompressedOutputStream(path, compressionLevel); + protected BAMFileWriter(final File path, final int compressionLevel, boolean createIndex) { + blockCompressedOutputStream = Defaults.ZIP_THREADS > 0 && !createIndex ? + new ParallelBlockCompressedOutputStream(path, compressionLevel) : + new BlockCompressedOutputStream(path, compressionLevel); + outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(path.getAbsolutePath()); } - protected BAMFileWriter(final OutputStream os, final File file) { - blockCompressedOutputStream = new BlockCompressedOutputStream(os, file); + protected BAMFileWriter(final OutputStream os, final File file, boolean createIndex) { + blockCompressedOutputStream = Defaults.ZIP_THREADS > 0 && !createIndex ? + new ParallelBlockCompressedOutputStream(os, file) : + new BlockCompressedOutputStream(os, file); + outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(getPathString(file)); } - protected BAMFileWriter(final OutputStream os, final File file, final int compressionLevel) { - blockCompressedOutputStream = new BlockCompressedOutputStream(os, file, compressionLevel); + protected BAMFileWriter(final OutputStream os, final File file, final int compressionLevel, boolean createIndex) { + blockCompressedOutputStream = Defaults.ZIP_THREADS > 0 && !createIndex ? 
+ new ParallelBlockCompressedOutputStream(os, file, compressionLevel) : + new BlockCompressedOutputStream(os, file, compressionLevel); + outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(getPathString(file)); } - protected BAMFileWriter(final OutputStream os, final File file, final int compressionLevel, final DeflaterFactory deflaterFactory) { - blockCompressedOutputStream = new BlockCompressedOutputStream(os, file, compressionLevel, deflaterFactory); + protected BAMFileWriter(final OutputStream os, final File file, final int compressionLevel, + final DeflaterFactory deflaterFactory, boolean createIndex) { + blockCompressedOutputStream = Defaults.ZIP_THREADS > 0 && !createIndex ? + new ParallelBlockCompressedOutputStream(os, file, compressionLevel, deflaterFactory) : + new BlockCompressedOutputStream(os, file, compressionLevel, deflaterFactory); outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream)); outputBinaryCodec.setOutputFileName(getPathString(file)); } diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index 5e3f6dab17..0697b860e5 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java +++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -15,7 +15,7 @@ */ public class Defaults { private static Log log = Log.getInstance(Defaults.class); - + /** Should BAM index files be created when writing out coordinate sorted BAM files? Default = false. 
*/ public static final boolean CREATE_INDEX; @@ -84,6 +84,7 @@ public class Defaults { */ public static final boolean SRA_LIBRARIES_DOWNLOAD; + public static final int ZIP_THREADS; static { CREATE_INDEX = getBooleanProperty("create_index", false); @@ -104,6 +105,7 @@ public class Defaults { CUSTOM_READER_FACTORY = getStringProperty("custom_reader", ""); SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name())); SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); + ZIP_THREADS = getIntProperty("zip_threads", 0); } /** diff --git a/src/main/java/htsjdk/samtools/SAMFileHeader.java b/src/main/java/htsjdk/samtools/SAMFileHeader.java index f2750d4cc6..eff595341b 100644 --- a/src/main/java/htsjdk/samtools/SAMFileHeader.java +++ b/src/main/java/htsjdk/samtools/SAMFileHeader.java @@ -270,7 +270,7 @@ public SortOrder getSortOrder() { public void setSortOrder(final SortOrder so) { sortOrder = so; - setAttribute(SORT_ORDER_TAG, so.name()); + super.setAttribute(SORT_ORDER_TAG, so.name()); } public GroupOrder getGroupOrder() { @@ -292,7 +292,42 @@ public GroupOrder getGroupOrder() { public void setGroupOrder(final GroupOrder go) { groupOrder = go; - setAttribute(GROUP_ORDER_TAG, go.name()); + super.setAttribute(GROUP_ORDER_TAG, go.name()); + } + + + /** + * Set the given value for the attribute named 'key'. Replaces an existing value, if any. + * If value is null, the attribute is removed. + * Otherwise, the value will be converted to a String with toString. 
+ * @param key attribute name + * @param value attribute value + * @deprecated Use {@link #setAttribute(String, String)} instead + */ + @Deprecated + @Override + public void setAttribute(final String key, final Object value) { + if (key.equals(SORT_ORDER_TAG) || key.equals(GROUP_ORDER_TAG)) { + this.setAttribute(key, value.toString()); + } else { + super.setAttribute(key, value); + } + } + + /** + * Set the given value for the attribute named 'key'. Replaces an existing value, if any. + * If value is null, the attribute is removed. + * @param key attribute name + * @param value attribute value + */ + @Override + public void setAttribute(final String key, final String value) { + if (key.equals(SORT_ORDER_TAG)) { + this.sortOrder = null; + } else if (key.equals(GROUP_ORDER_TAG)) { + this.groupOrder = null; + } + super.setAttribute(key, value); } /** diff --git a/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java b/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java index 30b36d7b31..f3a8be401d 100644 --- a/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java +++ b/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java @@ -264,7 +264,7 @@ public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean pre } OutputStream os = IOUtil.maybeBufferOutputStream(new FileOutputStream(outputFile, false), bufferSize); if (createMd5File) os = new Md5CalculatingOutputStream(os, new File(outputFile.getAbsolutePath() + ".md5")); - final BAMFileWriter ret = new BAMFileWriter(os, outputFile, compressionLevel, deflaterFactory); + final BAMFileWriter ret = new BAMFileWriter(os, outputFile, compressionLevel, deflaterFactory, createIndex); final boolean createIndex = this.createIndex && IOUtil.isRegularPath(outputFile); if (this.createIndex && !createIndex) { log.warn("Cannot create index for BAM because output file is not a regular file: " + outputFile.getAbsolutePath()); @@ -347,7 +347,7 @@ public SAMFileWriter makeSAMWriter(final SAMFileHeader header, final
boolean pre */ public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean presorted, final OutputStream stream) { - return initWriter(header, presorted, new BAMFileWriter(stream, null, this.getCompressionLevel(), this.deflaterFactory)); + return initWriter(header, presorted, new BAMFileWriter(stream, null, this.getCompressionLevel(), this.deflaterFactory, false)); } /** diff --git a/src/main/java/htsjdk/samtools/SAMRecord.java b/src/main/java/htsjdk/samtools/SAMRecord.java index f93b2d72cb..ec394ca174 100644 --- a/src/main/java/htsjdk/samtools/SAMRecord.java +++ b/src/main/java/htsjdk/samtools/SAMRecord.java @@ -1519,7 +1519,7 @@ public SAMTagAndValue(final String tag, final Object value) { */ public List getAttributes() { SAMBinaryTagAndValue binaryAttributes = getBinaryAttributes(); - final List ret = new ArrayList(); + final List ret = new ArrayList<>(); while (binaryAttributes != null) { ret.add(new SAMTagAndValue(SAMTagUtil.getSingleton().makeStringTag(binaryAttributes.tag), binaryAttributes.value)); @@ -1769,7 +1769,7 @@ public List getAlignmentBlocks() { /** * Run all validations of CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. - * @param recordNumber For error reporting. -1 if not known. + * @param recordNumber For error reporting, the record number in the SAM/BAM file. -1 if not known. * @return List of errors, or null if no errors. 
*/ public List validateCigar(final long recordNumber) { @@ -1878,35 +1878,40 @@ public List isValid(final boolean firstOnly) { ArrayList ret = null; if (!getReadPairedFlag()) { if (getProperPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getMateUnmappedFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_UNMAPPED, "Mate unmapped flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getMateNegativeStrandFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_NEG_STRAND, "Mate negative strand flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getFirstOfPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_FIRST_OF_PAIR, "First of pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getSecondOfPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SECOND_OF_PAIR, "Second of pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (null != getHeader() && getMateReferenceIndex() != NO_ALIGNMENT_REFERENCE_INDEX) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MATE_REF_INDEX, 
"MRNM should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } + if (!getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_UNPAIRED_MATE_REFERENCE, "Unpaired read mate reference is " + getMateReferenceName() + " not " + SAMRecord.NO_ALIGNMENT_REFERENCE_NAME + " for unpaired read", getReadName())); + if (firstOnly) return ret; + } } else { final List errors = isValidReferenceIndexAndPosition(mMateReferenceIndex, mMateReferenceName, getMateAlignmentStart(), true, firstOnly); @@ -1937,23 +1942,23 @@ public List isValid(final boolean firstOnly) { */ } if (getInferredInsertSize() > MAX_INSERT_SIZE || getInferredInsertSize() < -MAX_INSERT_SIZE) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INSERT_SIZE, "Insert size out of range", getReadName())); if (firstOnly) return ret; } if (getReadUnmappedFlag()) { if (getNotPrimaryAlignmentFlag()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_NOT_PRIM_ALIGNMENT, "Not primary alignment flag should not be set for unmapped read.", getReadName())); if (firstOnly) return ret; } if (getSupplementaryAlignmentFlag()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT, "Supplementary alignment flag should not be set for unmapped read.", getReadName())); if (firstOnly) return ret; } if (getMappingQuality() != 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be 0 for unmapped read.", getReadName())); if (firstOnly) 
return ret; } @@ -1962,22 +1967,22 @@ public List isValid(final boolean firstOnly) { TODO: PIC-97 This validation should be enabled, but probably at this point there are too many BAM files that have the proper pair flag set when read or mate is unmapped. if (getProperPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unmapped read.", getReadName())); } */ } else { if (getMappingQuality() >= 256) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be < 256.", getReadName())); if (firstOnly) return ret; } if (getCigarLength() == 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR should have > zero elements for mapped read.", getReadName())); /* todo - will uncomment once unit tests are added } else if (getCigar().getReadLength() != getReadLength()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR read length " + getCigar().getReadLength() + " doesn't match read length " + getReadLength(), getReadName())); */ if (firstOnly) return ret; @@ -1988,7 +1993,7 @@ public List isValid(final boolean firstOnly) { if (firstOnly) return ret; } if (!hasReferenceName()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_READ_UNMAPPED, "Mapped read should have valid reference name", getReadName())); if (firstOnly) return ret; } @@ -2006,14 +2011,14 @@ public List isValid(final boolean firstOnly) { // Validate the RG ID is found in header final 
String rgId = (String)getAttribute(SAMTagUtil.getSingleton().RG); if (rgId != null && getHeader() != null && getHeader().getReadGroup(rgId) == null) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.READ_GROUP_NOT_FOUND, "RG ID on SAMRecord not found in header: " + rgId, getReadName())); if (firstOnly) return ret; } final List errors = isValidReferenceIndexAndPosition(mReferenceIndex, mReferenceName, getAlignmentStart(), false); if (errors != null) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.addAll(errors); if (firstOnly) return ret; } @@ -2024,7 +2029,7 @@ public List isValid(final boolean firstOnly) { final String cq = (String)getAttribute(SAMTagUtil.getSingleton().CQ); final String cs = (String)getAttribute(SAMTagUtil.getSingleton().CS); if (cq == null || cq.isEmpty() || cs == null || cs.isEmpty()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, "Zero-length read without FZ, CS or CQ tag", getReadName())); if (firstOnly) return ret; @@ -2038,7 +2043,7 @@ public List isValid(final boolean firstOnly) { } } if (!hasIndel) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, "Colorspace read with zero-length bases but no indel", getReadName())); if (firstOnly) return ret; @@ -2047,7 +2052,7 @@ public List isValid(final boolean firstOnly) { } } if (this.getReadLength() != getBaseQualities().length && !Arrays.equals(getBaseQualities(), NULL_QUALS)) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.MISMATCH_READ_LENGTH_AND_QUALS_LENGTH, "Read length does not match quals length", getReadName())); if (firstOnly) return ret; @@ -2055,13 
+2060,39 @@ public List isValid(final boolean firstOnly) { if (this.getAlignmentStart() != NO_ALIGNMENT_START && this.getIndexingBin() != null && this.computeIndexingBin() != this.getIndexingBin()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, "bin field of BAM record does not equal value computed based on alignment start and end, and length of sequence to which read is aligned", getReadName())); if (firstOnly) return ret; } + if (getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) && + getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_UNALIGNED_MATE_START, + "The unaligned mate start position is " + getMateAlignmentStart() + ", should be " + SAMRecord.NO_ALIGNMENT_START, + getReadName())); + if (firstOnly) return ret; + } + + if (getCigar().getReadLength() != 0 && getCigar().getReadLength() != getReadLength()) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError(SAMValidationError.Type.MISMATCH_CIGAR_SEQ_LENGTH, + "CIGAR covers " + getCigar().getReadLength() + " bases but the sequence is " + getReadLength() + " read bases ", + getReadName())); + if (firstOnly) return ret; + } + + if (getBaseQualities().length != 0 && getReadLength() != getBaseQualities().length) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError( + SAMValidationError.Type.MISMATCH_SEQ_QUAL_LENGTH, + "Read length is " + getReadLength() + " bases but have " + mBaseQualities.length + " qualities ", + getReadName())); + if (firstOnly) return ret; + } + if (ret == null || ret.isEmpty()) { return null; } @@ -2099,13 +2130,13 @@ private List isValidReferenceIndexAndPosition(final Integer ArrayList ret = null; if (!hasReference) { if (alignmentStart != 0) { - if (ret == null) ret = new ArrayList(); + if
(ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should be 0 because reference name = *.", isMate), getReadName())); if (firstOnly) return ret; } } else { if (alignmentStart == 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should != 0 because reference name != *.", isMate), getReadName())); if (firstOnly) return ret; } @@ -2113,12 +2144,12 @@ private List isValidReferenceIndexAndPosition(final Integer final SAMSequenceRecord sequence = (referenceIndex != null? getHeader().getSequence(referenceIndex): getHeader().getSequence(referenceName)); if (sequence == null) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_REFERENCE_INDEX, buildMessage("Reference sequence not found in sequence dictionary.", isMate), getReadName())); if (firstOnly) return ret; } else { if (alignmentStart > sequence.getSequenceLength()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start (" + alignmentStart + ") must be <= reference sequence length (" + sequence.getSequenceLength() + ") on reference " + sequence.getSequenceName(), isMate), getReadName())); if (firstOnly) return ret; diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index b7744d796d..86ffa6c9f7 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -29,7 +29,6 @@ import java.math.BigInteger; import java.security.MessageDigest; import java.util.*; -import 
java.util.stream.Collector; import java.util.stream.Collectors; import javax.xml.bind.annotation.XmlElement; @@ -50,8 +49,8 @@ public class SAMSequenceDictionary implements Serializable { getter because the later wraps the list into an unmodifiable List see http://tech.joshuacummings.com/2010/10/problems-with-defensive-collection.html */ @XmlElement(name="Reference") - private List mSequences = new ArrayList(); - private final Map mSequenceMap = new HashMap(); + private List mSequences = new ArrayList<>(); + private final Map mSequenceMap = new HashMap<>(); public SAMSequenceDictionary() { } @@ -150,7 +149,7 @@ public boolean isEmpty() { private static String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s."; /** * Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent - * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call + * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}. * Aliases are ignored. 
* @@ -161,20 +160,49 @@ public void assertSameDictionary(final SAMSequenceDictionary that) { final Iterator thatSequences = that.mSequences.iterator(); for (final SAMSequenceRecord thisSequence : mSequences) { - if (!thatSequences.hasNext()) + if (!thatSequences.hasNext()) { throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thisSequence + " is present in only one dictionary")); - else { + } else { final SAMSequenceRecord thatSequence = thatSequences.next(); - if(!thatSequence.isSameSequence(thisSequence)) + if(!thatSequence.isSameSequence(thisSequence)) { throw new AssertionError( String.format(DICT_MISMATCH_TEMPLATE, thatSequence + " was found when " + thisSequence + " was expected") ); + } } } if (thatSequences.hasNext()) throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thatSequences.next() + " is present in only one dictionary")); } + /** + * Non-comprehensive {@link #equals(Object)}-validation: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent + * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call + * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}. 
+ * + * @param that {@link SAMSequenceDictionary} to compare against + * @return true if the dictionaries are the same, false otherwise + * + */ + public boolean isSameDictionary(final SAMSequenceDictionary that) { + if (that == null || that.mSequences == null) return false; + if (this == that) return true; + + final Iterator thatSequences = that.mSequences.iterator(); + for (final SAMSequenceRecord thisSequence : mSequences) { + if (!thatSequences.hasNext()) { + return false; + } else { + final SAMSequenceRecord thatSequence = thatSequences.next(); + if (!thatSequence.isSameSequence(thisSequence)) { + return false; + } + } + } + + return !thatSequences.hasNext(); + } + /** returns true if the two dictionaries are the same, aliases are NOT considered */ @Override public boolean equals(Object o) { @@ -183,9 +211,7 @@ public boolean equals(Object o) { SAMSequenceDictionary that = (SAMSequenceDictionary) o; - if (!mSequences.equals(that.mSequences)) return false; - - return true; + return mSequences.equals(that.mSequences); } /** @@ -318,8 +344,8 @@ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionar finalDict.addSequence(sMerged); final Set allTags = new HashSet<>(); - s1.getAttributes().stream().forEach(a -> allTags.add(a.getKey())); - s2.getAttributes().stream().forEach(a -> allTags.add(a.getKey())); + s1.getAttributes().forEach(a -> allTags.add(a.getKey())); + s2.getAttributes().forEach(a -> allTags.add(a.getKey())); for (final String tag : allTags) { final String value1 = s1.getAttribute(tag); diff --git a/src/main/java/htsjdk/samtools/SAMTestUtil.java b/src/main/java/htsjdk/samtools/SAMTestUtil.java index 83766f3676..ec85ce2da7 100644 --- a/src/main/java/htsjdk/samtools/SAMTestUtil.java +++ b/src/main/java/htsjdk/samtools/SAMTestUtil.java @@ -23,6 +23,8 @@ */ package htsjdk.samtools; +import java.util.List; + /** * Misc methods for SAM-related unit tests. 
These are in the src tree rather than the tests tree * so that they will be included in sam.jar, and therefore can be used by tests outside of htsjdk.samtools. @@ -55,47 +57,21 @@ public void assertPairValid(final SAMRecord firstEnd, final SAMRecord secondEnd) } /** - * Basic sanity check for a SAMRecord. - * @throws SanityCheckFailedException if the sanity check failed + * Basic sanity check for a SAMRecord. Print errors to screen. + * @param read SAM record + * @throws IllegalArgumentException if read is null + * @throws SanityCheckFailedException if errors */ - public void assertReadValid(final SAMRecord read) throws SanityCheckFailedException { - assertEquals(read.getReadBases().length, read.getBaseQualities().length); - // Note that it is possible to have an unmapped read that has a coordinate - if (read.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - assertEquals(read.getAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - assertTrue(read.getReadUnmappedFlag()); - } else { - assertNotSame(read.getAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - } - if (read.getReadUnmappedFlag()) { - assertEquals(read.getMappingQuality(), SAMRecord.NO_MAPPING_QUALITY); - assertEquals(read.getCigar().getCigarElements().size(), 0); - } else { - assertNotSame(read.getCigar().getCigarElements(), 0); + public static void assertReadValid(final SAMRecord read) throws SanityCheckFailedException { + if (read == null) { + throw new IllegalArgumentException("SAMRecord is null"); } - if (read.getReadPairedFlag()) { - if (read.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - assertEquals(read.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - assertTrue(read.getMateUnmappedFlag()); - } else { - // Even if the mate is unmapped, if it has a reference name, it should have a position. 
- assertNotSame(read.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - } - if (read.getReadUnmappedFlag() || read.getMateUnmappedFlag() || - !read.getReferenceName().equals(read.getMateReferenceName())) { - assertEquals(read.getInferredInsertSize(), 0); - } else { - assertNotSame(read.getInferredInsertSize(), 0); - } - if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { - assertNotSame(read.getReadNegativeStrandFlag(), read.getMateNegativeStrandFlag()); - assertNotSame(read.getMateNegativeStrandFlag(), - read.getReadName()); - } - } else { - assertEquals(read.getInferredInsertSize(), 0); + final List errors = read.isValid(false); + if ( errors != null) { + errors.forEach(v -> System.out.println(v.toString())); } + assertTrue(errors == null || errors.isEmpty()); } private static void assertEquals(T a, T b) { diff --git a/src/main/java/htsjdk/samtools/SAMValidationError.java b/src/main/java/htsjdk/samtools/SAMValidationError.java index 452e92cf5f..edd49c13c6 100644 --- a/src/main/java/htsjdk/samtools/SAMValidationError.java +++ b/src/main/java/htsjdk/samtools/SAMValidationError.java @@ -208,7 +208,22 @@ public enum Type { MISMATCH_MATE_CIGAR_STRING, /** There is a Cigar String (stored in the MC Tag) for a read whose mate is NOT mapped.
*/ - MATE_CIGAR_STRING_INVALID_PRESENCE; + MATE_CIGAR_STRING_INVALID_PRESENCE, + + /** The mate reference of the unpaired read should be "*" */ + INVALID_UNPAIRED_MATE_REFERENCE, + + /** The unaligned mate read start position should be 0 */ + INVALID_UNALIGNED_MATE_START, + + /** Mismatch between the number of bases covered by the CIGAR and sequence */ + MISMATCH_CIGAR_SEQ_LENGTH, + + /** Mismatch between the sequence and quality length */ + MISMATCH_SEQ_QUAL_LENGTH, + + /** Mismatch between file and sequence dictionaries */ + MISMATCH_FILE_SEQ_DICT; public final Severity severity; diff --git a/src/main/java/htsjdk/samtools/SamFileValidator.java b/src/main/java/htsjdk/samtools/SamFileValidator.java index d0b745e7fc..3e316a235b 100644 --- a/src/main/java/htsjdk/samtools/SamFileValidator.java +++ b/src/main/java/htsjdk/samtools/SamFileValidator.java @@ -88,6 +88,7 @@ public class SamFileValidator { private Histogram errorsByType; private PairEndInfoMap pairEndInfoByName; private ReferenceSequenceFileWalker refFileWalker; + private SAMSequenceDictionary samSequenceDictionary; private boolean verbose; private int maxVerboseOutput; private SAMSortOrderChecker orderChecker; @@ -154,7 +155,7 @@ public boolean validateSamFileSummary(final SamReader samReader, final Reference for (final Histogram.Bin bin : errorsByType.values()) { errorsAndWarningsByType.increment(bin.getId().getHistogramString(), bin.getValue()); } - final MetricsFile metricsFile = new MetricsFile(); + final MetricsFile metricsFile = new MetricsFile<>(); errorsByType.setBinLabel("Error Type"); errorsByType.setValueLabel("Count"); metricsFile.setHistogram(errorsAndWarningsByType); @@ -180,7 +181,7 @@ public boolean validateSamFileVerbose(final SamReader samReader, final Reference } catch (MaxOutputExceededException e) { out.println("Maximum output of [" + maxVerboseOutput + "] errors reached."); } - boolean result = errorsByType.isEmpty(); + final boolean result = errorsByType.isEmpty(); cleanup(); return 
result; } @@ -249,13 +250,13 @@ private void validateUnmatchedPairs() { // For the coordinate-sorted map, need to detect mate pairs in which the mateReferenceIndex on one end // does not match the readReference index on the other end, so the pairs weren't united and validated. inMemoryPairMap = new InMemoryPairEndInfoMap(); - CloseableIterator> it = ((CoordinateSortedPairEndInfoMap) pairEndInfoByName).iterator(); + final CloseableIterator> it = pairEndInfoByName.iterator(); while (it.hasNext()) { - Map.Entry entry = it.next(); - PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey()); + final Map.Entry entry = it.next(); + final PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey()); if (pei != null) { // Found a mismatch btw read.mateReferenceIndex and mate.readReferenceIndex - List errors = pei.validateMates(entry.getValue(), entry.getKey()); + final List errors = pei.validateMates(entry.getValue(), entry.getKey()); for (final SAMValidationError error : errors) { addError(error); } @@ -405,10 +406,7 @@ private void validateSecondaryBaseCalls(final SAMRecord record, final long recor } private boolean validateCigar(final SAMRecord record, final long recordNumber) { - if (record.getReadUnmappedFlag()) { - return true; - } - return validateCigar(record, recordNumber, true); + return record.getReadUnmappedFlag() || validateCigar(record, recordNumber, true); } private boolean validateMateCigar(final SAMRecord record, final long recordNumber) { @@ -458,6 +456,7 @@ private void init(final ReferenceSequenceFile reference, final SAMFileHeader hea } if (reference != null) { this.refFileWalker = new ReferenceSequenceFileWalker(reference); + this.samSequenceDictionary = reference.getSequenceDictionary(); } } @@ -525,6 +524,12 @@ private void validateHeader(final SAMFileHeader fileHeader) { } if (fileHeader.getSequenceDictionary().isEmpty()) { sequenceDictionaryEmptyAndNoWarningEmitted = true; + } 
else { + if (samSequenceDictionary != null) { + if (!fileHeader.getSequenceDictionary().isSameDictionary(samSequenceDictionary)) { + addError(new SAMValidationError(Type.MISMATCH_FILE_SEQ_DICT, "Mismatch between file and sequence dictionary", null)); + } + } } if (fileHeader.getReadGroups().isEmpty()) { addError(new SAMValidationError(Type.MISSING_READ_GROUP, "Read groups is empty", null)); @@ -540,7 +545,7 @@ private void validateHeader(final SAMFileHeader fileHeader) { } final List rgs = fileHeader.getReadGroups(); - final Set readGroupIDs = new HashSet(); + final Set readGroupIDs = new HashSet<>(); for (final SAMReadGroupRecord record : rgs) { final String readGroupID = record.getReadGroupId(); @@ -692,11 +697,10 @@ public PairEndInfo(final SAMRecord record, final long recordNumber) { this.firstOfPairFlag = record.getFirstOfPairFlag(); } - private PairEndInfo(int readAlignmentStart, int readReferenceIndex, boolean readNegStrandFlag, boolean readUnmappedFlag, - String readCigarString, - int mateAlignmentStart, int mateReferenceIndex, boolean mateNegStrandFlag, boolean mateUnmappedFlag, - String mateCigarString, - boolean firstOfPairFlag, long recordNumber) { + private PairEndInfo(final int readAlignmentStart, final int readReferenceIndex, final boolean readNegStrandFlag, final boolean readUnmappedFlag, + final String readCigarString, + final int mateAlignmentStart, final int mateReferenceIndex, final boolean mateNegStrandFlag, final boolean mateUnmappedFlag, + final String mateCigarString, final boolean firstOfPairFlag, final long recordNumber) { this.readAlignmentStart = readAlignmentStart; this.readReferenceIndex = readReferenceIndex; this.readNegStrandFlag = readNegStrandFlag; @@ -712,7 +716,7 @@ private PairEndInfo(int readAlignmentStart, int readReferenceIndex, boolean read } public List validateMates(final PairEndInfo mate, final String readName) { - final List errors = new ArrayList(); + final List errors = new ArrayList<>(); validateMateFields(this, mate, 
readName, errors); validateMateFields(mate, this, readName, errors); // Validations that should not be repeated on both ends @@ -789,7 +793,7 @@ interface PairEndInfoMap extends Iterable> { private class CoordinateSortedPairEndInfoMap implements PairEndInfoMap { private final CoordinateSortedPairInfoMap onDiskMap = - new CoordinateSortedPairInfoMap(maxTempFiles, new Codec()); + new CoordinateSortedPairInfoMap<>(maxTempFiles, new Codec()); @Override public void put(int mateReferenceIndex, String key, PairEndInfo value) { @@ -877,7 +881,7 @@ public Map.Entry decode() { } private static class InMemoryPairEndInfoMap implements PairEndInfoMap { - private final Map map = new HashMap(); + private final Map map = new HashMap<>(); @Override public void put(int mateReferenceIndex, String key, PairEndInfo value) { diff --git a/src/main/java/htsjdk/samtools/util/AbstractBlockCompressedOutputStream.java b/src/main/java/htsjdk/samtools/util/AbstractBlockCompressedOutputStream.java new file mode 100644 index 0000000000..252d04de41 --- /dev/null +++ b/src/main/java/htsjdk/samtools/util/AbstractBlockCompressedOutputStream.java @@ -0,0 +1,288 @@ +/* + * The MIT License + * + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package htsjdk.samtools.util; + +import htsjdk.samtools.Defaults; +import htsjdk.samtools.util.zip.DeflaterFactory; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.file.Files; +import java.util.zip.Deflater; + +import static htsjdk.samtools.util.BlockCompressedStreamConstants.DEFAULT_COMPRESSION_LEVEL; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_ID1; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_ID2; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_XLEN; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_XFL; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_FLG; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_CM_DEFLATE; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.GZIP_OS_UNKNOWN; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.BGZF_ID1; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.BGZF_ID2; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.BGZF_LEN; +import static htsjdk.samtools.util.BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH; +import static 
htsjdk.samtools.util.BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK; + +/** + * Writer for a file that is a series of gzip blocks (BGZF format). The caller just treats it as an + * OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten + * bytes reaches a threshold. + * + * The advantage of BGZF over conventional gzip is that BGZF allows for seeking without having to scan through + * the entire file up to the position being sought. + * + * Note that the flush() method should not be called by client + * unless you know what you're doing, because it forces a gzip block to be written even if the + * number of buffered bytes has not reached threshold. close(), on the other hand, must be called + * when done writing in order to force the last gzip block to be written. + * + * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF file format. + */ +public abstract class AbstractBlockCompressedOutputStream extends OutputStream implements LocationAware { + + protected static DeflaterFactory defaultDeflaterFactory = new DeflaterFactory(); + protected static int defaultCompressionLevel = DEFAULT_COMPRESSION_LEVEL; + + protected final BinaryCodec codec; + protected final byte[] uncompressedBuffer = new byte[DEFAULT_UNCOMPRESSED_BLOCK_SIZE]; + protected int numUncompressedBytes = 0; + protected long mBlockAddress = 0; + protected File file = null; + + // Really a local variable, but allocate once to reduce GC burden. + protected final byte[] singleByteArray = new byte[1]; + + + /** + * Sets the default {@link DeflaterFactory} that will be used for all instances unless specified otherwise in the constructor. + * If this method is not called the default is a factory that will create the JDK {@link Deflater}. + * @param deflaterFactory non-null default factory. 
+ */ + public static void setDefaultDeflaterFactory(final DeflaterFactory deflaterFactory) { + if (deflaterFactory == null) { + throw new IllegalArgumentException("null deflaterFactory"); + } + defaultDeflaterFactory = deflaterFactory; + } + + public static DeflaterFactory getDefaultDeflaterFactory() { + return defaultDeflaterFactory; + } + + /** + * Sets the GZip compression level for subsequent BlockCompressedOutputStream object creation + * that do not specify the compression level. + * @param compressionLevel 1 <= compressionLevel <= 9 + */ + public static void setDefaultCompressionLevel(final int compressionLevel) { + if (compressionLevel < Deflater.NO_COMPRESSION || compressionLevel > Deflater.BEST_COMPRESSION) { + throw new IllegalArgumentException("Invalid compression level: " + compressionLevel); + } + defaultCompressionLevel = compressionLevel; + } + + public static int getDefaultCompressionLevel() { + return defaultCompressionLevel; + } + + /** + * Prepare to compress at the given compression level + * @param file file to output + */ + public AbstractBlockCompressedOutputStream(final File file) { + this.file = file; + codec = new BinaryCodec(file, true); + } + + + /** + * Prepare to compress at the given compression level + * @param os output stream opened on the file + * @param file file to output + */ + public AbstractBlockCompressedOutputStream(final OutputStream os, final File file) { + this.file = file; + codec = new BinaryCodec(os); + if (file != null) { + codec.setOutputFileName(file.getAbsolutePath()); + } + } + + /** + * Create new {@link AbstractBlockCompressedOutputStream} or return exactly the same if it already the {@link AbstractBlockCompressedOutputStream} + * @param location May be null. Used for error messages, and for checking file termination. + * @param output May or not already be a BlockCompressedOutputStream. 
+ * @return A BlockCompressedOutputStream, either by wrapping the given OutputStream, or by casting if it already + * is a BCOS. + */ + public static AbstractBlockCompressedOutputStream maybeBgzfWrapOutputStream(final File location, OutputStream output) { + if (!(output instanceof AbstractBlockCompressedOutputStream)) { + return Defaults.ZIP_THREADS > 0 + ? new ParallelBlockCompressedOutputStream(output, location) + : new BlockCompressedOutputStream(output, location); + } else { + return (AbstractBlockCompressedOutputStream)output; + } + } + + /** + * Writes b.length bytes from the specified byte array to this output stream. The general contract for write(b) + * is that it should have exactly the same effect as the call write(b, 0, b.length). + * @param bytes the data + */ + @Override + public void write(final byte[] bytes) throws IOException { + write(bytes, 0, bytes.length); + } + + /** + * Writes len bytes from the specified byte array starting at offset off to this output stream. The general + * contract for write(b, off, len) is that some of the bytes in the array b are written to the output stream in order; + * element b[off] is the first byte written and b[off+len-1] is the last byte written by this operation. 
+ * + * @param bytes the data + * @param startIndex the start offset in the data + * @param numBytes the number of bytes to write + */ + @Override + public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException { + assert(numUncompressedBytes < uncompressedBuffer.length); + while (numBytes > 0) { + final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes); + System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite); + numUncompressedBytes += bytesToWrite; + startIndex += bytesToWrite; + numBytes -= bytesToWrite; + assert(numBytes >= 0); + if (numUncompressedBytes == uncompressedBuffer.length) { + deflateBlock(); + } + } + } + + /** + * WARNING: flush() affects the output format, because it causes the current contents of uncompressedBuffer + * to be compressed and written, even if it isn't full. Unless you know what you're doing, don't call flush(). + * Instead, call close(), which will flush any unwritten data before closing the underlying stream. + * + */ + @Override + public void flush() throws IOException { + while (numUncompressedBytes > 0) { + deflateBlock(); + } + codec.getOutputStream().flush(); + } + + /** + * close() must be called in order to flush any remaining buffered bytes. An unclosed file will likely be + * defective. + * + */ + @Override + public void close() throws IOException { + flush(); + // For debugging... + // if (numberOfThrottleBacks > 0) { + // System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks + + // " times for file " + codec.getOutputFileName()); + // } + codec.writeBytes(EMPTY_GZIP_BLOCK); + codec.close(); + // Can't re-open something that is not a regular file, e.g. 
a named pipe or an output stream + if (this.file == null || !this.file.isFile() || !Files.isRegularFile(this.file.toPath())) return; + if (BlockCompressedInputStream.checkTermination(this.file) != + BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) { + throw new IOException("Terminator block not found after closing BGZF file " + this.file); + } + } + + /** + * Writes the specified byte to this output stream. The general contract for write is that one byte is written + * to the output stream. The byte to be written is the eight low-order bits of the argument b. + * The 24 high-order bits of b are ignored. + * @param bite + * @throws IOException + */ + @Override + public void write(final int bite) throws IOException { + singleByteArray[0] = (byte)bite; + write(singleByteArray); + } + + /** Encode virtual file pointer + * Upper 48 bits is the byte offset into the compressed stream of a block. + * Lower 16 bits is the byte offset into the uncompressed stream inside the block. + */ + public long getFilePointer(){ + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, numUncompressedBytes); + } + + @Override + public long getPosition() { + return getFilePointer(); + } + + /** + * Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block. + * If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount + * of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked + * up in the next deflate event. + * @return size of gzip block that was written. + */ + protected abstract int deflateBlock(); + + /** + * Writes the entire gzip block, assuming the compressed data is stored in the supplied compressedBuffer + * @return size of gzip block that was written. 
+ */ + protected int writeGzipBlock(final byte[] compressedBuffer, final int compressedSize, final int uncompressedSize, final long crc) { + // Init gzip header + codec.writeByte(GZIP_ID1); + codec.writeByte(GZIP_ID2); + codec.writeByte(GZIP_CM_DEFLATE); + codec.writeByte(GZIP_FLG); + codec.writeInt(0); // Modification time + codec.writeByte(GZIP_XFL); + codec.writeByte(GZIP_OS_UNKNOWN); + codec.writeShort(GZIP_XLEN); + codec.writeByte(BGZF_ID1); + codec.writeByte(BGZF_ID2); + codec.writeShort(BGZF_LEN); + final int totalBlockSize = compressedSize + BLOCK_HEADER_LENGTH + + BLOCK_FOOTER_LENGTH; + + // I don't know why we store block size - 1, but that is what the spec says + codec.writeShort((short)(totalBlockSize - 1)); + codec.writeBytes(compressedBuffer, 0, compressedSize); + codec.writeInt((int)crc); + codec.writeInt(uncompressedSize); + return totalBlockSize; + } +} diff --git a/src/main/java/htsjdk/samtools/util/AsyncBufferedIterator.java b/src/main/java/htsjdk/samtools/util/AsyncBufferedIterator.java index bf78ecb9ce..1998ed4039 100644 --- a/src/main/java/htsjdk/samtools/util/AsyncBufferedIterator.java +++ b/src/main/java/htsjdk/samtools/util/AsyncBufferedIterator.java @@ -228,13 +228,13 @@ private void backgroundRun() { } } /** - * Block of records from the underlying iterator + * Block of records from the underlying iterator */ private static class IteratorBuffer implements Iterator { private final Throwable exception; private final Iterator it; public IteratorBuffer(Iterable it) { - this.it = it != null ? it.iterator() : null;; + this.it = it != null ? 
it.iterator() : null; this.exception = null; } diff --git a/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java b/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java index 4e9a594875..b79054f705 100644 --- a/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java +++ b/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java @@ -26,71 +26,19 @@ import htsjdk.samtools.util.zip.DeflaterFactory; import java.io.File; -import java.io.IOException; import java.io.OutputStream; import java.util.zip.CRC32; import java.util.zip.Deflater; /** - * Writer for a file that is a series of gzip blocks (BGZF format). The caller just treats it as an - * OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten - * bytes reaches a threshold. + * Single-threaded implementation of {@link AbstractBlockCompressedOutputStream}. + * Each block is compressed one-by-one, synchronously. * - * The advantage of BGZF over conventional gzip is that BGZF allows for seeking without having to scan through - * the entire file up to the position being sought. - * - * Note that the flush() method should not be called by client - * unless you know what you're doing, because it forces a gzip block to be written even if the - * number of buffered bytes has not reached threshold. close(), on the other hand, must be called - * when done writing in order to force the last gzip block to be written. - * - * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF file format. 
+ * @see AbstractBlockCompressedOutputStream */ public class BlockCompressedOutputStream - extends OutputStream - implements LocationAware -{ - - private static final Log log = Log.getInstance(BlockCompressedOutputStream.class); - - private static int defaultCompressionLevel = BlockCompressedStreamConstants.DEFAULT_COMPRESSION_LEVEL; - private static DeflaterFactory defaultDeflaterFactory = new DeflaterFactory(); - - /** - * Sets the GZip compression level for subsequent BlockCompressedOutputStream object creation - * that do not specify the compression level. - * @param compressionLevel 1 <= compressionLevel <= 9 - */ - public static void setDefaultCompressionLevel(final int compressionLevel) { - if (compressionLevel < Deflater.NO_COMPRESSION || compressionLevel > Deflater.BEST_COMPRESSION) { - throw new IllegalArgumentException("Invalid compression level: " + compressionLevel); - } - defaultCompressionLevel = compressionLevel; - } - - public static int getDefaultCompressionLevel() { - return defaultCompressionLevel; - } - - /** - * Sets the default {@link DeflaterFactory} that will be used for all instances unless specified otherwise in the constructor. - * If this method is not called the default is a factory that will create the JDK {@link Deflater}. - * @param deflaterFactory non-null default factory. 
- */ - public static void setDefaultDeflaterFactory(final DeflaterFactory deflaterFactory) { - if (deflaterFactory == null) { - throw new IllegalArgumentException("null deflaterFactory"); - } - defaultDeflaterFactory = deflaterFactory; - } - - public static DeflaterFactory getDefaultDeflaterFactory() { - return defaultDeflaterFactory; - } + extends AbstractBlockCompressedOutputStream { - private final BinaryCodec codec; - private final byte[] uncompressedBuffer = new byte[BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE]; - private int numUncompressedBytes = 0; private final byte[] compressedBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; @@ -110,12 +58,6 @@ public static DeflaterFactory getDefaultDeflaterFactory() { // so just use JDK standard. private final Deflater noCompressionDeflater = new Deflater(Deflater.NO_COMPRESSION, true); private final CRC32 crc32 = new CRC32(); - private File file = null; - private long mBlockAddress = 0; - - - // Really a local variable, but allocate once to reduce GC burden. 
- private final byte[] singleByteArray = new byte[1]; /** * Uses default compression level, which is 5 unless changed by setCompressionLevel @@ -160,10 +102,8 @@ public BlockCompressedOutputStream(final File file, final int compressionLevel) * @param deflaterFactory custom factory to create deflaters (overrides the default) */ public BlockCompressedOutputStream(final File file, final int compressionLevel, final DeflaterFactory deflaterFactory) { - this.file = file; - codec = new BinaryCodec(file, true); + super(file); deflater = deflaterFactory.makeDeflater(compressionLevel, true); - log.debug("Using deflater: " + deflater.getClass().getSimpleName()); } /** @@ -193,136 +133,12 @@ public BlockCompressedOutputStream(final OutputStream os, final File file, final * @param deflaterFactory custom factory to create deflaters (overrides the default) */ public BlockCompressedOutputStream(final OutputStream os, final File file, final int compressionLevel, final DeflaterFactory deflaterFactory) { - this.file = file; - codec = new BinaryCodec(os); - if (file != null) { - codec.setOutputFileName(file.getAbsolutePath()); - } + super(os, file); deflater = deflaterFactory.makeDeflater(compressionLevel, true); - log.debug("Using deflater: " + deflater.getClass().getSimpleName()); - } - - /** - * - * @param location May be null. Used for error messages, and for checking file termination. - * @param output May or not already be a BlockCompressedOutputStream. - * @return A BlockCompressedOutputStream, either by wrapping the given OutputStream, or by casting if it already - * is a BCOS. - */ - public static BlockCompressedOutputStream maybeBgzfWrapOutputStream(final File location, OutputStream output) { - if (!(output instanceof BlockCompressedOutputStream)) { - return new BlockCompressedOutputStream(output, location); - } else { - return (BlockCompressedOutputStream)output; - } } - /** - * Writes b.length bytes from the specified byte array to this output stream. 
The general contract for write(b) - * is that it should have exactly the same effect as the call write(b, 0, b.length). - * @param bytes the data - */ - @Override - public void write(final byte[] bytes) throws IOException { - write(bytes, 0, bytes.length); - } - - /** - * Writes len bytes from the specified byte array starting at offset off to this output stream. The general - * contract for write(b, off, len) is that some of the bytes in the array b are written to the output stream in order; - * element b[off] is the first byte written and b[off+len-1] is the last byte written by this operation. - * - * @param bytes the data - * @param startIndex the start offset in the data - * @param numBytes the number of bytes to write - */ @Override - public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException { - assert(numUncompressedBytes < uncompressedBuffer.length); - while (numBytes > 0) { - final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes); - System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite); - numUncompressedBytes += bytesToWrite; - startIndex += bytesToWrite; - numBytes -= bytesToWrite; - assert(numBytes >= 0); - if (numUncompressedBytes == uncompressedBuffer.length) { - deflateBlock(); - } - } - } - - /** - * WARNING: flush() affects the output format, because it causes the current contents of uncompressedBuffer - * to be compressed and written, even if it isn't full. Unless you know what you're doing, don't call flush(). - * Instead, call close(), which will flush any unwritten data before closing the underlying stream. - * - */ - @Override - public void flush() throws IOException { - while (numUncompressedBytes > 0) { - deflateBlock(); - } - codec.getOutputStream().flush(); - } - - /** - * close() must be called in order to flush any remaining buffered bytes. An unclosed file will likely be - * defective. 
- * - */ - @Override - public void close() throws IOException { - flush(); - // For debugging... - // if (numberOfThrottleBacks > 0) { - // System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks + - // " times for file " + codec.getOutputFileName()); - // } - codec.writeBytes(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); - codec.close(); - // Can't re-open something that is not a regular file, e.g. a named pipe or an output stream - if (this.file == null || !this.file.isFile()) return; - if (BlockCompressedInputStream.checkTermination(this.file) != - BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) { - throw new IOException("Terminator block not found after closing BGZF file " + this.file); - } - } - - /** - * Writes the specified byte to this output stream. The general contract for write is that one byte is written - * to the output stream. The byte to be written is the eight low-order bits of the argument b. - * The 24 high-order bits of b are ignored. - * @param bite - * @throws IOException - */ - @Override - public void write(final int bite) throws IOException { - singleByteArray[0] = (byte)bite; - write(singleByteArray); - } - - /** Encode virtual file pointer - * Upper 48 bits is the byte offset into the compressed stream of a block. - * Lower 16 bits is the byte offset into the uncompressed stream inside the block. - */ - public long getFilePointer(){ - return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, numUncompressedBytes); - } - - @Override - public long getPosition() { - return getFilePointer(); - } - - /** - * Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block. - * If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount - * of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked - * up in the next deflate event. - * @return size of gzip block that was written. 
- */ - private int deflateBlock() { + protected int deflateBlock() { if (numUncompressedBytes == 0) { return 0; } @@ -348,8 +164,8 @@ private int deflateBlock() { crc32.reset(); crc32.update(uncompressedBuffer, 0, bytesToCompress); - final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue()); - assert(bytesToCompress <= numUncompressedBytes); + final int totalBlockSize = writeGzipBlock(this.compressedBuffer, compressedSize, bytesToCompress, crc32.getValue()); + assert (bytesToCompress <= numUncompressedBytes); // Clear out from uncompressedBuffer the data that was written if (bytesToCompress == numUncompressedBytes) { @@ -363,31 +179,4 @@ private int deflateBlock() { return totalBlockSize; } - /** - * Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer - * @return size of gzip block that was written. - */ - private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) { - // Init gzip header - codec.writeByte(BlockCompressedStreamConstants.GZIP_ID1); - codec.writeByte(BlockCompressedStreamConstants.GZIP_ID2); - codec.writeByte(BlockCompressedStreamConstants.GZIP_CM_DEFLATE); - codec.writeByte(BlockCompressedStreamConstants.GZIP_FLG); - codec.writeInt(0); // Modification time - codec.writeByte(BlockCompressedStreamConstants.GZIP_XFL); - codec.writeByte(BlockCompressedStreamConstants.GZIP_OS_UNKNOWN); - codec.writeShort(BlockCompressedStreamConstants.GZIP_XLEN); - codec.writeByte(BlockCompressedStreamConstants.BGZF_ID1); - codec.writeByte(BlockCompressedStreamConstants.BGZF_ID2); - codec.writeShort(BlockCompressedStreamConstants.BGZF_LEN); - final int totalBlockSize = compressedSize + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH + - BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH; - - // I don't know why we store block size - 1, but that is what the spec says - codec.writeShort((short)(totalBlockSize - 1)); - codec.writeBytes(compressedBuffer, 0, 
compressedSize); - codec.writeInt((int)crc); - codec.writeInt(uncompressedSize); - return totalBlockSize; - } } diff --git a/src/main/java/htsjdk/samtools/util/ParallelBlockCompressedOutputStream.java b/src/main/java/htsjdk/samtools/util/ParallelBlockCompressedOutputStream.java new file mode 100644 index 0000000000..31e0b737e3 --- /dev/null +++ b/src/main/java/htsjdk/samtools/util/ParallelBlockCompressedOutputStream.java @@ -0,0 +1,312 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package htsjdk.samtools.util; + +import htsjdk.samtools.Defaults; +import htsjdk.samtools.util.zip.DeflaterFactory; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.function.Supplier; +import java.util.zip.CRC32; +import java.util.zip.Deflater; + +/** + * Parallel implementation of BAM file compression. + *

+ * The main idea is to compress blocks of a BAM file asynchronously. After the next 64K block is filled, + * it is sent to the first available processor core, the future result is put into a buffer, and filling of the next 64K block continues. + * When the buffer of futures with compressed blocks is full, the next writing task is submitted. + *

+ + * @see AbstractBlockCompressedOutputStream + */ +public class ParallelBlockCompressedOutputStream extends AbstractBlockCompressedOutputStream { + + private static final ExecutorService gzipExecutorService = Executors.newFixedThreadPool( + Defaults.ZIP_THREADS, + r -> { + Thread t = Executors.defaultThreadFactory().newThread(r); + t.setDaemon(true); + return t; + } + ); + + private static final Void NOTHING = null; + private static final int BLOCKS_PACK_SIZE = 16 * Defaults.ZIP_THREADS; + + private DeflaterFactory deflaterFactory; + private final int compressionLevel; + + List<Future<CompressedBlock>> compressedBlocksInFuture = new ArrayList<>(BLOCKS_PACK_SIZE); + private CompletableFuture<Void> writeBlocksTask = CompletableFuture.completedFuture(NOTHING); + + + public ParallelBlockCompressedOutputStream(final String filename) { + this(filename, defaultCompressionLevel); + } + + /** + * Prepare to compress at the given compression level + * Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}. + * @param compressionLevel 1 <= compressionLevel <= 9 + */ + public ParallelBlockCompressedOutputStream(final String filename, final int compressionLevel) { + this(new File(filename), compressionLevel); + } + + /** + * Uses default compression level, which is 5 unless changed by setCompressionLevel + * Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}. + * Use {@link #ParallelBlockCompressedOutputStream(File, int, DeflaterFactory)} to specify a custom factory. + */ + public ParallelBlockCompressedOutputStream(final File file) { + this(file, defaultCompressionLevel); + } + + /** + * Prepare to compress at the given compression level + * @param compressionLevel 1 <= compressionLevel <= 9 + * Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}. 
+ * Use {@link #ParallelBlockCompressedOutputStream(File, int, DeflaterFactory)} to specify a custom factory. + */ + public ParallelBlockCompressedOutputStream(final File file, final int compressionLevel) { + this(file, compressionLevel, defaultDeflaterFactory); + } + + /** + * Prepare to compress at the given compression level + * @param compressionLevel 1 <= compressionLevel <= 9 + * @param deflaterFactory custom factory to create deflaters (overrides the default) + */ + public ParallelBlockCompressedOutputStream(final File file, final int compressionLevel, final DeflaterFactory deflaterFactory) { + super(file); + this.deflaterFactory = deflaterFactory; + this.compressionLevel = compressionLevel; + } + + /** + * Uses default compression level, which is 5 unless changed by setCompressionLevel + * Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}. + * Use {@link #ParallelBlockCompressedOutputStream(OutputStream, File, int, DeflaterFactory)} to specify a custom factory. + * + * @param file may be null + */ + public ParallelBlockCompressedOutputStream(final OutputStream os, final File file) { + this(os, file, defaultCompressionLevel); + } + + /** + * Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}. + * Use {@link #ParallelBlockCompressedOutputStream(OutputStream, File, int, DeflaterFactory)} to specify a custom factory. + */ + public ParallelBlockCompressedOutputStream(final OutputStream os, final File file, final int compressionLevel) { + this(os, file, compressionLevel, defaultDeflaterFactory); + } + + /** + * Creates the output stream. 
+ * @param os output stream to create a BlockCompressedOutputStream from + * @param file file to which to write the output or null if not available + * @param compressionLevel the compression level (0-9) + * @param deflaterFactory custom factory to create deflaters (overrides the default) + */ + public ParallelBlockCompressedOutputStream(final OutputStream os, final File file, final int compressionLevel, + DeflaterFactory deflaterFactory) { + super(os, file); + this.deflaterFactory = deflaterFactory; + this.compressionLevel = compressionLevel; + } + + @Override + public void flush() throws IOException { + while (numUncompressedBytes > 0) { + deflateBlock(); + } + submitSpillTask(); + writeBlocksTask.join(); + // all deflate and writing tasks have completed at this point, so it is safe to flush the underlying stream + codec.getOutputStream().flush(); + } + + /** + * @see AbstractBlockCompressedOutputStream#getFilePointer() + * This implementation of getFilePointer is blocking, because we have to wait until all + * provided data has been written in order to return the correct pointer. + */ + @Override + public long getFilePointer() { + // we have to wait for all writing tasks, otherwise we risk getting a wrong mBlockAddress + submitSpillTask(); + writeBlocksTask.join(); + return super.getFilePointer(); + } + + /** + * Submits a new deflate task for the current uncompressed byte buffer. 
+ */ + @Override + protected int deflateBlock() { + if (numUncompressedBytes == 0) { + return 0; + } + submitDeflateTask(); + numUncompressedBytes = 0; + + if (compressedBlocksInFuture.size() == BLOCKS_PACK_SIZE) { + submitSpillTask(); + } + + return 0; + } + + void submitDeflateTask() { + UncompressedBlock uncompressedBlock = new UncompressedBlock( + Arrays.copyOf(this.uncompressedBuffer, numUncompressedBytes), + numUncompressedBytes + ); + compressedBlocksInFuture.add( + CompletableFuture.supplyAsync(new BlockZipper(uncompressedBlock), gzipExecutorService) + ); + } + + void submitSpillTask() { + writeBlocksTask.join(); + writeBlocksTask = CompletableFuture.runAsync(new BlocksSpiller(compressedBlocksInFuture), gzipExecutorService); + compressedBlocksInFuture = new ArrayList<>(BLOCKS_PACK_SIZE); + } + + /** + * Inner class representing a task that writes out the compressed blocks passed to it. + * */ + private class BlocksSpiller implements Runnable { + + private List<Future<CompressedBlock>> compressedBlocks; + + BlocksSpiller(List<Future<CompressedBlock>> compressedBlocks) { + this.compressedBlocks = compressedBlocks; + } + + @Override + public void run() { + for (Future<CompressedBlock> compressedBlockFuture : compressedBlocks) { + try { + CompressedBlock compressedBlock = compressedBlockFuture.get(); + mBlockAddress += writeGzipBlock( + compressedBlock.compressedBuffer, + compressedBlock.compressedSize, + compressedBlock.bytesToCompress, + compressedBlock.crc32Value + ); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeIOException("Unable to write Gzip block", e); + } + } + } + } + + + /** + * Inner class representing a task that compresses the uncompressed block passed to it. + * */ + private class BlockZipper implements Supplier<CompressedBlock> { + + private UncompressedBlock block; + + BlockZipper(UncompressedBlock block) { + this.block = block; + } + + @Override + public CompressedBlock get() { + return compressBlock(block); + + } + + private CompressedBlock compressBlock(UncompressedBlock uncompressedBlock) { + final CRC32 crc32 = new 
CRC32(); + final Deflater deflater = deflaterFactory.makeDeflater(compressionLevel, true); + + final byte[] compressedBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE - + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + // Compress the input + deflater.setInput(uncompressedBlock.uncompressedBuffer, 0, uncompressedBlock.bytesToCompress); + deflater.finish(); + int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length); + + // If it didn't all fit in uncompressedBuffer.length, set compression level to NO_COMPRESSION + // and try again. This should always fit. + if (!deflater.finished()) { + final Deflater noCompressionDeflater = deflaterFactory.makeDeflater(Deflater.NO_COMPRESSION, true); + noCompressionDeflater.setInput(uncompressedBlock.uncompressedBuffer, 0, uncompressedBlock.bytesToCompress); + noCompressionDeflater.finish(); + compressedSize = noCompressionDeflater.deflate(compressedBuffer, 0, compressedBuffer.length); + if (!noCompressionDeflater.finished()) { + throw new IllegalStateException("unpossible"); + } + } + + // Data compressed small enough, so write it out. 
+ crc32.update(uncompressedBlock.uncompressedBuffer, 0, uncompressedBlock.bytesToCompress); + return new CompressedBlock(compressedBuffer, compressedSize, uncompressedBlock.bytesToCompress, crc32.getValue()); + } + + } + + static class UncompressedBlock { + + byte[] uncompressedBuffer; + int bytesToCompress; + + UncompressedBlock(byte[] uncompressedBuffer, int bytesToCompress) { + this.uncompressedBuffer = uncompressedBuffer; + this.bytesToCompress = bytesToCompress; + } + + } + static class CompressedBlock { + + byte[] compressedBuffer; + int compressedSize; + int bytesToCompress; + long crc32Value; + + CompressedBlock(byte[] compressedBuffer, int compressedSize, int bytesToCompress, long crc32Value) { + this.compressedBuffer = compressedBuffer; + this.compressedSize = compressedSize; + this.bytesToCompress = bytesToCompress; + this.crc32Value = crc32Value; + } + + } +} diff --git a/src/test/java/htsjdk/samtools/SAMFileHeaderTest.java b/src/test/java/htsjdk/samtools/SAMFileHeaderTest.java new file mode 100644 index 0000000000..0723ed9e4c --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMFileHeaderTest.java @@ -0,0 +1,64 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Nils Homer + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SAMFileHeaderTest extends HtsjdkTest { + + @Test + public void testSortOrder() { + final SAMFileHeader header = new SAMFileHeader(); + + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getAttribute(SAMFileHeader.SORT_ORDER_TAG), SAMFileHeader.SortOrder.coordinate.name()); + + header.setAttribute(SAMFileHeader.SORT_ORDER_TAG, SAMFileHeader.SortOrder.queryname.name()); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.queryname); + Assert.assertEquals(header.getAttribute(SAMFileHeader.SORT_ORDER_TAG), SAMFileHeader.SortOrder.queryname.name()); + + header.setAttribute(SAMFileHeader.SORT_ORDER_TAG, SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getAttribute(SAMFileHeader.SORT_ORDER_TAG), SAMFileHeader.SortOrder.coordinate.name()); + } + + @Test + public void testGroupOrder() { + final SAMFileHeader header = new SAMFileHeader(); + + header.setGroupOrder(SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getGroupOrder(), SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getAttribute(SAMFileHeader.GROUP_ORDER_TAG), SAMFileHeader.GroupOrder.query.name()); + + header.setAttribute(SAMFileHeader.GROUP_ORDER_TAG, 
SAMFileHeader.GroupOrder.reference.name()); + Assert.assertEquals(header.getGroupOrder(), SAMFileHeader.GroupOrder.reference); + Assert.assertEquals(header.getAttribute(SAMFileHeader.GROUP_ORDER_TAG), SAMFileHeader.GroupOrder.reference.name()); + + header.setAttribute(SAMFileHeader.GROUP_ORDER_TAG, SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getGroupOrder(), SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getAttribute(SAMFileHeader.GROUP_ORDER_TAG), SAMFileHeader.GroupOrder.query.name()); + } +} diff --git a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java index 1bfe263457..5fa35f3e9d 100644 --- a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java +++ b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java @@ -463,7 +463,7 @@ public void test_setAttribute_null_removes_tag() { } private SAMRecord createTestRecordHelper() { - return new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "3S9M", null, 2); + return new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "3S33M", null, 2); } @Test diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java index 8b17630671..a8e60ed501 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java @@ -39,6 +39,7 @@ import java.io.StringWriter; import java.util.Arrays; import java.util.Collections; +import java.util.List; public class SAMSequenceDictionaryTest extends HtsjdkTest { @Test @@ -142,4 +143,27 @@ public void testMergeDictionaries(final SAMSequenceRecord rec1, final SAMSequenc throw new Exception("Expected to not be able to merge dictionaries, but was able"); } } + + @DataProvider + public Object[][] testIsSameDictionaryData() { + + final SAMSequenceRecord rec1, rec2; + rec1 = new SAMSequenceRecord("chr1", 100); + rec2 = new SAMSequenceRecord("chr2", 101); 
+ + return new Object[][]{ + new Object[]{Arrays.asList(rec1), Arrays.asList(rec1), true}, + new Object[]{Arrays.asList(rec1), Arrays.asList(rec2), false}, + new Object[]{Arrays.asList(rec1, rec2), Arrays.asList(rec1), false} + }; + } + + @Test(dataProvider = "testIsSameDictionaryData") + public void testIsSameDictionary(final List recs1, final List recs2, final boolean isSameDictionary) { + + final SAMSequenceDictionary dict1 = new SAMSequenceDictionary(recs1); + final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(recs2); + + Assert.assertEquals(dict1.isSameDictionary(dict2), isSameDictionary); + } } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java index 1035d1b10e..e0c73d5026 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java @@ -24,8 +24,11 @@ package htsjdk.samtools; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.Arrays; + /** * Test for SAMReadGroupRecordTest */ @@ -33,10 +36,50 @@ public class SAMSequenceRecordTest { @Test public void testGetSAMString() { - SAMSequenceRecord r = new SAMSequenceRecord("chr5_but_without_a_prefix", 271828); + final SAMSequenceRecord r = new SAMSequenceRecord("chr5_but_without_a_prefix", 271828); r.setSpecies("Psephophorus terrypratchetti"); r.setAssembly("GRCt01"); r.setMd5("7a6dd3d307de916b477e7bf304ac22bc"); Assert.assertEquals("@SQ\tSN:chr5_but_without_a_prefix\tLN:271828\tSP:Psephophorus terrypratchetti\tAS:GRCt01\tM5:7a6dd3d307de916b477e7bf304ac22bc", r.getSAMString()); } + + @DataProvider + public Object[][] testIsSameSequenceData() { + final SAMSequenceRecord rec1 = new SAMSequenceRecord("chr1", 100); + final SAMSequenceRecord rec2 = new SAMSequenceRecord("chr2", 101); + final SAMSequenceRecord rec3 = new SAMSequenceRecord("chr3", 0); + final SAMSequenceRecord rec4 = new 
SAMSequenceRecord("chr1", 100); + + final String md5One = "1"; + final String md5Two = "2"; + final int index1 = 1; + final int index2 = 2; + + return new Object[][]{ + new Object[]{rec1, rec1, md5One, md5One, index1, index1, true}, + new Object[]{rec1, null, md5One, md5One, index1, index1, false}, + new Object[]{rec1, rec4, md5One, md5One, index1, index1, true}, + new Object[]{rec1, rec4, md5One, md5One, index1, index2, false}, + new Object[]{rec1, rec3, md5One, md5Two, index1, index1, false}, + new Object[]{rec1, rec2, md5One, md5Two, index1, index1, false}, + new Object[]{rec1, rec4, md5One, null, index1, index1, true}, + new Object[]{rec1, rec4, null, md5One, index1, index1, true}, + new Object[]{rec1, rec4, md5One, md5One, index1, index2, false} + }; + } + + @Test(dataProvider = "testIsSameSequenceData") + public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceRecord rec2, final String md5One, final String md5Two, + final int index1, final int index2, final boolean isSame) { + if (rec2 != null) { + rec2.setMd5(md5Two); + rec2.setSequenceIndex(index2); + } + + if (rec1 != null) { + rec1.setMd5(md5One); + rec1.setSequenceIndex(index1); + Assert.assertEquals(rec1.isSameSequence(rec2), isSame); + } + } } diff --git a/src/test/java/htsjdk/samtools/ValidateSamFileTest.java b/src/test/java/htsjdk/samtools/ValidateSamFileTest.java index 292758b8c6..8aac6e2e3c 100644 --- a/src/test/java/htsjdk/samtools/ValidateSamFileTest.java +++ b/src/test/java/htsjdk/samtools/ValidateSamFileTest.java @@ -147,6 +147,7 @@ public void testUnpairedRecords() throws IOException { Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_FLAG_FIRST_OF_PAIR.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_FLAG_SECOND_OF_PAIR.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_MATE_REF_INDEX.getHistogramString()).getValue(), 1.0); + 
Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_UNPAIRED_MATE_REFERENCE.getHistogramString()).getValue(), 1.0); } @Test @@ -173,6 +174,7 @@ public void testPairedRecords() throws IOException { Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_FLAG_MATE_UNMAPPED.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_MATE_ALIGNMENT_START.getHistogramString()).getValue(), 2.0); Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_MATE_REF_INDEX.getHistogramString()).getValue(), 2.0); + Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_UNALIGNED_MATE_START.getHistogramString()).getValue(), 1.0); } @Test(dataProvider = "missingMateTestCases") @@ -232,6 +234,7 @@ public void testMappedRecords() throws IOException { Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_CIGAR.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_FLAG_READ_UNMAPPED.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.MISSING_TAG_NM.getHistogramString()).getValue(), 1.0); + Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_CIGAR_SEQ_LENGTH.getHistogramString()).getValue(), 1.0); } @Test @@ -300,11 +303,10 @@ public void testMateCigarScenarios(final String scenario, final String inputFile throws Exception { final SamReader reader = SamReaderFactory.makeDefault().open(new File(TEST_DATA_DIR, inputFile)); final Histogram results = executeValidation(reader, null, IndexValidationStringency.EXHAUSTIVE); - Assert.assertNotNull(results.get(expectedError.getHistogramString())); - Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0); + Assert.assertNotNull(results.get(expectedError.getHistogramString()), scenario); + Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0, scenario); } - @DataProvider(name = 
"testMateCigarScenarios") public Object[][] testMateCigarScenarios() { return new Object[][]{ @@ -318,8 +320,8 @@ public void testTruncated(final String scenario, final String inputFile, final S throws Exception { final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(new File(TEST_DATA_DIR, inputFile)); final Histogram results = executeValidation(reader, null, IndexValidationStringency.EXHAUSTIVE); - Assert.assertNotNull(results.get(expectedError.getHistogramString())); - Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0); + Assert.assertNotNull(results.get(expectedError.getHistogramString()), scenario); + Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0, scenario); } @DataProvider(name = "testTruncatedScenarios") @@ -400,9 +402,20 @@ public void testRedundantTags() throws Exception { public void testHeaderValidation() throws Exception { final SamReader samReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT) .open(new File(TEST_DATA_DIR, "buggyHeader.sam")); - final Histogram results = executeValidation(samReader, null, IndexValidationStringency.EXHAUSTIVE); + final File referenceFile = new File(TEST_DATA_DIR, "../hg19mini.fasta"); + final ReferenceSequenceFile reference = new FastaSequenceFile(referenceFile, false); + final Histogram results = executeValidation(samReader, reference, IndexValidationStringency.EXHAUSTIVE); Assert.assertEquals(results.get(SAMValidationError.Type.UNRECOGNIZED_HEADER_TYPE.getHistogramString()).getValue(), 3.0); Assert.assertEquals(results.get(SAMValidationError.Type.HEADER_TAG_MULTIPLY_DEFINED.getHistogramString()).getValue(), 1.0); + Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_FILE_SEQ_DICT.getHistogramString()).getValue(), 1.0); + } + + @Test + public void testSeqQualMismatch() throws Exception { + final SamReader samReader = 
SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT) + .open(new File(TEST_DATA_DIR, "seq_qual_len_mismatch.sam")); + final Histogram results = executeValidation(samReader, null, IndexValidationStringency.EXHAUSTIVE); + Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_SEQ_QUAL_LENGTH.getHistogramString()).getValue(), 8.0); } @Test diff --git a/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java b/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java index a33766762b..73859a46a8 100644 --- a/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java +++ b/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java @@ -119,7 +119,7 @@ private SAMRecord buildSAMRecord(String seqName, String line) { @Test public void test3() { - String line1 = "98573 0 20 1 10 40M * 0 0 AAAAAAAAAA !!!!!!!!!!"; + String line1 = "98573 0 20 1 10 10M * 0 0 AAAAAAAAAA !!!!!!!!!!"; String seqName = "20"; byte[] ref = new byte[40]; diff --git a/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java b/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java index a678c8dcae..b378d30dcd 100644 --- a/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java +++ b/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java @@ -24,6 +24,7 @@ package htsjdk.samtools.util; import htsjdk.HtsjdkTest; +import htsjdk.samtools.Defaults; import htsjdk.samtools.FileTruncatedException; import htsjdk.samtools.util.zip.DeflaterFactory; import org.testng.Assert; @@ -50,7 +51,7 @@ public void testBasic() throws Exception { f.deleteOnExit(); final List linesWritten = new ArrayList<>(); System.out.println("Creating file " + f); - final BlockCompressedOutputStream bcos = new BlockCompressedOutputStream(f); + final AbstractBlockCompressedOutputStream bcos = create(f); String s = "Hi, Mom!\n"; bcos.write(s.getBytes()); linesWritten.add(s); 
@@ -132,7 +133,7 @@ public void testSeekReadExceptions(final String filePath, final Class c, final S f.deleteOnExit(); final List linesWritten = new ArrayList<>(); System.out.println("Creating file " + f); - final BlockCompressedOutputStream bcos = new BlockCompressedOutputStream(f); + final AbstractBlockCompressedOutputStream bcos = create(f); Random r = new Random(15555); final int INPUT_SIZE = 64 * 1024; byte[] input = new byte[INPUT_SIZE]; @@ -157,7 +158,7 @@ public void testSeekReadExceptions(final String filePath, final Class c, final S // I don't think this will work on Windows, because /dev/null doesn't work @Test(groups = "broken") public void testDevNull() throws Exception { - final BlockCompressedOutputStream bcos = new BlockCompressedOutputStream("/dev/null"); + final AbstractBlockCompressedOutputStream bcos = create("/dev/null"); bcos.write("Hi, Mom!".getBytes()); bcos.close(); } @@ -188,7 +189,7 @@ public Deflater makeDeflater(final int compressionLevel, final boolean gzipCompa } }; final List linesWritten = new ArrayList<>(); - final BlockCompressedOutputStream bcos = new BlockCompressedOutputStream(f, 5, myDeflaterFactory); + final AbstractBlockCompressedOutputStream bcos = create(f, 5, myDeflaterFactory); String s = "Hi, Mom!\n"; bcos.write(s.getBytes()); //Call 1 linesWritten.add(s); @@ -213,4 +214,22 @@ public Deflater makeDeflater(final int compressionLevel, final boolean gzipCompa bcis.close(); Assert.assertEquals(deflateCalls[0], 3, "deflate calls"); } + + static AbstractBlockCompressedOutputStream create(File f) { + return Defaults.ZIP_THREADS > 0 + ? new ParallelBlockCompressedOutputStream(f) + : new BlockCompressedOutputStream(f); + } + + static AbstractBlockCompressedOutputStream create(String name) { + return Defaults.ZIP_THREADS > 0 + ? 
new ParallelBlockCompressedOutputStream(name) + : new BlockCompressedOutputStream(name); + } + + static AbstractBlockCompressedOutputStream create(File f, int compLevel, DeflaterFactory df) { + return Defaults.ZIP_THREADS > 0 + ? new ParallelBlockCompressedOutputStream(f, compLevel, df) + : new BlockCompressedOutputStream(f, compLevel, df); + } } diff --git a/src/test/java/htsjdk/samtools/util/ParallelBlockCompressedOutputStreamTest.java b/src/test/java/htsjdk/samtools/util/ParallelBlockCompressedOutputStreamTest.java new file mode 100644 index 0000000000..d31633e588 --- /dev/null +++ b/src/test/java/htsjdk/samtools/util/ParallelBlockCompressedOutputStreamTest.java @@ -0,0 +1,95 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package htsjdk.samtools.util; + +import htsjdk.samtools.Defaults; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; + +public class ParallelBlockCompressedOutputStreamTest extends BlockCompressedOutputStreamTest { + + @BeforeClass + public void setup() throws NoSuchFieldException, IllegalAccessException { + changeDefaultsParam(Defaults.class, "ZIP_THREADS", 2); + } + + @AfterClass + public void tearDown() throws NoSuchFieldException, IllegalAccessException { + changeDefaultsParam(Defaults.class, "ZIP_THREADS", 0); + } + + @Test(expectedExceptions = CompletionException.class) + public void pbcosShouldRethrowExceptionFromDeflateTasksOnFlush() throws IOException { + final byte[] data = new byte[65536]; + File tmp = File.createTempFile("pbcos", "tmp"); + AbstractBlockCompressedOutputStream stream = new ParallelBlockCompressedOutputStreamForTests(tmp); + stream.write(data); + stream.flush(); + } + + @Test(expectedExceptions = CompletionException.class) + public void pbcosShouldRethrowExceptionFromDeflateTasksOnClose() throws IOException { + final byte[] data = new byte[65536]; + File tmp = File.createTempFile("pbcos", "tmp"); + AbstractBlockCompressedOutputStream stream = new ParallelBlockCompressedOutputStreamForTests(tmp); + stream.write(data); + stream.close(); + } + + private static class ParallelBlockCompressedOutputStreamForTests extends ParallelBlockCompressedOutputStream { + + public ParallelBlockCompressedOutputStreamForTests(File file) { + super(file); + } + + @Override + void submitDeflateTask() { + compressedBlocksInFuture.add( + CompletableFuture.supplyAsync(() -> { + throw new RuntimeIOException("Bang!"); + }) + ); + } + } + + // for changing 
Defaults.ZIP_THREADS + private static void changeDefaultsParam(Class clazz, String fieldName, Object newValue) + throws NoSuchFieldException, IllegalAccessException { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + Field modifiers = field.getClass().getDeclaredField("modifiers"); + modifiers.setAccessible(true); + modifiers.setInt(field, field.getModifiers() & ~Modifier.FINAL); + field.set(null, newValue); + } + +} diff --git a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java index e57b8fd089..6a115db9b1 100644 --- a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java @@ -262,11 +262,11 @@ public Object[][] testKmerGenerationTestCases() { @Test(dataProvider = "testKmerGenerationTestCases") public void testKmerGeneration(final int length, final String[] expectedKmers) { - final Set actualSet = new HashSet(); + final Set actualSet = new HashSet<>(); for (final byte[] kmer : SequenceUtil.generateAllKmers(length)) { actualSet.add(StringUtil.bytesToString(kmer)); } - final Set expectedSet = new HashSet(Arrays.asList(expectedKmers)); + final Set expectedSet = new HashSet<>(Arrays.asList(expectedKmers)); Assert.assertTrue(actualSet.equals(expectedSet)); } diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam index 82efe858eb..335d8159c8 100644 --- a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam @@ -7,4 +7,4 @@ read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 read2 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 read3 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:0T2A0T2A0t2a0t2a0 read4 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA 
NM:i:0 -read5 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2 +read5 0 chr2 1 0 4M1D2M2S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2 diff --git a/src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam b/src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam new file mode 100644 index 0000000000..3c689b135b --- /dev/null +++ b/src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam @@ -0,0 +1,21 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:0 SM:Hi,Mom! LB:my-library PL:ILLUMINA +@RG ID:1 SM:Hi,Mom! LB:my-library PL:ILLUMINA +@RG ID:2 SM:Hi,Mom! LB:my-library PL:Illumina +@PG ID:1 PN:Hey! VN:2.0 +both_reads_align_clip_marked 1107 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:0 PG:Z:1 NM:i:0 MQ:i:255 XT:Z:foo OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:1 PG:Z:1 NM:i:3 MQ:i:255 XT:Z:foo OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:2 PG:Z:1 NM:i:8 MQ:i:255 XT:Z:foo2 
OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:1 PG:Z:1 NM:i:1 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:1 PG:Z:1 NM:i:1 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1' RG:Z:0 PG:Z:1 NM:i:5 MQ:i:255 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1' RG:Z:2 PG:Z:1 NM:i:6 MQ:i:255 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1' RG:Z:1 
PG:Z:1