diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 2f934e8df..a36667ec4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -625,16 +625,15 @@ public class SAMDataSource { byte defaultBaseQualities, boolean isLocusBasedTraversal ) { - // ************************************************************************************************ // - // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // - // * (otherwise we will process something that we may end up throwing away) * // - // ************************************************************************************************ // + // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum, + // this will consolidate the cigar strings into canonical form. This has to be done before the read + // filtering, because not all read filters will behave correctly with things like zero-length cigar + // elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also + // modify the base qualities. + wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - if (useOriginalBaseQualities || defaultBaseQualities >= 0) - // only wrap if we are replacing the original qualities or using a default base quality - wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - - // Filters: + // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads + // that actually survive filtering. Otherwise we could get much less coverage than requested. wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); // Downsampling: @@ -658,7 +657,8 @@ public class SAMDataSource { if (!noValidationOfReadOrder && enableVerification) wrappedIterator = new VerifyingSamIterator(wrappedIterator); - // set up read transformers + // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded + // by the read filters or downsampler. for ( final ReadTransformer readTransformer : readTransformers ) { if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java index c3b4aaa0a..f9d2f4802 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; /** * An iterator which does post-processing of a read, including potentially wrapping @@ -104,6 +105,10 @@ public class ReadFormattingIterator implements StingSAMIterator { public SAMRecord next() { SAMRecord rec = wrappedIterator.next(); + // Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements. + // Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings. + rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar())); + // if we are using default quals, check if we need them, and add if necessary. // 1. we need if reads are lacking or have incomplete quality scores // 2. we add if defaultBaseQualities has a positive value diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index fe30b60fd..c97ab7301 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; import net.sf.samtools.util.BlockCompressedInputStream; import org.broad.tribble.readers.AsciiLineReader; import org.broadinstitute.sting.WalkerTest; @@ -39,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.qc.ErrorThrowing; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; @@ -255,4 +258,23 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, "")); } + @Test + public void testGATKEngineConsolidatesCigars() { + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "zero_length_cigar_elements.bam" + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the cigar + + final File outputBam = executeTest("testGATKEngineConsolidatesCigars", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + reader.setSAMRecordFactory(new GATKSamRecordFactory()); + + final SAMRecord read = reader.iterator().next(); + reader.close(); + + // Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine: + Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine"); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java new file mode 100644 index 000000000..5d037bc4b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; + + +public class ReadFormattingIteratorUnitTest extends BaseTest { + + @Test + public void testIteratorConsolidatesCigars() { + final Cigar unconsolidatedCigar = TextCigarCodec.getSingleton().decode("3M0M5M0M"); + final SAMRecord unconsolidatedRead = ArtificialSAMUtils.createArtificialRead(unconsolidatedCigar); + + final StingSAMIterator readIterator = StingSAMIteratorAdapter.adapt(Arrays.asList(unconsolidatedRead).iterator()); + final ReadFormattingIterator formattingIterator = new ReadFormattingIterator(readIterator, false, (byte)-1); + final SAMRecord postIterationRead = formattingIterator.next(); + + Assert.assertEquals(postIterationRead.getCigarString(), "8M", "Cigar 3M0M5M0M not consolidated correctly by ReadFormattingIterator"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java index 6b0422c6a..604c0e377 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java @@ -43,7 +43,7 @@ public class BAQIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testPrintReadsNoBAQ() { - WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("11af64ba020262d06b490bae2c5e08f8")); + WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("d1f74074e718c82810512bf40dbc7f72")); executeTest(String.format("testPrintReadsNoBAQ"), spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java index 7482eae60..adc7ad765 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java @@ -59,10 +59,10 @@ public class PrintReadsIntegrationTest extends WalkerTest { {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -simplifyBAM", "1510dc4429f3ed49caf96da41e8ed396")}, {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -n 10", "0e3d1748ad1cb523e3295cab9d09d8fc")}, // See: GATKBAMIndex.getStartOfLastLinearBin(), BAMScheduler.advance(), IntervalOverlapFilteringIterator.advance() - {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "e1cac555f3d720f611c47eec93e84bd9")}, - {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e2558317d409195eab3006dc9e5524c")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "d7f23fd77d7dc7cb50d3397f644c6d8a")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "c601db95b20248d012b0085347fcb6d1")}, {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "2d32440e47e8d9d329902fe573ad94ce")}, - {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e2558317d409195eab3006dc9e5524c")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "c601db95b20248d012b0085347fcb6d1")}, {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "349650b6aa9e574b48a2a62627f37c7d")}, {new PRTest(b37KGReference, "NA12878.1_10mb_2_10mb.bam", "", "0c1cbe67296637a85e80e7a182f828ab")} };