SAMDataSource: always consolidate cigar strings into canonical form
-Collapses zero-length and repeated cigar elements, neither of which
can necessarily be handled correctly by downstream code (like LIBS).
-Consolidation is done before read filters, because not all read filters
behave correctly with non-consoliated cigars.
-Examined other uses of consolidateCigar() throughout the GATK, and
found them to not be redundant with the new engine-level consolidation
(they're all on artificially-created cigars in the HaplotypeCaller
and SmithWaterman classes)
-Improved comments in SAMDataSource.applyDecoratingIterators()
-Updated MD5s; differences were examined and found to be innocuous
-Two tests: -Unit test for ReadFormattingIterator
-Integration test for correct handling of zero-length
cigar elements by the GATK engine as a whole
This commit is contained in:
parent
6a5502c94a
commit
51ec5404d4
|
|
@ -625,16 +625,15 @@ public class SAMDataSource {
|
|||
byte defaultBaseQualities,
|
||||
boolean isLocusBasedTraversal ) {
|
||||
|
||||
// ************************************************************************************************ //
|
||||
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
// * (otherwise we will process something that we may end up throwing away) * //
|
||||
// ************************************************************************************************ //
|
||||
// Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum,
|
||||
// this will consolidate the cigar strings into canonical form. This has to be done before the read
|
||||
// filtering, because not all read filters will behave correctly with things like zero-length cigar
|
||||
// elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also
|
||||
// modify the base qualities.
|
||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
|
||||
|
||||
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
|
||||
// only wrap if we are replacing the original qualities or using a default base quality
|
||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
|
||||
|
||||
// Filters:
|
||||
// Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads
|
||||
// that actually survive filtering. Otherwise we could get much less coverage than requested.
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
// Downsampling:
|
||||
|
|
@ -658,7 +657,8 @@ public class SAMDataSource {
|
|||
if (!noValidationOfReadOrder && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
|
||||
|
||||
// set up read transformers
|
||||
// Read transformers: these are applied last, so that we don't bother transforming reads that get discarded
|
||||
// by the read filters or downsampler.
|
||||
for ( final ReadTransformer readTransformer : readTransformers ) {
|
||||
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
|
||||
wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.iterators;
|
|||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
||||
/**
|
||||
* An iterator which does post-processing of a read, including potentially wrapping
|
||||
|
|
@ -104,6 +105,10 @@ public class ReadFormattingIterator implements StingSAMIterator {
|
|||
public SAMRecord next() {
|
||||
SAMRecord rec = wrappedIterator.next();
|
||||
|
||||
// Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements.
|
||||
// Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings.
|
||||
rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar()));
|
||||
|
||||
// if we are using default quals, check if we need them, and add if necessary.
|
||||
// 1. we need if reads are lacking or have incomplete quality scores
|
||||
// 2. we add if defaultBaseQualities has a positive value
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
|
|
@ -39,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.qc.ErrorThrowing;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
|
|
@ -255,4 +258,23 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, ""));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGATKEngineConsolidatesCigars() {
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
|
||||
" -R " + b37KGReference +
|
||||
" -I " + privateTestDir + "zero_length_cigar_elements.bam" +
|
||||
" -o %s",
|
||||
1, Arrays.asList("")); // No MD5s; we only want to check the cigar
|
||||
|
||||
final File outputBam = executeTest("testGATKEngineConsolidatesCigars", spec).first.get(0);
|
||||
final SAMFileReader reader = new SAMFileReader(outputBam);
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||
reader.setSAMRecordFactory(new GATKSamRecordFactory());
|
||||
|
||||
final SAMRecord read = reader.iterator().next();
|
||||
reader.close();
|
||||
|
||||
// Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine:
|
||||
Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class ReadFormattingIteratorUnitTest extends BaseTest {
|
||||
|
||||
@Test
|
||||
public void testIteratorConsolidatesCigars() {
|
||||
final Cigar unconsolidatedCigar = TextCigarCodec.getSingleton().decode("3M0M5M0M");
|
||||
final SAMRecord unconsolidatedRead = ArtificialSAMUtils.createArtificialRead(unconsolidatedCigar);
|
||||
|
||||
final StingSAMIterator readIterator = StingSAMIteratorAdapter.adapt(Arrays.asList(unconsolidatedRead).iterator());
|
||||
final ReadFormattingIterator formattingIterator = new ReadFormattingIterator(readIterator, false, (byte)-1);
|
||||
final SAMRecord postIterationRead = formattingIterator.next();
|
||||
|
||||
Assert.assertEquals(postIterationRead.getCigarString(), "8M", "Cigar 3M0M5M0M not consolidated correctly by ReadFormattingIterator");
|
||||
}
|
||||
}
|
||||
|
|
@ -43,7 +43,7 @@ public class BAQIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testPrintReadsNoBAQ() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("11af64ba020262d06b490bae2c5e08f8"));
|
||||
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("d1f74074e718c82810512bf40dbc7f72"));
|
||||
executeTest(String.format("testPrintReadsNoBAQ"), spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,10 +59,10 @@ public class PrintReadsIntegrationTest extends WalkerTest {
|
|||
{new PRTest(hg18Reference, "HiSeq.1mb.bam", " -simplifyBAM", "1510dc4429f3ed49caf96da41e8ed396")},
|
||||
{new PRTest(hg18Reference, "HiSeq.1mb.bam", " -n 10", "0e3d1748ad1cb523e3295cab9d09d8fc")},
|
||||
// See: GATKBAMIndex.getStartOfLastLinearBin(), BAMScheduler.advance(), IntervalOverlapFilteringIterator.advance()
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "e1cac555f3d720f611c47eec93e84bd9")},
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e2558317d409195eab3006dc9e5524c")},
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "d7f23fd77d7dc7cb50d3397f644c6d8a")},
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "c601db95b20248d012b0085347fcb6d1")},
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "2d32440e47e8d9d329902fe573ad94ce")},
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e2558317d409195eab3006dc9e5524c")},
|
||||
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "c601db95b20248d012b0085347fcb6d1")},
|
||||
{new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "349650b6aa9e574b48a2a62627f37c7d")},
|
||||
{new PRTest(b37KGReference, "NA12878.1_10mb_2_10mb.bam", "", "0c1cbe67296637a85e80e7a182f828ab")}
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in New Issue