Merge pull request #294 from broadinstitute/dr_handle_zero_length_cigar_elements

SAMDataSource: always consolidate cigar strings into canonical form
This commit is contained in:
droazen 2013-06-19 10:32:22 -07:00
commit 573ecadecc
6 changed files with 91 additions and 14 deletions

View File

@ -625,16 +625,15 @@ public class SAMDataSource {
byte defaultBaseQualities,
boolean isLocusBasedTraversal ) {
// ************************************************************************************************ //
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// ************************************************************************************************ //
// Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum,
// this will consolidate the cigar strings into canonical form. This has to be done before the read
// filtering, because not all read filters will behave correctly with things like zero-length cigar
// elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also
// modify the base qualities.
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
// Filters:
// Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads
// that actually survive filtering. Otherwise we could get much less coverage than requested.
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
// Downsampling:
@ -658,7 +657,8 @@ public class SAMDataSource {
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
// set up read transformers
// Read transformers: these are applied last, so that we don't bother transforming reads that get discarded
// by the read filters or downsampler.
for ( final ReadTransformer readTransformer : readTransformers ) {
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
/**
* An iterator which does post-processing of a read, including potentially wrapping
@ -104,6 +105,10 @@ public class ReadFormattingIterator implements StingSAMIterator {
public SAMRecord next() {
SAMRecord rec = wrappedIterator.next();
// Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements.
// Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings.
rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar()));
// if we are using default quals, check if we need them, and add if necessary.
// 1. we need if reads are lacking or have incomplete quality scores
// 2. we add if defaultBaseQualities has a positive value

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.gatk;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.BlockCompressedInputStream;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.WalkerTest;
@ -39,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.qc.ErrorThrowing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.broadinstitute.variant.vcf.VCFHeader;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
@ -255,4 +258,23 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, ""));
}
@Test
public void testGATKEngineConsolidatesCigars() {
final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" +
" -R " + b37KGReference +
" -I " + privateTestDir + "zero_length_cigar_elements.bam" +
" -o %s",
1, Arrays.asList("")); // No MD5s; we only want to check the cigar
final File outputBam = executeTest("testGATKEngineConsolidatesCigars", spec).first.get(0);
final SAMFileReader reader = new SAMFileReader(outputBam);
reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
reader.setSAMRecordFactory(new GATKSamRecordFactory());
final SAMRecord read = reader.iterator().next();
reader.close();
// Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine:
Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine");
}
}

View File

@ -0,0 +1,50 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.Arrays;
public class ReadFormattingIteratorUnitTest extends BaseTest {
@Test
public void testIteratorConsolidatesCigars() {
final Cigar unconsolidatedCigar = TextCigarCodec.getSingleton().decode("3M0M5M0M");
final SAMRecord unconsolidatedRead = ArtificialSAMUtils.createArtificialRead(unconsolidatedCigar);
final StingSAMIterator readIterator = StingSAMIteratorAdapter.adapt(Arrays.asList(unconsolidatedRead).iterator());
final ReadFormattingIterator formattingIterator = new ReadFormattingIterator(readIterator, false, (byte)-1);
final SAMRecord postIterationRead = formattingIterator.next();
Assert.assertEquals(postIterationRead.getCigarString(), "8M", "Cigar 3M0M5M0M not consolidated correctly by ReadFormattingIterator");
}
}

View File

@ -43,7 +43,7 @@ public class BAQIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testPrintReadsNoBAQ() {
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("11af64ba020262d06b490bae2c5e08f8"));
WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("d1f74074e718c82810512bf40dbc7f72"));
executeTest(String.format("testPrintReadsNoBAQ"), spec);
}

View File

@ -59,10 +59,10 @@ public class PrintReadsIntegrationTest extends WalkerTest {
{new PRTest(hg18Reference, "HiSeq.1mb.bam", " -simplifyBAM", "1510dc4429f3ed49caf96da41e8ed396")},
{new PRTest(hg18Reference, "HiSeq.1mb.bam", " -n 10", "0e3d1748ad1cb523e3295cab9d09d8fc")},
// See: GATKBAMIndex.getStartOfLastLinearBin(), BAMScheduler.advance(), IntervalOverlapFilteringIterator.advance()
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "e1cac555f3d720f611c47eec93e84bd9")},
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e2558317d409195eab3006dc9e5524c")},
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "d7f23fd77d7dc7cb50d3397f644c6d8a")},
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "c601db95b20248d012b0085347fcb6d1")},
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "2d32440e47e8d9d329902fe573ad94ce")},
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e2558317d409195eab3006dc9e5524c")},
{new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "c601db95b20248d012b0085347fcb6d1")},
{new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "349650b6aa9e574b48a2a62627f37c7d")},
{new PRTest(b37KGReference, "NA12878.1_10mb_2_10mb.bam", "", "0c1cbe67296637a85e80e7a182f828ab")}
};