Taking care of bad cigars in the GATK

* fixed BadCigarFilter to filter out reads that start or end in a deletion, or that have adjacent I/D events.
   * added Unit tests for BadCigarFilter
   * updated all exceptions in LocusIteratorByState to tell users that they can instead run with -rf BadCigar
   * added the BadCigar filter to ReduceReads and RealignerTargetCreator (if your walker blows up with these malformed reads, you may want to add it too)
This commit is contained in:
Mauricio Carneiro 2012-03-20 14:31:32 -04:00
parent b290152542
commit 0e93cf5297
4 changed files with 93 additions and 13 deletions

View File

@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) {
Cigar c = rec.getCigar();
boolean lastElementWasIndel = false;
for ( CigarElement ce : c.getCigarElements() ) {
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
if ( lastElementWasIndel )
return true;
lastElementWasIndel = true;
} else {
lastElementWasIndel = false;
boolean previousElementWasIndel = false;
CigarOperator lastOp = c.getCigarElement(0).getOperator();
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
return true;
for (CigarElement ce : c.getCigarElements()) {
CigarOperator op = ce.getOperator();
if (op == CigarOperator.D || op == CigarOperator.I) {
if (previousElementWasIndel)
return true; // filter out reads with adjacent I/D
previousElementWasIndel = true;
}
else // this is a regular base (match/mismatch/hard or soft clip)
previousElementWasIndel = false; // reset the previous element
lastOp = op;
}
return false;
return lastOp == CigarOperator.D;
}
}

View File

@ -199,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator {
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
@ -236,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
eventLength = curElement.getLength();
eventStart = readOffset;
@ -249,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator {
break;
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
if (generateExtendedEvents) {
if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -233,7 +234,17 @@ public class ArtificialSAMUtils {
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
}
/**
 * Builds an artificial read for the given cigar, sizing the base and quality arrays from
 * {@code cigar.getReadLength()}.
 *
 * The arrays are produced by {@code Utils.arrayFromArrayWithLength} from the one-element
 * templates {'A'} and {30} (presumably repeating the template up to the requested length;
 * confirm against Utils). The read is created through the header-based overload with the
 * name "default_read" and the same fixed arguments (0, 10000) used by the other convenience
 * overload in this class.
 *
 * @param cigar the cigar the artificial read should carry
 * @return the artificial read returned by the header-based overload
 */
public static GATKSAMRecord createArtificialRead(Cigar cigar) {
    final int readLength = cigar.getReadLength();
    final byte[] bases = Utils.arrayFromArrayWithLength(new byte[]{'A'}, readLength);
    final byte[] quals = Utils.arrayFromArrayWithLength(new byte[]{30}, readLength);
    final SAMFileHeader artificialHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
    return ArtificialSAMUtils.createArtificialRead(
            artificialHeader, "default_read", 0, 10000, bases, quals, cigar.toString());
}
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);

View File

@ -0,0 +1,60 @@
package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
/**
 * Checks that the Bad Cigar filter works for all kinds of wonky cigars, and that it
 * leaves well-formed cigars alone.
 *
 * @author Mauricio Carneiro
 * @since 3/20/12
 */
public class BadCigarFilterUnitTest {

    BadCigarFilter filter;

    @BeforeClass
    public void init() {
        filter = new BadCigarFilter();
    }

    /**
     * Every cigar listed here is malformed in at least one of the ways the filter is
     * meant to catch, so the filter must reject all of them.
     */
    @Test
    public void testWonkyCigars () {
        final String[] wonkyCigars = {
                "2D4M",      // starting with multiple deletions
                "4M2D",      // ending with multiple deletions
                "3M1I1D",    // adjacent indels AND ends in deletion
                "1M1I1D2M",  // adjacent indels I->D
                "1M1D2I1M",  // adjacent indels D->I
                "1M1I2M1D",  // ends in single deletion with insertion in the middle
                "4M1D",      // ends in single deletion
                "1D4M",      // starts with single deletion
                "2M1D1D2M",  // adjacent D's
                "1M1I1I1M"   // adjacent I's
        };

        for (final String cigar : wonkyCigars) {
            final GATKSAMRecord read = createRead(cigar);
            Assert.assertTrue(filter.filterOut(read), read.getCigarString());
        }
    }

    /**
     * Negative control: without it, a filter that rejected every read would still pass
     * the wonky-cigar test above.
     */
    @Test
    public void testGoodCigars () {
        final String[] goodCigars = {
                "4M",          // plain match
                "1M1I2M",      // insertion flanked by matches
                "2M1D2M",      // deletion flanked by matches
                "1S3M",        // leading soft clip
                "1M1I1M1D1M"   // insertion and deletion separated by a match
        };

        for (final String cigar : goodCigars) {
            final GATKSAMRecord read = createRead(cigar);
            Assert.assertFalse(filter.filterOut(read), read.getCigarString());
        }
    }

    /**
     * Creates a four-base artificial read carrying the given cigar string; all cigars
     * used in this class consume exactly four read bases.
     */
    private static GATKSAMRecord createRead(final String cigar) {
        final byte[] bases = {'A', 'A', 'A', 'A'};
        final byte[] quals = {30, 30, 30, 30};
        return ArtificialSAMUtils.createArtificialRead(bases, quals, cigar);
    }
}