diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java index 0987c5d74..6a9642d97 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java @@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter { public boolean filterOut(final SAMRecord rec) { Cigar c = rec.getCigar(); - boolean lastElementWasIndel = false; - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) { - if ( lastElementWasIndel ) - return true; - lastElementWasIndel = true; - } else { - lastElementWasIndel = false; + boolean previousElementWasIndel = false; + CigarOperator lastOp = c.getCigarElement(0).getOperator(); + + if (lastOp == CigarOperator.D) // filter out reads starting with deletion + return true; + + for (CigarElement ce : c.getCigarElements()) { + CigarOperator op = ce.getOperator(); + if (op == CigarOperator.D || op == CigarOperator.I) { + if (previousElementWasIndel) + return true; // filter out reads with adjacent I/D + + previousElementWasIndel = true; } + else // this is a regular base (match/mismatch/hard or soft clip) + previousElementWasIndel = false; // reset the previous element + + lastOp = op; } - return false; + return lastOp == CigarOperator.D; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index af856f3f9..8b9674353 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -199,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator { return stepForwardOnGenome(); } else { if (curElement != null && curElement.getOperator() == CigarOperator.D) - throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString()); + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); // Reads that contain indels model the genomeOffset as the following base in the reference. Because // we fall into this else block only when indels end the read, increment genomeOffset such that the @@ -236,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator { // we see insertions only once, when we step right onto them; the position on the read is scrolled // past the insertion right after that if (eventDelayedFlag > 1) - throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString())); insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength()); eventLength = curElement.getLength(); eventStart = readOffset; @@ -249,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator { break; case D: // deletion w.r.t. the reference if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string - throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString()); + throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); if (generateExtendedEvents) { if (cigarElementCounter == 1) { // generate an extended event only if we just stepped into the deletion (i.e. don't // generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!) if (eventDelayedFlag > 1) - throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString())); eventLength = curElement.getLength(); eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only eventStart = readOffset; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index de8c50935..0d3777701 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -233,7 +234,17 @@ public class ArtificialSAMUtils { return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); } + public static GATKSAMRecord createArtificialRead(Cigar cigar) { + int length = cigar.getReadLength(); + byte [] base = {'A'}; + byte [] qual = {30}; + byte [] bases = Utils.arrayFromArrayWithLength(base, length); + byte [] quals = Utils.arrayFromArrayWithLength(qual, length); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString()); + } + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java new file mode 100644 index 000000000..333d35641 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * Checks that the Bad Cigar filter works for all kinds of wonky cigars + * + * @author Mauricio Carneiro + * @since 3/20/12 + */ +public class BadCigarFilterUnitTest { + + BadCigarFilter filter; + + @BeforeClass + public void init() { + filter = new BadCigarFilter(); + } + + @Test + public void testWonkyCigars () { + byte[] bases = {'A', 'A', 'A', 'A'}; + byte[] quals = {30, 30, 30, 30}; + GATKSAMRecord read; + // starting with multiple deletions + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M"); + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } +}