diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java index 9a1455859..cda7392ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java @@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import java.util.Iterator; + /** * Filter out reads with wonky cigar strings. * + * - No reads with Hard/Soft clips in the middle of the cigar + * - No reads starting with deletions (with or without preceding clips) + * - No reads ending in deletions (with or without follow-up clips) + * - No reads that are fully hard or soft clipped + * - No reads that have consecutive indels in the cigar (II, DD, ID or DI) + * + * ps: apparently an empty cigar is okay... + * * @author ebanks * @version 0.1 */ @@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter { public boolean filterOut(final SAMRecord rec) { final Cigar c = rec.getCigar(); - if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad - boolean previousElementWasIndel = false; - CigarOperator lastOp = c.getCigarElement(0).getOperator(); - - if (lastOp == CigarOperator.D) // filter out reads starting with deletion - return true; - - for (CigarElement ce : c.getCigarElements()) { - CigarOperator op = ce.getOperator(); - if (op == CigarOperator.D || op == CigarOperator.I) { - if (previousElementWasIndel) - return true; // filter out reads with adjacent I/D - - previousElementWasIndel = true; - } - else // this is a regular base (match/mismatch/hard or soft clip) - previousElementWasIndel = false; // reset the previous element - - lastOp = op; + // if there is no Cigar then it can't be bad + if( c.isEmpty() ) { + return false; } - return lastOp == CigarOperator.D; + Iterator elementIterator = c.getCigarElements().iterator(); + + CigarOperator firstOp = CigarOperator.H; + while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) { + CigarOperator op = elementIterator.next().getOperator(); + + // No reads with Hard/Soft clips in the middle of the cigar + if (firstOp != CigarOperator.H && op == CigarOperator.H) { + return true; + } + firstOp = op; + } + + // No reads starting with deletions (with or without preceding clips) + if (firstOp == CigarOperator.D) { + return true; + } + + boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S); + boolean previousElementWasIndel = firstOp == CigarOperator.I; + CigarOperator lastOp = firstOp; + CigarOperator previousOp = firstOp; + + while (elementIterator.hasNext()) { + CigarOperator op = elementIterator.next().getOperator(); + + if (op != CigarOperator.S && op != CigarOperator.H) { + + // No reads with Hard/Soft clips in the middle of the cigar + if (previousOp == CigarOperator.S || previousOp == CigarOperator.H) + return true; + + lastOp = op; + + if (!hasMeaningfulElements && op.consumesReadBases()) { + hasMeaningfulElements = true; + } + + if (op == CigarOperator.I || op == CigarOperator.D) { + + // No reads that have consecutive indels in the cigar (II, DD, ID or DI) + if (previousElementWasIndel) { + return true; + } + previousElementWasIndel = true; + } + else { + previousElementWasIndel = false; + } + } + // No reads with Hard/Soft clips in the middle of the cigar + else if (op == CigarOperator.S && previousOp == CigarOperator.H) { + return true; + } + + previousOp = op; + } + + // No reads ending in deletions (with or without follow-up clips) + // No reads that are fully hard or soft clipped + return lastOp == CigarOperator.D || !hasMeaningfulElements; } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java index 333d35641..ff918db68 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java @@ -1,11 +1,14 @@ package org.broadinstitute.sting.gatk.filters; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import net.sf.samtools.Cigar; +import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.List; + /** * Checks that the Bad Cigar filter works for all kinds of wonky cigars * @@ -14,6 +17,29 @@ import org.testng.annotations.Test; */ public class BadCigarFilterUnitTest { + public static final String[] BAD_CIGAR_LIST = { + "2D4M", // starting with multiple deletions + "4M2D", // ending with multiple deletions + "3M1I1D", // adjacent indels AND ends in deletion + "1M1I1D2M", // adjacent indels I->D + "1M1D2I1M", // adjacent indels D->I + "1M1I2M1D", // ends in single deletion with insertion in the middle + "4M1D", // ends in single deletion + "1D4M", // starts with single deletion + "2M1D1D2M", // adjacent D's + "1M1I1I1M", // adjacent I's + "1H1D4M", // starting with deletion after H + "1S1D3M", // starting with deletion after S + "1H1S1D3M", // starting with deletion after HS + "4M1D1H", // ending with deletion before H + "3M1D1S", // ending with deletion before S + "3M1D1S1H", // ending with deletion before HS + "10M2H10M", // H in the middle + "10M2S10M", // S in the middle + "1H1S10M2S10M1S1H", // deceiving S in the middle + "1H1S10M2H10M1S1H" // deceiving H in the middle + }; + BadCigarFilter filter; @BeforeClass @@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest { filter = new BadCigarFilter(); } - @Test + @Test(enabled = true) public void testWonkyCigars () { - byte[] bases = {'A', 'A', 'A', 'A'}; - byte[] quals = {30, 30, 30, 30}; - GATKSAMRecord read; - // starting with multiple deletions - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M"); - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + for (String cigarString : BAD_CIGAR_LIST) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString); + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } + } - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + @Test(enabled = true) + public void testGoodCigars() { + List cigarList = ReadClipperTestUtils.generateCigarList(10); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + Assert.assertFalse(filter.filterOut(read), read.getCigarString()); + } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index baa2f6218..208c14fbd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -4,6 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -37,17 +38,22 @@ public class ReadClipperTestUtils { return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); } - /** - * This function generates every valid permutation of cigar strings with a given length. - * - * A valid cigar object obeys the following rules: - * - No Hard/Soft clips in the middle of the read - * - No deletions in the beginning / end of the read - * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) - * - * @param maximumLength the maximum number of elements in the cigar - * @return a list with all valid Cigar objects - */ + public static GATKSAMRecord makeReadFromCigar(String cigarString) { + return makeReadFromCigar(cigarFromString(cigarString)); + } + + /** + * This function generates every valid permutation of cigar strings with a given length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * - No consecutive I/D elements + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ public static List generateCigarList(int maximumLength) { int numCigarElements = cigarElements.length; LinkedList cigarList = new LinkedList(); @@ -137,7 +143,10 @@ public class ReadClipperTestUtils { CigarElement lastElement = null; int lastElementLength = 0; for (CigarElement cigarElement : rawCigar.getCigarElements()) { - if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) + if (lastElement != null && + ((lastElement.getOperator() == cigarElement.getOperator()) || + (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || + (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) lastElementLength += cigarElement.getLength(); else { @@ -191,7 +200,7 @@ public class ReadClipperTestUtils { /** * Checks whether or not the read has any cigar element that is not H or S * - * @param read + * @param read the read * @return true if it has any M, I or D, false otherwise */ public static boolean readHasNonClippedBases(GATKSAMRecord read) { @@ -201,5 +210,79 @@ public class ReadClipperTestUtils { return false; } + public static Cigar cigarFromString(String cigarString) { + Cigar cigar = new Cigar(); + + boolean isNumber = false; + int number = 0; + for (int i = 0; i < cigarString.length(); i++) { + char x = cigarString.charAt(i); + + if (x >= '0' && x <='9') { + if (isNumber) { + number *= 10; + } + else { + isNumber = true; + } + number += x - '0'; + } + + else { + CigarElement e; + switch (x) { + case 'M': + case 'm': + e = new CigarElement(number, CigarOperator.M); + break; + + case 'I': + case 'i': + e = new CigarElement(number, CigarOperator.I); + break; + + case 'D': + case 'd': + e = new CigarElement(number, CigarOperator.D); + break; + + case 'S': + case 's': + e = new CigarElement(number, CigarOperator.S); + break; + + case 'N': + case 'n': + e = new CigarElement(number, CigarOperator.N); + break; + + case 'H': + case 'h': + e = new CigarElement(number, CigarOperator.H); + break; + + case 'P': + case 'p': + e = new CigarElement(number, CigarOperator.P); + break; + + case '=': + e = new CigarElement(number, CigarOperator.EQ); + break; + + case 'X': + case 'x': + e = new CigarElement(number, CigarOperator.X); + break; + + default: + throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")"); + } + cigar.add(e); + } + } + return cigar; + } + }