Updated and more thorough version of the BadCigar read filter

* No reads with Hard/Soft clips in the middle of the cigar
   * No reads starting with deletions (with or without preceding clips)
   * No reads ending in deletions (with or without follow-up clips)
   * No reads that are fully hard or soft clipped
   * No reads that have consecutive indels in the cigar (II, DD, ID or DI)

 Also added systematic test for good cigars and iterative test for bad cigars.
This commit is contained in:
Mauricio Carneiro 2012-08-17 14:20:15 -04:00
parent 980685af16
commit d16cb68539
3 changed files with 210 additions and 67 deletions

View File

@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator; import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import java.util.Iterator;
/** /**
* Filter out reads with wonky cigar strings. * Filter out reads with wonky cigar strings.
* *
* - No reads with Hard/Soft clips in the middle of the cigar
* - No reads starting with deletions (with or without preceding clips)
* - No reads ending in deletions (with or without follow-up clips)
* - No reads that are fully hard or soft clipped
* - No reads that have consecutive indels in the cigar (II, DD, ID or DI)
*
* ps: apparently an empty cigar is okay...
*
* @author ebanks * @author ebanks
* @version 0.1 * @version 0.1
*/ */
@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) { public boolean filterOut(final SAMRecord rec) {
final Cigar c = rec.getCigar(); final Cigar c = rec.getCigar();
if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad
boolean previousElementWasIndel = false; // if there is no Cigar then it can't be bad
CigarOperator lastOp = c.getCigarElement(0).getOperator(); if( c.isEmpty() ) {
return false;
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
return true;
for (CigarElement ce : c.getCigarElements()) {
CigarOperator op = ce.getOperator();
if (op == CigarOperator.D || op == CigarOperator.I) {
if (previousElementWasIndel)
return true; // filter out reads with adjacent I/D
previousElementWasIndel = true;
}
else // this is a regular base (match/mismatch/hard or soft clip)
previousElementWasIndel = false; // reset the previous element
lastOp = op;
} }
return lastOp == CigarOperator.D; Iterator<CigarElement> elementIterator = c.getCigarElements().iterator();
CigarOperator firstOp = CigarOperator.H;
while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) {
CigarOperator op = elementIterator.next().getOperator();
// No reads with Hard/Soft clips in the middle of the cigar
if (firstOp != CigarOperator.H && op == CigarOperator.H) {
return true;
}
firstOp = op;
}
// No reads starting with deletions (with or without preceding clips)
if (firstOp == CigarOperator.D) {
return true;
}
boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S);
boolean previousElementWasIndel = firstOp == CigarOperator.I;
CigarOperator lastOp = firstOp;
CigarOperator previousOp = firstOp;
while (elementIterator.hasNext()) {
CigarOperator op = elementIterator.next().getOperator();
if (op != CigarOperator.S && op != CigarOperator.H) {
// No reads with Hard/Soft clips in the middle of the cigar
if (previousOp == CigarOperator.S || previousOp == CigarOperator.H)
return true;
lastOp = op;
if (!hasMeaningfulElements && op.consumesReadBases()) {
hasMeaningfulElements = true;
}
if (op == CigarOperator.I || op == CigarOperator.D) {
// No reads that have consecutive indels in the cigar (II, DD, ID or DI)
if (previousElementWasIndel) {
return true;
}
previousElementWasIndel = true;
}
else {
previousElementWasIndel = false;
}
}
// No reads with Hard/Soft clips in the middle of the cigar
else if (op == CigarOperator.S && previousOp == CigarOperator.H) {
return true;
}
previousOp = op;
}
// No reads ending in deletions (with or without follow-up clips)
// No reads that are fully hard or soft clipped
return lastOp == CigarOperator.D || !hasMeaningfulElements;
} }
} }

View File

@ -1,11 +1,14 @@
package org.broadinstitute.sting.gatk.filters; package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import net.sf.samtools.Cigar;
import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.List;
/** /**
* Checks that the Bad Cigar filter works for all kinds of wonky cigars * Checks that the Bad Cigar filter works for all kinds of wonky cigars
* *
@ -14,6 +17,29 @@ import org.testng.annotations.Test;
*/ */
public class BadCigarFilterUnitTest { public class BadCigarFilterUnitTest {
public static final String[] BAD_CIGAR_LIST = {
"2D4M", // starting with multiple deletions
"4M2D", // ending with multiple deletions
"3M1I1D", // adjacent indels AND ends in deletion
"1M1I1D2M", // adjacent indels I->D
"1M1D2I1M", // adjacent indels D->I
"1M1I2M1D", // ends in single deletion with insertion in the middle
"4M1D", // ends in single deletion
"1D4M", // starts with single deletion
"2M1D1D2M", // adjacent D's
"1M1I1I1M", // adjacent I's
"1H1D4M", // starting with deletion after H
"1S1D3M", // starting with deletion after S
"1H1S1D3M", // starting with deletion after HS
"4M1D1H", // ending with deletion before H
"3M1D1S", // ending with deletion before S
"3M1D1S1H", // ending with deletion before HS
"10M2H10M", // H in the middle
"10M2S10M", // S in the middle
"1H1S10M2S10M1S1H", // deceiving S in the middle
"1H1S10M2H10M1S1H" // deceiving H in the middle
};
BadCigarFilter filter; BadCigarFilter filter;
@BeforeClass @BeforeClass
@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest {
filter = new BadCigarFilter(); filter = new BadCigarFilter();
} }
@Test @Test(enabled = true)
public void testWonkyCigars () { public void testWonkyCigars () {
byte[] bases = {'A', 'A', 'A', 'A'}; for (String cigarString : BAD_CIGAR_LIST) {
byte[] quals = {30, 30, 30, 30}; GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString);
GATKSAMRecord read; Assert.assertTrue(filter.filterOut(read), read.getCigarString());
// starting with multiple deletions }
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M"); }
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions @Test(enabled = true)
Assert.assertTrue(filter.filterOut(read), read.getCigarString()); public void testGoodCigars() {
List<Cigar> cigarList = ReadClipperTestUtils.generateCigarList(10);
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion for (Cigar cigar : cigarList) {
Assert.assertTrue(filter.filterOut(read), read.getCigarString()); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
Assert.assertFalse(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D }
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
} }
} }

View File

@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement; import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator; import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert; import org.testng.Assert;
@ -37,17 +38,22 @@ public class ReadClipperTestUtils {
return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString());
} }
/** public static GATKSAMRecord makeReadFromCigar(String cigarString) {
* This function generates every valid permutation of cigar strings with a given length. return makeReadFromCigar(cigarFromString(cigarString));
* }
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read /**
* - No deletions in the beginning / end of the read * This function generates every valid permutation of cigar strings with a given length.
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M) *
* * A valid cigar object obeys the following rules:
* @param maximumLength the maximum number of elements in the cigar * - No Hard/Soft clips in the middle of the read
* @return a list with all valid Cigar objects * - No deletions in the beginning / end of the read
*/ * - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
* - No consecutive I/D elements
*
* @param maximumLength the maximum number of elements in the cigar
* @return a list with all valid Cigar objects
*/
public static List<Cigar> generateCigarList(int maximumLength) { public static List<Cigar> generateCigarList(int maximumLength) {
int numCigarElements = cigarElements.length; int numCigarElements = cigarElements.length;
LinkedList<Cigar> cigarList = new LinkedList<Cigar>(); LinkedList<Cigar> cigarList = new LinkedList<Cigar>();
@ -137,7 +143,10 @@ public class ReadClipperTestUtils {
CigarElement lastElement = null; CigarElement lastElement = null;
int lastElementLength = 0; int lastElementLength = 0;
for (CigarElement cigarElement : rawCigar.getCigarElements()) { for (CigarElement cigarElement : rawCigar.getCigarElements()) {
if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) if (lastElement != null &&
((lastElement.getOperator() == cigarElement.getOperator()) ||
(lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) ||
(lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I)))
lastElementLength += cigarElement.getLength(); lastElementLength += cigarElement.getLength();
else else
{ {
@ -191,7 +200,7 @@ public class ReadClipperTestUtils {
/** /**
* Checks whether or not the read has any cigar element that is not H or S * Checks whether or not the read has any cigar element that is not H or S
* *
* @param read * @param read the read
* @return true if it has any M, I or D, false otherwise * @return true if it has any M, I or D, false otherwise
*/ */
public static boolean readHasNonClippedBases(GATKSAMRecord read) { public static boolean readHasNonClippedBases(GATKSAMRecord read) {
@ -201,5 +210,79 @@ public class ReadClipperTestUtils {
return false; return false;
} }
public static Cigar cigarFromString(String cigarString) {
Cigar cigar = new Cigar();
boolean isNumber = false;
int number = 0;
for (int i = 0; i < cigarString.length(); i++) {
char x = cigarString.charAt(i);
if (x >= '0' && x <='9') {
if (isNumber) {
number *= 10;
}
else {
isNumber = true;
}
number += x - '0';
}
else {
CigarElement e;
switch (x) {
case 'M':
case 'm':
e = new CigarElement(number, CigarOperator.M);
break;
case 'I':
case 'i':
e = new CigarElement(number, CigarOperator.I);
break;
case 'D':
case 'd':
e = new CigarElement(number, CigarOperator.D);
break;
case 'S':
case 's':
e = new CigarElement(number, CigarOperator.S);
break;
case 'N':
case 'n':
e = new CigarElement(number, CigarOperator.N);
break;
case 'H':
case 'h':
e = new CigarElement(number, CigarOperator.H);
break;
case 'P':
case 'p':
e = new CigarElement(number, CigarOperator.P);
break;
case '=':
e = new CigarElement(number, CigarOperator.EQ);
break;
case 'X':
case 'x':
e = new CigarElement(number, CigarOperator.X);
break;
default:
throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")");
}
cigar.add(e);
}
}
return cigar;
}
} }