Updated and more thorough version of the BadCigar read filter

* No reads with Hard/Soft clips in the middle of the cigar
   * No reads starting with deletions (with or without preceding clips)
   * No reads ending in deletions (with or without follow-up clips)
   * No reads that are fully hard or soft clipped
   * No reads that have consecutive indels in the cigar (II, DD, ID or DI)

 Also added systematic test for good cigars and iterative test for bad cigars.
This commit is contained in:
Mauricio Carneiro 2012-08-17 14:20:15 -04:00
parent 980685af16
commit d16cb68539
3 changed files with 210 additions and 67 deletions

View File

@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
/**
* Filter out reads with wonky cigar strings.
*
* - No reads with Hard/Soft clips in the middle of the cigar
* - No reads starting with deletions (with or without preceding clips)
* - No reads ending in deletions (with or without follow-up clips)
* - No reads that are fully hard or soft clipped
* - No reads that have consecutive indels in the cigar (II, DD, ID or DI)
*
* ps: apparently an empty cigar is okay...
*
* @author ebanks
* @version 0.1
*/
@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) {
final Cigar c = rec.getCigar();
if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad
boolean previousElementWasIndel = false;
CigarOperator lastOp = c.getCigarElement(0).getOperator();
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
return true;
for (CigarElement ce : c.getCigarElements()) {
CigarOperator op = ce.getOperator();
if (op == CigarOperator.D || op == CigarOperator.I) {
if (previousElementWasIndel)
return true; // filter out reads with adjacent I/D
previousElementWasIndel = true;
}
else // this is a regular base (match/mismatch/hard or soft clip)
previousElementWasIndel = false; // reset the previous element
lastOp = op;
// if there is no Cigar then it can't be bad
if( c.isEmpty() ) {
return false;
}
return lastOp == CigarOperator.D;
Iterator<CigarElement> elementIterator = c.getCigarElements().iterator();
CigarOperator firstOp = CigarOperator.H;
while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) {
CigarOperator op = elementIterator.next().getOperator();
// No reads with Hard/Soft clips in the middle of the cigar
if (firstOp != CigarOperator.H && op == CigarOperator.H) {
return true;
}
firstOp = op;
}
// No reads starting with deletions (with or without preceding clips)
if (firstOp == CigarOperator.D) {
return true;
}
boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S);
boolean previousElementWasIndel = firstOp == CigarOperator.I;
CigarOperator lastOp = firstOp;
CigarOperator previousOp = firstOp;
while (elementIterator.hasNext()) {
CigarOperator op = elementIterator.next().getOperator();
if (op != CigarOperator.S && op != CigarOperator.H) {
// No reads with Hard/Soft clips in the middle of the cigar
if (previousOp == CigarOperator.S || previousOp == CigarOperator.H)
return true;
lastOp = op;
if (!hasMeaningfulElements && op.consumesReadBases()) {
hasMeaningfulElements = true;
}
if (op == CigarOperator.I || op == CigarOperator.D) {
// No reads that have consecutive indels in the cigar (II, DD, ID or DI)
if (previousElementWasIndel) {
return true;
}
previousElementWasIndel = true;
}
else {
previousElementWasIndel = false;
}
}
// No reads with Hard/Soft clips in the middle of the cigar
else if (op == CigarOperator.S && previousOp == CigarOperator.H) {
return true;
}
previousOp = op;
}
// No reads ending in deletions (with or without follow-up clips)
// No reads that are fully hard or soft clipped
return lastOp == CigarOperator.D || !hasMeaningfulElements;
}
}

View File

@ -1,11 +1,14 @@
package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import net.sf.samtools.Cigar;
import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.List;
/**
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
*
@ -14,6 +17,29 @@ import org.testng.annotations.Test;
*/
public class BadCigarFilterUnitTest {
public static final String[] BAD_CIGAR_LIST = {
"2D4M", // starting with multiple deletions
"4M2D", // ending with multiple deletions
"3M1I1D", // adjacent indels AND ends in deletion
"1M1I1D2M", // adjacent indels I->D
"1M1D2I1M", // adjacent indels D->I
"1M1I2M1D", // ends in single deletion with insertion in the middle
"4M1D", // ends in single deletion
"1D4M", // starts with single deletion
"2M1D1D2M", // adjacent D's
"1M1I1I1M", // adjacent I's
"1H1D4M", // starting with deletion after H
"1S1D3M", // starting with deletion after S
"1H1S1D3M", // starting with deletion after HS
"4M1D1H", // ending with deletion before H
"3M1D1S", // ending with deletion before S
"3M1D1S1H", // ending with deletion before HS
"10M2H10M", // H in the middle
"10M2S10M", // S in the middle
"1H1S10M2S10M1S1H", // deceiving S in the middle
"1H1S10M2H10M1S1H" // deceiving H in the middle
};
BadCigarFilter filter;
@BeforeClass
@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest {
filter = new BadCigarFilter();
}
@Test
@Test(enabled = true)
public void testWonkyCigars () {
byte[] bases = {'A', 'A', 'A', 'A'};
byte[] quals = {30, 30, 30, 30};
GATKSAMRecord read;
// starting with multiple deletions
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
for (String cigarString : BAD_CIGAR_LIST) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString);
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
}
}
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
@Test(enabled = true)
public void testGoodCigars() {
List<Cigar> cigarList = ReadClipperTestUtils.generateCigarList(10);
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
Assert.assertFalse(filter.filterOut(read), read.getCigarString());
}
}
}

View File

@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
@ -37,17 +38,22 @@ public class ReadClipperTestUtils {
return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString());
}
/**
* This function generates every valid permutation of cigar strings with a given length.
*
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read
* - No deletions in the beginning / end of the read
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
*
* @param maximumLength the maximum number of elements in the cigar
* @return a list with all valid Cigar objects
*/
public static GATKSAMRecord makeReadFromCigar(String cigarString) {
return makeReadFromCigar(cigarFromString(cigarString));
}
/**
* This function generates every valid permutation of cigar strings with a given length.
*
* A valid cigar object obeys the following rules:
* - No Hard/Soft clips in the middle of the read
* - No deletions in the beginning / end of the read
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
* - No consecutive I/D elements
*
* @param maximumLength the maximum number of elements in the cigar
* @return a list with all valid Cigar objects
*/
public static List<Cigar> generateCigarList(int maximumLength) {
int numCigarElements = cigarElements.length;
LinkedList<Cigar> cigarList = new LinkedList<Cigar>();
@ -137,7 +143,10 @@ public class ReadClipperTestUtils {
CigarElement lastElement = null;
int lastElementLength = 0;
for (CigarElement cigarElement : rawCigar.getCigarElements()) {
if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator())
if (lastElement != null &&
((lastElement.getOperator() == cigarElement.getOperator()) ||
(lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) ||
(lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I)))
lastElementLength += cigarElement.getLength();
else
{
@ -191,7 +200,7 @@ public class ReadClipperTestUtils {
/**
* Checks whether or not the read has any cigar element that is not H or S
*
* @param read
* @param read the read
* @return true if it has any M, I or D, false otherwise
*/
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
@ -201,5 +210,79 @@ public class ReadClipperTestUtils {
return false;
}
public static Cigar cigarFromString(String cigarString) {
Cigar cigar = new Cigar();
boolean isNumber = false;
int number = 0;
for (int i = 0; i < cigarString.length(); i++) {
char x = cigarString.charAt(i);
if (x >= '0' && x <='9') {
if (isNumber) {
number *= 10;
}
else {
isNumber = true;
}
number += x - '0';
}
else {
CigarElement e;
switch (x) {
case 'M':
case 'm':
e = new CigarElement(number, CigarOperator.M);
break;
case 'I':
case 'i':
e = new CigarElement(number, CigarOperator.I);
break;
case 'D':
case 'd':
e = new CigarElement(number, CigarOperator.D);
break;
case 'S':
case 's':
e = new CigarElement(number, CigarOperator.S);
break;
case 'N':
case 'n':
e = new CigarElement(number, CigarOperator.N);
break;
case 'H':
case 'h':
e = new CigarElement(number, CigarOperator.H);
break;
case 'P':
case 'p':
e = new CigarElement(number, CigarOperator.P);
break;
case '=':
e = new CigarElement(number, CigarOperator.EQ);
break;
case 'X':
case 'x':
e = new CigarElement(number, CigarOperator.X);
break;
default:
throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")");
}
cigar.add(e);
}
}
return cigar;
}
}