Updated and more thorough version of the BadCigar read filter
* No reads with Hard/Soft clips in the middle of the cigar * No reads starting with deletions (with or without preceding clips) * No reads ending in deletions (with or without follow-up clips) * No reads that are fully hard or soft clipped * No reads that have consecutive indels in the cigar (II, DD, ID or DI) Also added systematic test for good cigars and iterative test for bad cigars.
This commit is contained in:
parent
980685af16
commit
d16cb68539
|
|
@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Filter out reads with wonky cigar strings.
|
||||
*
|
||||
* - No reads with Hard/Soft clips in the middle of the cigar
|
||||
* - No reads starting with deletions (with or without preceding clips)
|
||||
* - No reads ending in deletions (with or without follow-up clips)
|
||||
* - No reads that are fully hard or soft clipped
|
||||
* - No reads that have consecutive indels in the cigar (II, DD, ID or DI)
|
||||
*
|
||||
* ps: apparently an empty cigar is okay...
|
||||
*
|
||||
* @author ebanks
|
||||
* @version 0.1
|
||||
*/
|
||||
|
|
@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter {
|
|||
|
||||
public boolean filterOut(final SAMRecord rec) {
|
||||
final Cigar c = rec.getCigar();
|
||||
if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad
|
||||
|
||||
boolean previousElementWasIndel = false;
|
||||
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||
|
||||
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
|
||||
return true;
|
||||
|
||||
for (CigarElement ce : c.getCigarElements()) {
|
||||
CigarOperator op = ce.getOperator();
|
||||
if (op == CigarOperator.D || op == CigarOperator.I) {
|
||||
if (previousElementWasIndel)
|
||||
return true; // filter out reads with adjacent I/D
|
||||
|
||||
previousElementWasIndel = true;
|
||||
}
|
||||
else // this is a regular base (match/mismatch/hard or soft clip)
|
||||
previousElementWasIndel = false; // reset the previous element
|
||||
|
||||
lastOp = op;
|
||||
// if there is no Cigar then it can't be bad
|
||||
if( c.isEmpty() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return lastOp == CigarOperator.D;
|
||||
Iterator<CigarElement> elementIterator = c.getCigarElements().iterator();
|
||||
|
||||
CigarOperator firstOp = CigarOperator.H;
|
||||
while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) {
|
||||
CigarOperator op = elementIterator.next().getOperator();
|
||||
|
||||
// No reads with Hard/Soft clips in the middle of the cigar
|
||||
if (firstOp != CigarOperator.H && op == CigarOperator.H) {
|
||||
return true;
|
||||
}
|
||||
firstOp = op;
|
||||
}
|
||||
|
||||
// No reads starting with deletions (with or without preceding clips)
|
||||
if (firstOp == CigarOperator.D) {
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S);
|
||||
boolean previousElementWasIndel = firstOp == CigarOperator.I;
|
||||
CigarOperator lastOp = firstOp;
|
||||
CigarOperator previousOp = firstOp;
|
||||
|
||||
while (elementIterator.hasNext()) {
|
||||
CigarOperator op = elementIterator.next().getOperator();
|
||||
|
||||
if (op != CigarOperator.S && op != CigarOperator.H) {
|
||||
|
||||
// No reads with Hard/Soft clips in the middle of the cigar
|
||||
if (previousOp == CigarOperator.S || previousOp == CigarOperator.H)
|
||||
return true;
|
||||
|
||||
lastOp = op;
|
||||
|
||||
if (!hasMeaningfulElements && op.consumesReadBases()) {
|
||||
hasMeaningfulElements = true;
|
||||
}
|
||||
|
||||
if (op == CigarOperator.I || op == CigarOperator.D) {
|
||||
|
||||
// No reads that have consecutive indels in the cigar (II, DD, ID or DI)
|
||||
if (previousElementWasIndel) {
|
||||
return true;
|
||||
}
|
||||
previousElementWasIndel = true;
|
||||
}
|
||||
else {
|
||||
previousElementWasIndel = false;
|
||||
}
|
||||
}
|
||||
// No reads with Hard/Soft clips in the middle of the cigar
|
||||
else if (op == CigarOperator.S && previousOp == CigarOperator.H) {
|
||||
return true;
|
||||
}
|
||||
|
||||
previousOp = op;
|
||||
}
|
||||
|
||||
// No reads ending in deletions (with or without follow-up clips)
|
||||
// No reads that are fully hard or soft clipped
|
||||
return lastOp == CigarOperator.D || !hasMeaningfulElements;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,11 +1,14 @@
|
|||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import net.sf.samtools.Cigar;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
|
||||
*
|
||||
|
|
@ -14,6 +17,29 @@ import org.testng.annotations.Test;
|
|||
*/
|
||||
public class BadCigarFilterUnitTest {
|
||||
|
||||
public static final String[] BAD_CIGAR_LIST = {
|
||||
"2D4M", // starting with multiple deletions
|
||||
"4M2D", // ending with multiple deletions
|
||||
"3M1I1D", // adjacent indels AND ends in deletion
|
||||
"1M1I1D2M", // adjacent indels I->D
|
||||
"1M1D2I1M", // adjacent indels D->I
|
||||
"1M1I2M1D", // ends in single deletion with insertion in the middle
|
||||
"4M1D", // ends in single deletion
|
||||
"1D4M", // starts with single deletion
|
||||
"2M1D1D2M", // adjacent D's
|
||||
"1M1I1I1M", // adjacent I's
|
||||
"1H1D4M", // starting with deletion after H
|
||||
"1S1D3M", // starting with deletion after S
|
||||
"1H1S1D3M", // starting with deletion after HS
|
||||
"4M1D1H", // ending with deletion before H
|
||||
"3M1D1S", // ending with deletion before S
|
||||
"3M1D1S1H", // ending with deletion before HS
|
||||
"10M2H10M", // H in the middle
|
||||
"10M2S10M", // S in the middle
|
||||
"1H1S10M2S10M1S1H", // deceiving S in the middle
|
||||
"1H1S10M2H10M1S1H" // deceiving H in the middle
|
||||
};
|
||||
|
||||
BadCigarFilter filter;
|
||||
|
||||
@BeforeClass
|
||||
|
|
@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest {
|
|||
filter = new BadCigarFilter();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(enabled = true)
|
||||
public void testWonkyCigars () {
|
||||
byte[] bases = {'A', 'A', 'A', 'A'};
|
||||
byte[] quals = {30, 30, 30, 30};
|
||||
GATKSAMRecord read;
|
||||
// starting with multiple deletions
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
for (String cigarString : BAD_CIGAR_LIST) {
|
||||
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString);
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
}
|
||||
}
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
@Test(enabled = true)
|
||||
public void testGoodCigars() {
|
||||
List<Cigar> cigarList = ReadClipperTestUtils.generateCigarList(10);
|
||||
for (Cigar cigar : cigarList) {
|
||||
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
|
||||
Assert.assertFalse(filter.filterOut(read), read.getCigarString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -37,17 +38,22 @@ public class ReadClipperTestUtils {
|
|||
return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* This function generates every valid permutation of cigar strings with a given length.
|
||||
*
|
||||
* A valid cigar object obeys the following rules:
|
||||
* - No Hard/Soft clips in the middle of the read
|
||||
* - No deletions in the beginning / end of the read
|
||||
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
|
||||
*
|
||||
* @param maximumLength the maximum number of elements in the cigar
|
||||
* @return a list with all valid Cigar objects
|
||||
*/
|
||||
public static GATKSAMRecord makeReadFromCigar(String cigarString) {
|
||||
return makeReadFromCigar(cigarFromString(cigarString));
|
||||
}
|
||||
|
||||
/**
|
||||
* This function generates every valid permutation of cigar strings with a given length.
|
||||
*
|
||||
* A valid cigar object obeys the following rules:
|
||||
* - No Hard/Soft clips in the middle of the read
|
||||
* - No deletions in the beginning / end of the read
|
||||
* - No repeated adjacent element (e.g. 1M2M -> this should be 3M)
|
||||
* - No consecutive I/D elements
|
||||
*
|
||||
* @param maximumLength the maximum number of elements in the cigar
|
||||
* @return a list with all valid Cigar objects
|
||||
*/
|
||||
public static List<Cigar> generateCigarList(int maximumLength) {
|
||||
int numCigarElements = cigarElements.length;
|
||||
LinkedList<Cigar> cigarList = new LinkedList<Cigar>();
|
||||
|
|
@ -137,7 +143,10 @@ public class ReadClipperTestUtils {
|
|||
CigarElement lastElement = null;
|
||||
int lastElementLength = 0;
|
||||
for (CigarElement cigarElement : rawCigar.getCigarElements()) {
|
||||
if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator())
|
||||
if (lastElement != null &&
|
||||
((lastElement.getOperator() == cigarElement.getOperator()) ||
|
||||
(lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) ||
|
||||
(lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I)))
|
||||
lastElementLength += cigarElement.getLength();
|
||||
else
|
||||
{
|
||||
|
|
@ -191,7 +200,7 @@ public class ReadClipperTestUtils {
|
|||
/**
|
||||
* Checks whether or not the read has any cigar element that is not H or S
|
||||
*
|
||||
* @param read
|
||||
* @param read the read
|
||||
* @return true if it has any M, I or D, false otherwise
|
||||
*/
|
||||
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
|
||||
|
|
@ -201,5 +210,79 @@ public class ReadClipperTestUtils {
|
|||
return false;
|
||||
}
|
||||
|
||||
public static Cigar cigarFromString(String cigarString) {
|
||||
Cigar cigar = new Cigar();
|
||||
|
||||
boolean isNumber = false;
|
||||
int number = 0;
|
||||
for (int i = 0; i < cigarString.length(); i++) {
|
||||
char x = cigarString.charAt(i);
|
||||
|
||||
if (x >= '0' && x <='9') {
|
||||
if (isNumber) {
|
||||
number *= 10;
|
||||
}
|
||||
else {
|
||||
isNumber = true;
|
||||
}
|
||||
number += x - '0';
|
||||
}
|
||||
|
||||
else {
|
||||
CigarElement e;
|
||||
switch (x) {
|
||||
case 'M':
|
||||
case 'm':
|
||||
e = new CigarElement(number, CigarOperator.M);
|
||||
break;
|
||||
|
||||
case 'I':
|
||||
case 'i':
|
||||
e = new CigarElement(number, CigarOperator.I);
|
||||
break;
|
||||
|
||||
case 'D':
|
||||
case 'd':
|
||||
e = new CigarElement(number, CigarOperator.D);
|
||||
break;
|
||||
|
||||
case 'S':
|
||||
case 's':
|
||||
e = new CigarElement(number, CigarOperator.S);
|
||||
break;
|
||||
|
||||
case 'N':
|
||||
case 'n':
|
||||
e = new CigarElement(number, CigarOperator.N);
|
||||
break;
|
||||
|
||||
case 'H':
|
||||
case 'h':
|
||||
e = new CigarElement(number, CigarOperator.H);
|
||||
break;
|
||||
|
||||
case 'P':
|
||||
case 'p':
|
||||
e = new CigarElement(number, CigarOperator.P);
|
||||
break;
|
||||
|
||||
case '=':
|
||||
e = new CigarElement(number, CigarOperator.EQ);
|
||||
break;
|
||||
|
||||
case 'X':
|
||||
case 'x':
|
||||
e = new CigarElement(number, CigarOperator.X);
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")");
|
||||
}
|
||||
cigar.add(e);
|
||||
}
|
||||
}
|
||||
return cigar;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue