Taking care of bad cigars in the GATK
* fixed BadCigarFilter to filter out reads starting/ending in deletion and that have adjacent I/D events. * added Unit tests for BadCigarFilter * updated all exceptions in LocusIteratorByState to tell the user that he can instead run with -rf BadCigar * added the BadCigar filter to ReduceReads and RealignTargetCreator (if your walker blows up with these malformed reads, you may want to add it too)
This commit is contained in:
parent
b290152542
commit
0e93cf5297
|
|
@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
|
|||
|
||||
public boolean filterOut(final SAMRecord rec) {
|
||||
Cigar c = rec.getCigar();
|
||||
boolean lastElementWasIndel = false;
|
||||
for ( CigarElement ce : c.getCigarElements() ) {
|
||||
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
|
||||
if ( lastElementWasIndel )
|
||||
return true;
|
||||
lastElementWasIndel = true;
|
||||
} else {
|
||||
lastElementWasIndel = false;
|
||||
boolean previousElementWasIndel = false;
|
||||
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||
|
||||
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
|
||||
return true;
|
||||
|
||||
for (CigarElement ce : c.getCigarElements()) {
|
||||
CigarOperator op = ce.getOperator();
|
||||
if (op == CigarOperator.D || op == CigarOperator.I) {
|
||||
if (previousElementWasIndel)
|
||||
return true; // filter out reads with adjacent I/D
|
||||
|
||||
previousElementWasIndel = true;
|
||||
}
|
||||
else // this is a regular base (match/mismatch/hard or soft clip)
|
||||
previousElementWasIndel = false; // reset the previous element
|
||||
|
||||
lastOp = op;
|
||||
}
|
||||
|
||||
return false;
|
||||
return lastOp == CigarOperator.D;
|
||||
}
|
||||
}
|
||||
|
|
@ -199,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
return stepForwardOnGenome();
|
||||
} else {
|
||||
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());
|
||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||
|
||||
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
||||
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
||||
|
|
@ -236,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
||||
// past the insertion right after that
|
||||
if (eventDelayedFlag > 1)
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
||||
eventLength = curElement.getLength();
|
||||
eventStart = readOffset;
|
||||
|
|
@ -249,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
break;
|
||||
case D: // deletion w.r.t. the reference
|
||||
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
|
||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||
if (generateExtendedEvents) {
|
||||
if (cigarElementCounter == 1) {
|
||||
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
||||
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
||||
if (eventDelayedFlag > 1)
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||
eventLength = curElement.getLength();
|
||||
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
||||
eventStart = readOffset;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam;
|
|||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -233,7 +234,17 @@ public class ArtificialSAMUtils {
|
|||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord createArtificialRead(Cigar cigar) {
|
||||
int length = cigar.getReadLength();
|
||||
byte [] base = {'A'};
|
||||
byte [] qual = {30};
|
||||
byte [] bases = Utils.arrayFromArrayWithLength(base, length);
|
||||
byte [] quals = Utils.arrayFromArrayWithLength(qual, length);
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString());
|
||||
}
|
||||
|
||||
|
||||
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
||||
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
||||
GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/20/12
|
||||
*/
|
||||
public class BadCigarFilterUnitTest {
|
||||
|
||||
BadCigarFilter filter;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
filter = new BadCigarFilter();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWonkyCigars () {
|
||||
byte[] bases = {'A', 'A', 'A', 'A'};
|
||||
byte[] quals = {30, 30, 30, 30};
|
||||
GATKSAMRecord read;
|
||||
// starting with multiple deletions
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue