Taking care of bad cigars in the GATK
* fixed BadCigarFilter to filter out reads starting/ending in deletion and that have adjacent I/D events. * added Unit tests for BadCigarFilter * updated all exceptions in LocusIteratorByState to tell the user that he can instead run with -rf BadCigar * added the BadCigar filter to ReduceReads and RealignTargetCreator (if your walker blows up with these malformed reads, you may want to add it too)
This commit is contained in:
parent
b290152542
commit
0e93cf5297
|
|
@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
|
||||||
|
|
||||||
public boolean filterOut(final SAMRecord rec) {
|
public boolean filterOut(final SAMRecord rec) {
|
||||||
Cigar c = rec.getCigar();
|
Cigar c = rec.getCigar();
|
||||||
boolean lastElementWasIndel = false;
|
boolean previousElementWasIndel = false;
|
||||||
for ( CigarElement ce : c.getCigarElements() ) {
|
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||||
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
|
|
||||||
if ( lastElementWasIndel )
|
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
|
||||||
return true;
|
return true;
|
||||||
lastElementWasIndel = true;
|
|
||||||
} else {
|
for (CigarElement ce : c.getCigarElements()) {
|
||||||
lastElementWasIndel = false;
|
CigarOperator op = ce.getOperator();
|
||||||
|
if (op == CigarOperator.D || op == CigarOperator.I) {
|
||||||
|
if (previousElementWasIndel)
|
||||||
|
return true; // filter out reads with adjacent I/D
|
||||||
|
|
||||||
|
previousElementWasIndel = true;
|
||||||
}
|
}
|
||||||
|
else // this is a regular base (match/mismatch/hard or soft clip)
|
||||||
|
previousElementWasIndel = false; // reset the previous element
|
||||||
|
|
||||||
|
lastOp = op;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return lastOp == CigarOperator.D;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -199,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
return stepForwardOnGenome();
|
return stepForwardOnGenome();
|
||||||
} else {
|
} else {
|
||||||
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
||||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());
|
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||||
|
|
||||||
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
||||||
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
||||||
|
|
@ -236,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
||||||
// past the insertion right after that
|
// past the insertion right after that
|
||||||
if (eventDelayedFlag > 1)
|
if (eventDelayedFlag > 1)
|
||||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||||
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
||||||
eventLength = curElement.getLength();
|
eventLength = curElement.getLength();
|
||||||
eventStart = readOffset;
|
eventStart = readOffset;
|
||||||
|
|
@ -249,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
break;
|
break;
|
||||||
case D: // deletion w.r.t. the reference
|
case D: // deletion w.r.t. the reference
|
||||||
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
||||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
|
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||||
if (generateExtendedEvents) {
|
if (generateExtendedEvents) {
|
||||||
if (cigarElementCounter == 1) {
|
if (cigarElementCounter == 1) {
|
||||||
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
||||||
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
||||||
if (eventDelayedFlag > 1)
|
if (eventDelayedFlag > 1)
|
||||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||||
eventLength = curElement.getLength();
|
eventLength = curElement.getLength();
|
||||||
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
||||||
eventStart = readOffset;
|
eventStart = readOffset;
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
|
@ -233,6 +234,16 @@ public class ArtificialSAMUtils {
|
||||||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
|
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static GATKSAMRecord createArtificialRead(Cigar cigar) {
|
||||||
|
int length = cigar.getReadLength();
|
||||||
|
byte [] base = {'A'};
|
||||||
|
byte [] qual = {30};
|
||||||
|
byte [] bases = Utils.arrayFromArrayWithLength(base, length);
|
||||||
|
byte [] quals = Utils.arrayFromArrayWithLength(qual, length);
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
||||||
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
package org.broadinstitute.sting.gatk.filters;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
|
||||||
|
*
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/20/12
|
||||||
|
*/
|
||||||
|
public class BadCigarFilterUnitTest {
|
||||||
|
|
||||||
|
BadCigarFilter filter;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void init() {
|
||||||
|
filter = new BadCigarFilter();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWonkyCigars () {
|
||||||
|
byte[] bases = {'A', 'A', 'A', 'A'};
|
||||||
|
byte[] quals = {30, 30, 30, 30};
|
||||||
|
GATKSAMRecord read;
|
||||||
|
// starting with multiple deletions
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue