From eca9613356d2d53198671637ef9e1b18e30c0844 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 10 Aug 2012 14:54:07 -0400 Subject: [PATCH] Adding support of X and = CIGAR operators to the GATK --- .../haplotypecaller/GenotypingEngine.java | 2 + .../gatk/iterators/LocusIteratorByState.java | 2 + .../gatk/walkers/indels/IndelRealigner.java | 8 +- .../walkers/indels/SomaticIndelDetector.java | 8 +- .../broadinstitute/sting/utils/baq/BAQ.java | 2 +- .../sting/utils/sam/AlignmentUtils.java | 172 +++--------------- .../reads/TheoreticalMinimaBenchmark.java | 2 +- .../LocusIteratorByStateUnitTest.java | 40 ++++ 8 files changed, 83 insertions(+), 153 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 52c13d124..6afdc58ea 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -568,6 +568,8 @@ public class GenotypingEngine { refPos += elementLength; break; case M: + case EQ: + case X: int numSinceMismatch = -1; int stopOfMismatch = -1; int startOfMismatch = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 7d035d208..1606c227d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -195,6 +195,8 @@ public class LocusIteratorByState extends LocusIterator { done = true; break; case M: + case EQ: + case X: readOffset++; genomeOffset++; done = true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 5e0f15e6a..d61b9e9b6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -1025,7 +1025,9 @@ public class IndelRealigner extends ReadWalker { elements.add(ce); break; case M: - altIdx += elementLength; + case EQ: + case X: + altIdx += elementLength; case N: if ( reference.length < refIdx + elementLength ) ok_flag = false; @@ -1287,6 +1289,8 @@ public class IndelRealigner extends ReadWalker { int elementLength = ce.getLength(); switch ( ce.getOperator() ) { case M: + case EQ: + case X: for (int k = 0 ; k < elementLength ; k++, refIdx++, altIdx++ ) { if ( refIdx >= reference.length ) break; @@ -1432,6 +1436,8 @@ public class IndelRealigner extends ReadWalker { fromIndex += elementLength; break; case M: + case EQ: + case X: case I: System.arraycopy(actualReadBases, fromIndex, readBases, toIndex, elementLength); System.arraycopy(actualBaseQuals, fromIndex, baseQuals, toIndex, elementLength); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java index ba16fd709..b0c09f78e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java @@ -2057,7 +2057,9 @@ public class SomaticIndelDetector extends ReadWalker { break; // do not count gaps or clipped bases case I: case M: - readLength += cel.getLength(); + case EQ: + case X: + readLength += cel.getLength(); break; // advance along the gapless block in the alignment default : throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator()); @@ -2094,7 +2096,9 @@ public class SomaticIndelDetector extends ReadWalker { break; case M: - for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { + case EQ: + case X: + for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! mms++; mismatch_flags[posOnRef] = 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 186452294..439a0d8ed 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -551,7 +551,7 @@ public class BAQ { switch (elt.getOperator()) { case N: return null; // cannot handle these case H : case P : case D: break; // ignore pads, hard clips, and deletions - case I : case S: case M: + case I : case S: case M: case EQ: case X: int prev = readI; readI += elt.getLength(); if ( includeClippedBases || elt.getOperator() != CigarOperator.S) { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 2c388a1e0..4f1e66ba2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.recalibration.EventType; import java.util.ArrayList; import java.util.Arrays; -import java.util.BitSet; public class AlignmentUtils { @@ -71,9 +70,18 @@ public class AlignmentUtils { if (readIdx > endOnRead) break; CigarElement ce = c.getCigarElement(i); + final int elementLength = ce.getLength(); switch (ce.getOperator()) { + case X: + mc.numMismatches += elementLength; + for (int j = 0; j < elementLength; j++) + mc.mismatchQualities += r.getBaseQualities()[readIdx+j]; + case EQ: + refIndex += elementLength; + readIdx += elementLength; + break; case M: - for (int j = 0; j < ce.getLength(); j++, refIndex++, readIdx++) { + for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) { if (refIndex >= refSeq.length) continue; if (readIdx < startOnRead) continue; @@ -92,11 +100,11 @@ public class AlignmentUtils { break; case I: case S: - readIdx += ce.getLength(); + readIdx += elementLength; break; case D: case N: - refIndex += ce.getLength(); + refIndex += elementLength; break; case H: case P: @@ -164,6 +172,8 @@ public class AlignmentUtils { CigarElement ce = c.getCigarElement(i); int cigarElementLength = ce.getLength(); switch (ce.getOperator()) { + case EQ: + case X: case M: for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { // are we past the ref window? @@ -204,111 +214,6 @@ public class AlignmentUtils { return sum; } - /** - * Returns the number of mismatches in the pileup element within the given reference context. - * - * @param read the SAMRecord - * @param ref the reference context - * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good - * @param windowSize window size (on each side) to test - * @return a bitset representing which bases are good - */ - public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { - // first determine the positions with mismatches - int readLength = read.getReadLength(); - BitSet mismatches = new BitSet(readLength); - - // it's possible we aren't starting at the beginning of a read, - // and we don't need to look at any of the previous context outside our window - // (although we do need future context) - int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); - int currentReadPos = read.getAlignmentStart(); - - byte[] refBases = ref.getBases(); - int refIndex = readStartPos - ref.getWindow().getStart(); - if (refIndex < 0) { - throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); - } - - byte[] readBases = read.getReadBases(); - int readIndex = 0; - - Cigar c = read.getCigar(); - - for (int i = 0; i < c.numCigarElements(); i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch (ce.getOperator()) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++) { - // skip over unwanted bases - if (currentReadPos++ < readStartPos) - continue; - - // this is possible if reads extend beyond the contig end - if (refIndex >= refBases.length) - break; - - byte refChr = refBases[refIndex]; - byte readChr = readBases[readIndex]; - if (readChr != refChr) - mismatches.set(readIndex); - - refIndex++; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - if (currentReadPos >= readStartPos) - refIndex += cigarElementLength; - currentReadPos += cigarElementLength; - break; - case H: - case P: - break; - } - } - - // all bits are set to false by default - BitSet result = new BitSet(readLength); - - int currentPos = 0, leftPos = 0, rightPos; - int mismatchCount = 0; - - // calculate how many mismatches exist in the windows to the left/right - for (rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { - if (mismatches.get(rightPos)) - mismatchCount++; - } - if (mismatchCount <= maxMismatches) - result.set(currentPos); - - // now, traverse over the read positions - while (currentPos < readLength) { - // add a new rightmost position - if (rightPos < readLength && mismatches.get(rightPos++)) - mismatchCount++; - // re-penalize the previous position - if (mismatches.get(currentPos++)) - mismatchCount++; - // don't penalize the current position - if (mismatches.get(currentPos)) - mismatchCount--; - // subtract the leftmost position - if (leftPos < currentPos - windowSize && mismatches.get(leftPos++)) - mismatchCount--; - - if (mismatchCount <= maxMismatches) - result.set(currentPos); - } - - return result; - } - /** * Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but @@ -367,45 +272,6 @@ public class AlignmentUtils { return n; } - public static byte[] alignmentToByteArray(final Cigar cigar, final byte[] read, final byte[] ref) { - - final byte[] alignment = new byte[read.length]; - int refPos = 0; - int alignPos = 0; - - for (int iii = 0; iii < cigar.numCigarElements(); iii++) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch (ce.getOperator()) { - case I: - case S: - for (int jjj = 0; jjj < elementLength; jjj++) { - alignment[alignPos++] = '+'; - } - break; - case D: - case N: - refPos += elementLength; - break; - case M: - for (int jjj = 0; jjj < elementLength; jjj++) { - alignment[alignPos] = ref[refPos]; - alignPos++; - refPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); - } - } - return alignment; - } - public static int calcNumHighQualitySoftClips( final GATKSAMRecord read, final byte qualThreshold ) { int numHQSoftClips = 0; @@ -426,6 +292,8 @@ public class AlignmentUtils { break; case M: case I: + case EQ: + case X: alignPos += elementLength; break; case H: @@ -488,6 +356,8 @@ public class AlignmentUtils { } break; case M: + case EQ: + case X: if (pos + elementLength - 1 >= pileupOffset) { return alignmentPos + (pileupOffset - pos); } else { @@ -519,6 +389,8 @@ public class AlignmentUtils { case D: case N: case M: + case EQ: + case X: alignmentLength += elementLength; break; case I: @@ -565,6 +437,8 @@ public class AlignmentUtils { } break; case M: + case EQ: + case X: for (int jjj = 0; jjj < elementLength; jjj++) { alignment[alignPos] = read[readPos]; alignPos++; @@ -798,6 +672,8 @@ public class AlignmentUtils { switch (ce.getOperator()) { case M: + case EQ: + case X: readIndex += length; refIndex += length; totalRefBases += length; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java index 8e67c9efc..1abca5487 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java @@ -94,7 +94,7 @@ public class TheoreticalMinimaBenchmark extends ReadProcessingBenchmark { int elementSize = cigarElement.getLength(); while(elementSize > 0) { switch(cigarElement.getOperator()) { - case M: matchMismatches++; break; + case M: case EQ: case X: matchMismatches++; break; case I: insertions++; break; case D: deletions++; break; default: others++; break; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 218548b00..dc908c323 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -41,6 +41,46 @@ public class LocusIteratorByStateUnitTest extends BaseTest { return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); } + @Test + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads = Arrays.asList(r1, r2, r3, r4); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + @Test public void testIndelsInRegularPileup() { final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'};