From dda2173c6669128de215870ede334621d32c5bc2 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 4 Apr 2012 16:04:29 -0400 Subject: [PATCH] Moved the Smith-Watermaning of haplotypes to earlier in the process so that alleles sent to genotyping would have the exact genomic sequence of the active region they represent. As a side effect cleaned up some edge case problems with variants, both real and false, which show up on the edges of active regions. Removed code that was replicated between the Haplotype class and ReadUtils. Finally figured out how to ensure that the indel calls coming out of the HC were left aligned. --- .../broadinstitute/sting/utils/Haplotype.java | 112 ++++-------------- .../sting/utils/sam/ReadUtils.java | 59 +++++++-- .../sting/utils/HaplotypeUnitTest.java | 4 +- 3 files changed, 73 insertions(+), 102 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 1820ddbc9..a8c622a96 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -31,6 +31,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.*; @@ -41,6 +42,8 @@ public class Haplotype { private GenomeLoc genomeLocation = null; private HashMap readLikelihoodsPerSample = null; private boolean isRef = false; + private Cigar cigar; + private int alignmentStartHapwrtRef; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -112,11 +115,7 @@ public class Haplotype { @Override public String toString() { - String returnString = ""; - for(int iii = 0; iii < bases.length; iii++) { - returnString += (char) bases[iii]; - } - return returnString; + return new String(bases); } public double[] getQuals() { @@ -134,11 +133,27 @@ public class Haplotype { return genomeLocation.getStop(); } - @Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"}) - public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) { + public int getAlignmentStartHapwrtRef() { + return alignmentStartHapwrtRef; + } + + public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) { + this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; + } + + public Cigar getCigar() { + return cigar; + } + + public void setCigar( final Cigar cigar ) { + this.cigar = cigar; + } + + @Requires({"refInsertLocation >= 0"}) + public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation ) { if( refAllele.length() != altAllele.length() ) { refInsertLocation++; } - int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStartInRefCoords, haplotypeCigar, refInsertLocation); + int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype return bases.clone(); } @@ -233,85 +248,4 @@ public class Haplotype { return haplotypeMap; } - - // BUGBUG: copied from ReadClipper and slightly modified since we don't have the data in a GATKSAMRecord - private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) { - int readBases = 0; - int refBases = 0; - boolean fallsInsideDeletion = false; - - int goal = refCoord - haplotypeStart; // The goal is to move this many reference bases - boolean goalReached = refBases == goal; - - Iterator cigarElementIterator = haplotypeCigar.getCigarElements().iterator(); - while (!goalReached && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - int shift = 0; - - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { - if (refBases + cigarElement.getLength() < goal) - shift = cigarElement.getLength(); - else - shift = goal - refBases; - - refBases += shift; - } - goalReached = refBases == goal; - - if (!goalReached && cigarElement.getOperator().consumesReadBases()) - readBases += cigarElement.getLength(); - - if (goalReached) { - // Is this base's reference position within this cigar element? Or did we use it all? - boolean endsWithinCigar = shift < cigarElement.getLength(); - - // If it isn't, we need to check the next one. There should *ALWAYS* be a next one - // since we checked if the goal coordinate is within the read length, so this is just a sanity check. - if (!endsWithinCigar && !cigarElementIterator.hasNext()) - return -1; - - CigarElement nextCigarElement; - - // if we end inside the current cigar element, we just have to check if it is a deletion - if (endsWithinCigar) - fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. - else { - nextCigarElement = cigarElementIterator.next(); - - // if it's an insertion, we need to clip the whole insertion before looking at the next element - if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { - readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) - return -1; - - nextCigarElement = cigarElementIterator.next(); - } - - // if it's a deletion, we will pass the information on to be handled downstream. - fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; - } - - // If we reached our goal outside a deletion, add the shift - if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) - readBases += shift; - - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar) - readBases += shift - 1; - - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion - else if (fallsInsideDeletion && endsWithinCigar) - readBases--; - } - } - - if (!goalReached) - return -1; - - return (fallsInsideDeletion ? -1 : readBases); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index cbb4120dd..f9975148a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -49,6 +49,7 @@ public class ReadUtils { } private static int DEFAULT_ADAPTOR_SIZE = 100; + public static int CLIPPING_GOAL_NOT_REACHED = -1; /** * A marker to tell which end of the read has been clipped @@ -362,7 +363,11 @@ public class ReadUtils { @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { - Pair result = getReadCoordinateForReferenceCoordinate(read, refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); + } + + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { + Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); int readCoord = result.getFirst(); // Corner case one: clipping the right tail and falls on deletion, move to the next @@ -374,9 +379,9 @@ public class ReadUtils { // clipping the left tail and first base is insertion, go to the next read coordinate // with the same reference coordinate. Advance to the next cigar element, or to the // end of the read if there is no next element. - Pair firstElementIsInsertion = readStartsWithInsertion(read); + Pair firstElementIsInsertion = readStartsWithInsertion(cigar); if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst()) - readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1); + readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), cigar.getReadLength() - 1); return readCoord; } @@ -400,14 +405,25 @@ public class ReadUtils { @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); + } + + public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; - int goal = refCoord - read.getSoftStart(); // The goal is to move this many reference bases + int goal = refCoord - alignmentStart; // The goal is to move this many reference bases + if (goal < 0) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + } boolean goalReached = refBases == goal; - Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); while (!goalReached && cigarElementIterator.hasNext()) { CigarElement cigarElement = cigarElementIterator.next(); int shift = 0; @@ -431,8 +447,13 @@ public class ReadUtils { // If it isn't, we need to check the next one. There should *ALWAYS* be a next one // since we checked if the goal coordinate is within the read length, so this is just a sanity check. - if (!endsWithinCigar && !cigarElementIterator.hasNext()) - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + if (!endsWithinCigar && !cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + } + } CigarElement nextCigarElement; @@ -447,8 +468,13 @@ public class ReadUtils { // if it's an insertion, we need to clip the whole insertion before looking at the next element if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + if (!cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + } + } nextCigarElement = cigarElementIterator.next(); } @@ -473,8 +499,13 @@ public class ReadUtils { } } - if (!goalReached) - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + if (!goalReached) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + } return new Pair(readBases, fallsInsideDeletion); } @@ -527,7 +558,11 @@ public class ReadUtils { * @return A pair with the answer (true/false) and the element or null if it doesn't exist */ public static Pair readStartsWithInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + return readStartsWithInsertion(read.getCigar()); + } + + public static Pair readStartsWithInsertion(final Cigar cigar) { + for (CigarElement cigarElement : cigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.INSERTION) return new Pair(true, cigarElement); diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index 86bc2d59b..87852f9ca 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -152,7 +152,9 @@ public class HaplotypeUnitTest extends BaseTest { final Haplotype h = new Haplotype(hap.getBytes()); final Allele h1refAllele = Allele.create(ref, true); final Allele h1altAllele = Allele.create(alt, false); - final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE, 0, cigar) ); + h.setAlignmentStartHapwrtRef(0); + h.setCigar(cigar); + final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE) ); final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); }