diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 13969eb54..c4a0480ef 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -180,7 +180,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot for (final PileupElement p : pileup) { final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus); - candidateHaplotypeQueue.add(haplotypeFromRead); + if ( haplotypeFromRead != null ) + candidateHaplotypeQueue.add(haplotypeFromRead); } // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes @@ -230,8 +231,18 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return null; } + /** + * Return a haplotype object constructed from the read or null if read's cigar is null + * + * @param p pileup element representing the read + * @param contextSize the context size to use + * @param locus the position + * @return possibly null Haplotype object constructed from the read + */ private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { final GATKSAMRecord read = p.getRead(); + if ( read.getCigar() == null ) + return null; final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); @@ -347,6 +358,10 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot double expected = 0.0; double mismatches = 0.0; + final GATKSAMRecord read = p.getRead(); + if ( read.getCigar() == null ) + return 0.0; + // What's the expected mismatch rate under the model that this read is actually sampled from // this haplotype? Let's assume the consensus base c is a random choice one of A, C, G, or T, and that // the observed base is actually from a c with an error rate e. Since e is the rate at which we'd @@ -358,14 +373,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n final byte[] haplotypeBases = haplotype.getBases(); - final GATKSAMRecord read = p.getRead(); byte[] readBases = read.getReadBases(); readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string byte[] readQuals = read.getBaseQualities(); readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string - int readOffsetFromPileup = p.getOffset(); - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus); + int readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus); final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; for (int i = 0; i < contextSize; i++) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index ddca5e0b8..df05a5ea2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -87,7 +87,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio if (alleleLikelihoodMap == null) { // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { + if ( isUsableBase(p) && p.getRead().getCigar() != null ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); readPos = getFinalReadPosition(p.getRead(),readPos); @@ -105,7 +105,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); - if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED || read.getCigar() == null ) continue; int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, 0, 0 ); final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index a45123b8b..c675289d4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -404,7 +404,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); - haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0) ); + haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0, true) ); if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 ) { // protect against SW failures return false; @@ -445,7 +445,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); - h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); + h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0, true) ); if ( haplotype.isArtificialHaplotype() ) { h.setArtificialEvent(haplotype.getArtificialEvent()); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index ad554a130..044fb1dcf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -767,7 +767,7 @@ public class IndelRealigner extends ReadWalker { final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0); if ( improvement >= LOD_THRESHOLD ) { - bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference); + bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference, true); // start cleaning the appropriate reads for ( Pair indexPair : bestConsensus.readIndexes ) { @@ -926,7 +926,7 @@ public class IndelRealigner extends ReadWalker { // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); if ( numBlocks == 2 ) { - Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-leftmostIndex, 0); + Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-leftmostIndex, 0, true); aRead.setCigar(newCigar, false); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index 796b04923..6e91b8514 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -110,7 +110,7 @@ public class LeftAlignIndels extends ReadWalker { // move existing indels (for 1 indel reads only) to leftmost position within identical sequence int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); if ( numBlocks == 2 ) { - Cigar newCigar = AlignmentUtils.leftAlignIndel(IndelRealigner.unclipCigar(read.getCigar()), ref.getBases(), read.getReadBases(), 0, 0); + Cigar newCigar = AlignmentUtils.leftAlignIndel(IndelRealigner.unclipCigar(read.getCigar()), ref.getBases(), read.getReadBases(), 0, 0, true); newCigar = IndelRealigner.reclipCigar(newCigar, read); read.setCigar(newCigar); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 125c738d3..22561a66d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -153,7 +153,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("87fe31a4bbd68a9eb5d5910db5011c82")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("bd8c30b99d0ac7c4108e3d88c272a996")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index 17f75229a..95c42a336 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -163,7 +163,7 @@ public class LeftAlignVariants extends RodWalker { Cigar originalCigar = new Cigar(elements); // left align the CIGAR - Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0); + Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0, true); // update if necessary and write if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index f29721a7e..eec615491 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,11 +31,9 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.recalibration.EventType; import java.util.ArrayList; @@ -66,7 +64,27 @@ public final class AlignmentUtils { // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single // todo -- high performance implementation. We can do a lot better than this right now + + /** + * Count how many bases mismatch the reference. Indels are not considered mismatching. + * + * @param r the sam record to check against + * @param refSeq the byte array representing the reference sequence + * @param refIndex the index in the reference byte array of the read's first base + * @param startOnRead the index in the read's bases from which we start counting + * @param nReadBases the number of bases after (but including) startOnRead that we check + * @return non-null object representing the mismatch count + */ + @Ensures("result != null") public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { + if ( r == null ) throw new IllegalArgumentException("attempting to calculate the mismatch count from a read that is null"); + if ( refSeq == null ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference sequence that is null"); + if ( refIndex < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference index that is negative"); + if ( startOnRead < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count with a read start that is negative"); + if ( nReadBases < 0 ) throw new IllegalArgumentException("attempting to calculate the mismatch count for a negative number of read bases"); + if ( refSeq.length - refIndex < r.getReadLength() ) + throw new IllegalArgumentException("attempting to calculate the mismatch count against a reference string that is smaller than the read"); + MismatchCount mc = new MismatchCount(); int readIdx = 0; @@ -241,7 +259,24 @@ public final class AlignmentUtils { return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isDeletion(), alignmentStart, refLocus ); } + /** + * Calculate the index into the read's bases of the beginning of the encompassing cigar element for a given cigar and offset + * + * @param cigar the read's CIGAR -- cannot be null + * @param offset the offset to use for the calculation or -1 if in the middle of a deletion + * @param isDeletion are we in the middle of a deletion? + * @param alignmentStart the alignment start of the read + * @param refLocus the reference position of the offset + * @return a non-negative int index + */ + @Ensures("result >= 0") public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus) { + if ( cigar == null ) throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null"); + if ( offset < -1 ) throw new IllegalArgumentException("attempting to find the alignment position with an offset that is negative (and not -1)"); + if ( alignmentStart < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from an alignment start that is negative"); + if ( refLocus < 0 ) throw new IllegalArgumentException("attempting to find the alignment position from a reference position that is negative"); + if ( offset >= cigar.getReadLength() ) throw new IllegalArgumentException("attempting to find the alignment position of an offset than is larger than the read length"); + int pileupOffset = offset; // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases @@ -302,32 +337,19 @@ public final class AlignmentUtils { return alignmentPos; } + /** + * Generate an array of bases for just those that are aligned to the reference (i.e. no clips or insertions) + * + * @param cigar the read's CIGAR -- cannot be null + * @param read the read's base array + * @return a non-null array of bases (bytes) + */ + @Ensures("result != null") public static byte[] readToAlignmentByteArray(final Cigar cigar, final byte[] read) { + if ( cigar == null ) throw new IllegalArgumentException("attempting to generate an alignment from a CIGAR that is null"); + if ( read == null ) throw new IllegalArgumentException("attempting to generate an alignment from a read sequence that is null"); - int alignmentLength = 0; - for (int iii = 0; iii < cigar.numCigarElements(); iii++) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch (ce.getOperator()) { - case D: - case N: - case M: - case EQ: - case X: - alignmentLength += elementLength; - break; - case I: - case S: - case H: - case P: - break; - default: - throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); - } - } - + final int alignmentLength = cigar.getReferenceLength(); final byte[] alignment = new byte[alignmentLength]; int alignPos = 0; int readPos = 0; @@ -339,35 +361,31 @@ public final class AlignmentUtils { switch (ce.getOperator()) { case I: if (alignPos > 0) { - if (alignment[alignPos - 1] == BaseUtils.Base.A.base) { - alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.Base.C.base) { - alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.Base.T.base) { - alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.Base.G.base) { - alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; + final int prevPos = alignPos - 1; + if (alignment[prevPos] == BaseUtils.Base.A.base) { + alignment[prevPos] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[prevPos] == BaseUtils.Base.C.base) { + alignment[prevPos] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[prevPos] == BaseUtils.Base.T.base) { + alignment[prevPos] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; + } else if (alignment[prevPos] == BaseUtils.Base.G.base) { + alignment[prevPos] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } } case S: - for (int jjj = 0; jjj < elementLength; jjj++) { - readPos++; - } + readPos += elementLength; break; case D: case N: for (int jjj = 0; jjj < elementLength; jjj++) { - alignment[alignPos] = PileupElement.DELETION_BASE; - alignPos++; + alignment[alignPos++] = PileupElement.DELETION_BASE; } break; case M: case EQ: case X: for (int jjj = 0; jjj < elementLength; jjj++) { - alignment[alignPos] = read[readPos]; - alignPos++; - readPos++; + alignment[alignPos++] = read[readPos++]; } break; case H: @@ -450,34 +468,98 @@ public final class AlignmentUtils { * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are * always the positions where the cigar starts on the ref and on the read, respectively. *

- * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar - * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT - * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence - * is not repeated), the original cigar is returned. + * If the alignment has one or more indels, this method attempts to move them left across a stretch of repetitive bases. + * For instance, if the original cigar specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output + * cigar will always mark the leftmost AT as deleted. If there is no indel in the original cigar or if the indel position + * is determined unambiguously (i.e. inserted/deleted sequence is not repeated), the original cigar is returned. + * + * Note that currently we do not actually support the case where there is more than one indel in the alignment. We will throw + * an exception if there is -- unless the * * @param cigar structure of the original alignment * @param refSeq reference sequence the read is aligned to * @param readSeq read sequence * @param refIndex 0-based alignment start position on ref * @param readIndex 0-based alignment start position on read - * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) + * @param doNotThrowExceptionForMultipleIndels if true we will not throw an exception if we encounter multiple indels in the alignment will instead will return the original cigar + * @return a non-null cigar, in which the indels are guaranteed to be placed at the leftmost possible position across a repeat (if any) */ - public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { + @Ensures("result != null") + public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex, final boolean doNotThrowExceptionForMultipleIndels) { + ensureLeftAlignmentHasGoodArguments(cigar, refSeq, readSeq, refIndex, readIndex); + + final int numIndels = countIndelElements(cigar); + if ( numIndels == 0 ) + return cigar; + if ( numIndels == 1 ) + return leftAlignSingleIndel(cigar, refSeq, readSeq, refIndex, readIndex); + + // if we got here then there is more than 1 indel in the alignment + if ( doNotThrowExceptionForMultipleIndels ) + return cigar; + + throw new UnsupportedOperationException("attempting to left align a CIGAR that has more than 1 indel in its alignment but this functionality has not been implemented yet"); + } + + private static void ensureLeftAlignmentHasGoodArguments(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { + if ( cigar == null ) throw new IllegalArgumentException("attempting to left align a CIGAR that is null"); + if ( refSeq == null ) throw new IllegalArgumentException("attempting to left align a reference sequence that is null"); + if ( readSeq == null ) throw new IllegalArgumentException("attempting to left align a read sequence that is null"); + if ( refIndex < 0 ) throw new IllegalArgumentException("attempting to left align with a reference index less than 0"); + if ( readIndex < 0 ) throw new IllegalArgumentException("attempting to left align with a read index less than 0"); + } + + /** + * Counts the number of I/D operators + * + * @param cigar cigar to check -- cannot be null + * @return non-negative count of indel operators + */ + @Requires("cigar != null") + @Ensures("result >= 0") + private static int countIndelElements(final Cigar cigar) { + int indelCount = 0; + for ( CigarElement ce : cigar.getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) + indelCount++; + } + return indelCount; + } + + /** + * See the documentation for AlignmentUtils.leftAlignIndel() for more details. + * + * This flavor of the left alignment works if and only if the alignment has one - and only one - indel. + * An exception is thrown if there are no indels or more than 1 indel in the alignment. + * + * @param cigar structure of the original alignment -- cannot be null + * @param refSeq reference sequence the read is aligned to + * @param readSeq read sequence + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return a non-null cigar, in which the single indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) + */ + @Ensures("result != null") + public static Cigar leftAlignSingleIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { + ensureLeftAlignmentHasGoodArguments(cigar, refSeq, readSeq, refIndex, readIndex); int indexOfIndel = -1; for (int i = 0; i < cigar.numCigarElements(); i++) { CigarElement ce = cigar.getCigarElement(i); if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { - // if there is more than 1 indel, don't left align + // if there is more than 1 indel, exception out if (indexOfIndel != -1) - return cigar; + throw new IllegalArgumentException("attempting to left align a CIGAR that has more than 1 indel in its alignment"); indexOfIndel = i; } } - // if there is no indel or if the alignment starts with an insertion (so that there - // is no place on the read to move that insertion further left), we are done - if (indexOfIndel < 1) return cigar; + // if there is no indel, exception out + if ( indexOfIndel == -1 ) + throw new IllegalArgumentException("attempting to left align a CIGAR that has no indels in its alignment"); + // if the alignment starts with an insertion (so that there is no place on the read to move that insertion further left), we are done + if ( indexOfIndel == 0 ) + return cigar; final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); @@ -545,6 +627,15 @@ public final class AlignmentUtils { return new Cigar(elements); } + /** + * Move the indel in a given cigar string one base to the left + * + * @param cigar original cigar + * @param indexOfIndel the index of the indel cigar element + * @return non-null cigar with indel moved one base to the left + */ + @Requires("cigar != null && indexOfIndel >= 0 && indexOfIndel < cigar.numCigarElements()") + @Ensures("result != null") private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { // get the first few elements ArrayList elements = new ArrayList(cigar.numCigarElements()); @@ -568,6 +659,19 @@ public final class AlignmentUtils { return new Cigar(elements); } + /** + * Create the string (really a byte array) representation of an indel-containing cigar against the reference. + * + * @param cigar the indel-containing cigar + * @param indexOfIndel the index of the indel cigar element + * @param refSeq the reference sequence + * @param readSeq the read sequence for the cigar + * @param refIndex the starting reference index into refSeq + * @param readIndex the starting read index into readSeq + * @return non-null byte array which is the indel representation against the reference + */ + @Requires("cigar != null && indexOfIndel >= 0 && indexOfIndel < cigar.numCigarElements() && refSeq != null && readSeq != null && refIndex >= 0 && readIndex >= 0") + @Ensures("result != null") private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { CigarElement indel = cigar.getCigarElement(indexOfIndel); int indelLength = indel.getLength(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index d9f514593..4338d27e4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -325,4 +326,318 @@ public class AlignmentUtilsUnitTest { final int actual = AlignmentUtils.calcNumHighQualitySoftClips(read, (byte) qualThreshold); Assert.assertEquals(actual, numExpected, "Wrong number of soft clips detected for read " + read.getSAMString()); } + + //////////////////////////////////////////// + // Test AlignmentUtils.getMismatchCount() // + //////////////////////////////////////////// + + @DataProvider(name = "MismatchCountDataProvider") + public Object[][] makeMismatchCountDataProvider() { + List tests = new ArrayList(); + + final int readLength = 20; + final int lengthOfIndel = 2; + final int locationOnReference = 10; + final byte[] reference = Utils.dupBytes((byte)'A', readLength); + final byte[] quals = Utils.dupBytes((byte)'A', readLength); + + + for ( int startOnRead = 0; startOnRead <= readLength; startOnRead++ ) { + for ( int basesToRead = 0; basesToRead <= readLength; basesToRead++ ) { + for ( final int lengthOfSoftClip : Arrays.asList(0, 1, 10) ) { + for ( final int lengthOfFirstM : Arrays.asList(0, 3) ) { + for ( final char middleOp : Arrays.asList('M', 'D', 'I') ) { + for ( final int mismatchLocation : Arrays.asList(-1, 0, 5, 10, 15, 19) ) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, locationOnReference, readLength); + + // set the read's bases and quals + final byte[] readBases = reference.clone(); + // create the mismatch if requested + if ( mismatchLocation != -1 ) + readBases[mismatchLocation] = (byte)'C'; + read.setReadBases(readBases); + read.setBaseQualities(quals); + + // create the CIGAR string + read.setCigarString(buildTestCigarString(middleOp, lengthOfSoftClip, lengthOfFirstM, lengthOfIndel, readLength)); + + // now, determine whether or not there's a mismatch + final boolean isMismatch; + if ( mismatchLocation < startOnRead || mismatchLocation >= startOnRead + basesToRead || mismatchLocation < lengthOfSoftClip ) { + isMismatch = false; + } else if ( middleOp == 'M' || middleOp == 'D' || mismatchLocation < lengthOfSoftClip + lengthOfFirstM || mismatchLocation >= lengthOfSoftClip + lengthOfFirstM + lengthOfIndel ) { + isMismatch = true; + } else { + isMismatch = false; + } + + tests.add(new Object[]{read, locationOnReference, startOnRead, basesToRead, isMismatch}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MismatchCountDataProvider") + public void testMismatchCountData(final GATKSAMRecord read, final int refIndex, final int startOnRead, final int basesToRead, final boolean isMismatch) { + final byte[] reference = Utils.dupBytes((byte)'A', 100); + final int actual = AlignmentUtils.getMismatchCount(read, reference, refIndex, startOnRead, basesToRead).numMismatches; + Assert.assertEquals(actual, isMismatch ? 1 : 0, "Wrong number of mismatches detected for read " + read.getSAMString()); + } + + private static String buildTestCigarString(final char middleOp, final int lengthOfSoftClip, final int lengthOfFirstM, final int lengthOfIndel, final int readLength) { + final StringBuilder cigar = new StringBuilder(); + int remainingLength = readLength; + if ( lengthOfSoftClip > 0 ) { + cigar.append(lengthOfSoftClip + "S"); + remainingLength -= lengthOfSoftClip; + } + + if ( middleOp == 'M' ) { + cigar.append(remainingLength + "M"); + } else { + if ( lengthOfFirstM > 0 ) { + cigar.append(lengthOfFirstM + "M"); + remainingLength -= lengthOfFirstM; + } + + if ( middleOp == 'D' ) { + cigar.append(lengthOfIndel + "D"); + } else { + cigar.append(lengthOfIndel + "I"); + remainingLength -= lengthOfIndel; + } + cigar.append(remainingLength + "M"); + } + + return cigar.toString(); + } + + //////////////////////////////////////////////////////// + // Test AlignmentUtils.calcAlignmentByteArrayOffset() // + //////////////////////////////////////////////////////// + + @DataProvider(name = "AlignmentByteArrayOffsetDataProvider") + public Object[][] makeAlignmentByteArrayOffsetDataProvider() { + List tests = new ArrayList(); + + final int readLength = 20; + final int lengthOfIndel = 2; + final int locationOnReference = 20; + + for ( int offset = 0; offset < readLength; offset++ ) { + for ( final int lengthOfSoftClip : Arrays.asList(0, 1, 10) ) { + for ( final int lengthOfFirstM : Arrays.asList(0, 3) ) { + for ( final char middleOp : Arrays.asList('M', 'D', 'I') ) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, locationOnReference, readLength); + // create the CIGAR string + read.setCigarString(buildTestCigarString(middleOp, lengthOfSoftClip, lengthOfFirstM, lengthOfIndel, readLength)); + + // now, determine the expected alignment offset + final int expected; + boolean isDeletion = false; + if ( offset < lengthOfSoftClip ) { + expected = 0; + } else if ( middleOp == 'M' || offset < lengthOfSoftClip + lengthOfFirstM ) { + expected = offset - lengthOfSoftClip; + } else if ( offset < lengthOfSoftClip + lengthOfFirstM + lengthOfIndel ) { + if ( middleOp == 'D' ) { + isDeletion = true; + expected = offset - lengthOfSoftClip; + } else { + expected = lengthOfFirstM; + } + } else { + expected = offset - lengthOfSoftClip - (middleOp == 'I' ? lengthOfIndel : -lengthOfIndel); + } + + tests.add(new Object[]{read.getCigar(), offset, expected, isDeletion, lengthOfSoftClip}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AlignmentByteArrayOffsetDataProvider") + public void testAlignmentByteArrayOffsetData(final Cigar cigar, final int offset, final int expectedResult, final boolean isDeletion, final int lengthOfSoftClip) { + final int actual = AlignmentUtils.calcAlignmentByteArrayOffset(cigar, isDeletion ? -1 : offset, isDeletion, 20, 20 + offset - lengthOfSoftClip); + Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString()); + } + + //////////////////////////////////////////////////// + // Test AlignmentUtils.readToAlignmentByteArray() // + //////////////////////////////////////////////////// + + @DataProvider(name = "ReadToAlignmentByteArrayDataProvider") + public Object[][] makeReadToAlignmentByteArrayDataProvider() { + List tests = new ArrayList(); + + final int readLength = 20; + final int lengthOfIndel = 2; + final int locationOnReference = 20; + + for ( final int lengthOfSoftClip : Arrays.asList(0, 1, 10) ) { + for ( final int lengthOfFirstM : Arrays.asList(0, 3) ) { + for ( final char middleOp : Arrays.asList('M', 'D', 'I') ) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, locationOnReference, readLength); + // create the CIGAR string + read.setCigarString(buildTestCigarString(middleOp, lengthOfSoftClip, lengthOfFirstM, lengthOfIndel, readLength)); + + // now, determine the byte array size + final int expected = readLength - lengthOfSoftClip - (middleOp == 'I' ? lengthOfIndel : (middleOp == 'D' ? -lengthOfIndel : 0)); + final int indelBasesStart = middleOp != 'M' ? lengthOfFirstM : -1; + + tests.add(new Object[]{read.getCigar(), expected, middleOp, indelBasesStart, lengthOfIndel}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadToAlignmentByteArrayDataProvider") + public void testReadToAlignmentByteArrayData(final Cigar cigar, final int expectedLength, final char middleOp, final int startOfIndelBases, final int lengthOfDeletion) { + final byte[] read = Utils.dupBytes((byte)'A', cigar.getReadLength()); + final byte[] alignment = AlignmentUtils.readToAlignmentByteArray(cigar, read); + + Assert.assertEquals(alignment.length, expectedLength, "Wrong alignment length detected for cigar " + cigar.toString()); + + for ( int i = 0; i < alignment.length; i++ ) { + final byte expectedBase; + if ( middleOp == 'D' && i >= startOfIndelBases && i < startOfIndelBases + lengthOfDeletion ) + expectedBase = PileupElement.DELETION_BASE; + else if ( middleOp == 'I' && i == startOfIndelBases - 1 ) + expectedBase = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; + else + expectedBase = (byte)'A'; + Assert.assertEquals(alignment[i], expectedBase, "Wrong base detected at position " + i); + } + } + + ////////////////////////////////////////// + // Test AlignmentUtils.leftAlignIndel() // + ////////////////////////////////////////// + + @DataProvider(name = "LeftAlignIndelDataProvider") + public Object[][] makeLeftAlignIndelDataProvider() { + List tests = new ArrayList(); + + final byte[] repeat1Reference = "ABCDEFGHIJKLMNOPXXXXXXXXXXABCDEFGHIJKLMNOP".getBytes(); + final byte[] repeat2Reference = "ABCDEFGHIJKLMNOPXYXYXYXYXYABCDEFGHIJKLMNOP".getBytes(); + final byte[] repeat3Reference = "ABCDEFGHIJKLMNOPXYZXYZXYZXYZABCDEFGHIJKLMN".getBytes(); + final int referenceLength = repeat1Reference.length; + + for ( int indelStart = 0; indelStart < repeat1Reference.length; indelStart++ ) { + for ( final int indelSize : Arrays.asList(0, 1, 2, 3, 4) ) { + for ( final char indelOp : Arrays.asList('D', 'I') ) { + + if ( indelOp == 'D' && indelStart + indelSize >= repeat1Reference.length ) + continue; + + final int readLength = referenceLength - (indelOp == 'D' ? indelSize : -indelSize); + + // create the original CIGAR string + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); + read.setCigarString(buildTestCigarString(indelSize == 0 ? 'M' : indelOp, 0, indelStart, indelSize, readLength)); + final Cigar originalCigar = read.getCigar(); + + final Cigar expectedCigar1 = makeExpectedCigar1(originalCigar, indelOp, indelStart, indelSize, readLength); + final byte[] readString1 = makeReadString(repeat1Reference, indelOp, indelStart, indelSize, readLength, 1); + tests.add(new Object[]{originalCigar, expectedCigar1, repeat1Reference, readString1, 1}); + + final Cigar expectedCigar2 = makeExpectedCigar2(originalCigar, indelOp, indelStart, indelSize, readLength); + final byte[] readString2 = makeReadString(repeat2Reference, indelOp, indelStart, indelSize, readLength, 2); + tests.add(new Object[]{originalCigar, expectedCigar2, repeat2Reference, readString2, 2}); + + final Cigar expectedCigar3 = makeExpectedCigar3(originalCigar, indelOp, indelStart, indelSize, readLength); + final byte[] readString3 = makeReadString(repeat3Reference, indelOp, indelStart, indelSize, readLength, 3); + tests.add(new Object[]{originalCigar, expectedCigar3, repeat3Reference, readString3, 3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Cigar makeExpectedCigar1(final Cigar originalCigar, final char indelOp, final int indelStart, final int indelSize, final int readLength) { + if ( indelSize == 0 || indelStart < 17 || indelStart > (26 - (indelOp == 'D' ? indelSize : 0)) ) + return originalCigar; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); + read.setCigarString(buildTestCigarString(indelOp, 0, 16, indelSize, readLength)); + return read.getCigar(); + } + + private Cigar makeExpectedCigar2(final Cigar originalCigar, final char indelOp, final int indelStart, final int indelSize, final int readLength) { + if ( indelStart < 17 || indelStart > (26 - (indelOp == 'D' ? indelSize : 0)) ) + return originalCigar; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); + + if ( indelOp == 'I' && (indelSize == 1 || indelSize == 3) && indelStart % 2 == 1 ) + read.setCigarString(buildTestCigarString(indelOp, 0, Math.max(indelStart - indelSize, 16), indelSize, readLength)); + else if ( (indelSize == 2 || indelSize == 4) && (indelOp == 'D' || indelStart % 2 == 0) ) + read.setCigarString(buildTestCigarString(indelOp, 0, 16, indelSize, readLength)); + else + return originalCigar; + + return read.getCigar(); + } + + private Cigar makeExpectedCigar3(final Cigar originalCigar, final char indelOp, final int indelStart, final int indelSize, final int readLength) { + if ( indelStart < 17 || indelStart > (28 - (indelOp == 'D' ? indelSize : 0)) ) + return originalCigar; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); + + if ( indelSize == 3 && (indelOp == 'D' || indelStart % 3 == 1) ) + read.setCigarString(buildTestCigarString(indelOp, 0, 16, indelSize, readLength)); + else if ( (indelOp == 'I' && indelSize == 4 && indelStart % 3 == 2) || + (indelOp == 'I' && indelSize == 2 && indelStart % 3 == 0) || + (indelOp == 'I' && indelSize == 1 && indelStart < 28 && indelStart % 3 == 2) ) + read.setCigarString(buildTestCigarString(indelOp, 0, Math.max(indelStart - indelSize, 16), indelSize, readLength)); + else + return originalCigar; + + return read.getCigar(); + } + + private static byte[] makeReadString(final byte[] reference, final char indelOp, final int indelStart, final int indelSize, final int readLength, final int repeatLength) { + final byte[] readString = new byte[readLength]; + + if ( indelOp == 'D' && indelSize > 0 ) { + System.arraycopy(reference, 0, readString, 0, indelStart); + System.arraycopy(reference, indelStart + indelSize, readString, indelStart, readLength - indelStart); + } else if ( indelOp == 'I' && indelSize > 0 ) { + System.arraycopy(reference, 0, readString, 0, indelStart); + for ( int i = 0; i < indelSize; i++ ) { + if ( i % repeatLength == 0 ) + readString[indelStart + i] = 'X'; + else if ( i % repeatLength == 1 ) + readString[indelStart + i] = 'Y'; + else + readString[indelStart + i] = 'Z'; + } + System.arraycopy(reference, indelStart, readString, indelStart + indelSize, readLength - indelStart - indelSize); + } else { + System.arraycopy(reference, 0, readString, 0, readLength); + } + + return readString; + } + + @Test(dataProvider = "LeftAlignIndelDataProvider", enabled = true) + public void testLeftAlignIndelData(final Cigar originalCigar, final Cigar expectedCigar, final byte[] reference, final byte[] read, final int repeatLength) { + final Cigar actualCigar = AlignmentUtils.leftAlignIndel(originalCigar, reference, read, 0, 0, true); + Assert.assertTrue(expectedCigar.equals(actualCigar), "Wrong left alignment detected for cigar " + originalCigar.toString() + " to " + actualCigar.toString() + " but expected " + expectedCigar.toString() + " with repeat length " + repeatLength); + } }