diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
index bf25023d8..b8e892101 100644
--- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
@@ -1,756 +1,745 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.utils.sam;
-
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.BitSet;
-
-
-public class AlignmentUtils {
-
- public static class MismatchCount {
- public int numMismatches = 0;
- public long mismatchQualities = 0;
- }
-
- public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
- return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
- }
-
- public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
- return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
- }
-
- // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
- // todo -- high performance implementation. We can do a lot better than this right now
- public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
- MismatchCount mc = new MismatchCount();
-
- int readIdx = 0;
- int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count
- byte[] readSeq = r.getReadBases();
- Cigar c = r.getCigar();
- for (int i = 0 ; i < c.numCigarElements() ; i++) {
-
- if ( readIdx > endOnRead ) break;
-
- CigarElement ce = c.getCigarElement(i);
- switch ( ce.getOperator() ) {
- case M:
- for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
- if ( refIndex >= refSeq.length )
- continue;
- if ( readIdx < startOnRead ) continue;
- if ( readIdx > endOnRead ) break;
- byte refChr = refSeq[refIndex];
- byte readChr = readSeq[readIdx];
- // Note: we need to count X/N's as mismatches because that's what SAM requires
- //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
- // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
- // continue; // do not count Ns/Xs/etc ?
- if ( readChr != refChr ) {
- mc.numMismatches++;
- mc.mismatchQualities += r.getBaseQualities()[readIdx];
- }
- }
- break;
- case I:
- case S:
- readIdx += ce.getLength();
- break;
- case D:
- case N:
- refIndex += ce.getLength();
- break;
- case H:
- case P:
- break;
- default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported");
- }
-
- }
- return mc;
- }
-
- /** Returns the number of mismatches in the pileup within the given reference context.
- *
- * @param pileup the pileup with reads
- * @param ref the reference context
- * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
- * @return the number of mismatches
- */
- public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
- int mismatches = 0;
- for ( PileupElement p : pileup )
- mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite);
- return mismatches;
- }
-
- /** Returns the number of mismatches in the pileup element within the given reference context.
- *
- * @param p the pileup element
- * @param ref the reference context
- * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
- * @return the number of mismatches
- */
- public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
- return mismatchesInRefWindow(p, ref, ignoreTargetSite, false);
- }
-
- /** Returns the number of mismatches in the pileup element within the given reference context.
- *
- * @param p the pileup element
- * @param ref the reference context
- * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
- * @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count
- * @return the number of mismatches
- */
- public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) {
- int sum = 0;
-
- int windowStart = ref.getWindow().getStart();
- int windowStop = ref.getWindow().getStop();
- byte[] refBases = ref.getBases();
- byte[] readBases = p.getRead().getReadBases();
- byte[] readQualities = p.getRead().getBaseQualities();
- Cigar c = p.getRead().getCigar();
-
- int readIndex = 0;
- int currentPos = p.getRead().getAlignmentStart();
- int refIndex = Math.max(0, currentPos - windowStart);
-
- for (int i = 0 ; i < c.numCigarElements() ; i++) {
- CigarElement ce = c.getCigarElement(i);
- int cigarElementLength = ce.getLength();
- switch ( ce.getOperator() ) {
- case M:
- for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) {
- // are we past the ref window?
- if ( currentPos > windowStop )
- break;
-
- // are we before the ref window?
- if ( currentPos < windowStart )
- continue;
-
- byte refChr = refBases[refIndex++];
-
- // do we need to skip the target site?
- if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
- continue;
-
- byte readChr = readBases[readIndex];
- if ( readChr != refChr )
- sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1;
- }
- break;
- case I:
- case S:
- readIndex += cigarElementLength;
- break;
- case D:
- case N:
- currentPos += cigarElementLength;
- if ( currentPos > windowStart )
- refIndex += Math.min(cigarElementLength, currentPos - windowStart);
- break;
- case H:
- case P:
- break;
- }
- }
-
- return sum;
- }
-
- /** Returns the mismatches in the read within the given reference context.
- *
- * @param read the SAMRecord
- * @param ref the reference context
- * @return each base is represented by a bit in the BitSet. True for mismatch, false for ref
- */
- public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref) {
- // first determine the positions with mismatches
- int readLength = read.getReadLength();
- BitSet mismatches = new BitSet(readLength);
-
- // it's possible we aren't starting at the beginning of a read,
- // and we don't need to look at any of the previous context outside our window
- // (although we do need future context)
- int readStartPos = read.getAlignmentStart();
- int currentReadPos = read.getAlignmentStart();
-
- byte[] refBases = ref.getBases();
- int refIndex = readStartPos - ref.getWindow().getStart();
- if ( refIndex < 0 ) {
- throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus());
- }
-
- byte[] readBases = read.getReadBases();
- int readIndex = 0;
-
- Cigar c = read.getCigar();
-
- for (int i = 0 ; i < c.numCigarElements() ; i++) {
- CigarElement ce = c.getCigarElement(i);
- int cigarElementLength = ce.getLength();
- switch ( ce.getOperator() ) {
- case M:
- for (int j = 0; j < cigarElementLength; j++, readIndex++) {
- // skip over unwanted bases
- if ( currentReadPos++ < readStartPos )
- continue;
-
- // this is possible if reads extend beyond the contig end
- if ( refIndex >= refBases.length )
- break;
-
- byte refChr = refBases[refIndex];
- byte readChr = readBases[readIndex];
- if ( readChr != refChr )
- mismatches.set(readIndex);
-
- refIndex++;
- }
- break;
- case I:
- case S:
- readIndex += cigarElementLength;
- break;
- case D:
- case N:
- if ( currentReadPos >= readStartPos )
- refIndex += cigarElementLength;
- currentReadPos += cigarElementLength;
- break;
- case H:
- case P:
- break;
- }
- }
- return mismatches;
- }
- /** Returns a BitSet showing what bases are good according to the criteria of maxMismatches in the window
- *
- * @param read the SAMRecord
- * @param ref the reference context
- * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
- * @param windowSize window size (on each side) to test
- * @return a bitset representing which bases are good
- */
- public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) {
- int readLength = read.getReadLength();
-
- // all bits are set to false by default
- BitSet result = new BitSet(readLength);
- BitSet mismatches = mismatchesInRefWindow(read, ref);
-
- int currentPos = 0, leftPos = 0, rightPos;
- int mismatchCount = 0;
-
- // calculate how many mismatches exist in the windows to the left/right
- for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) {
- if ( mismatches.get(rightPos) )
- mismatchCount++;
- }
- if ( mismatchCount <= maxMismatches )
- result.set(currentPos);
-
- // now, traverse over the read positions
- while ( currentPos < readLength ) {
- // add a new rightmost position
- if ( rightPos < readLength && mismatches.get(rightPos++) )
- mismatchCount++;
- // re-penalize the previous position
- if ( mismatches.get(currentPos++) )
- mismatchCount++;
- // don't penalize the current position
- if ( mismatches.get(currentPos) )
- mismatchCount--;
- // subtract the leftmost position
- if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) )
- mismatchCount--;
-
- if ( mismatchCount <= maxMismatches )
- result.set(currentPos);
- }
-
- return result;
- }
- /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
- * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
- * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
- * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed.
- * Formally, this method simply returns the number of M elements in the cigar.
- * @param r alignment
- * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
- */
- public static int getNumAlignmentBlocks(final SAMRecord r) {
- int n = 0;
- final Cigar cigar = r.getCigar();
- if (cigar == null) return 0;
-
- for (final CigarElement e : cigar.getCigarElements()) {
- if (e.getOperator() == CigarOperator.M ) n++;
- }
-
- return n;
- }
-
- public static int getNumAlignedBases(final SAMRecord r) {
- int n = 0;
- final Cigar cigar = r.getCigar();
- if (cigar == null) return 0;
-
- for (final CigarElement e : cigar.getCigarElements()) {
- if (e.getOperator() == CigarOperator.M ) { n += e.getLength(); }
- }
-
- return n;
- }
-
- public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) {
-
- final byte[] alignment = new byte[read.length];
- int refPos = 0;
- int alignPos = 0;
-
- for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
-
- final CigarElement ce = cigar.getCigarElement(iii);
- final int elementLength = ce.getLength();
-
- switch( ce.getOperator() ) {
- case I:
- case S:
- for ( int jjj = 0; jjj < elementLength; jjj++ ) {
- alignment[alignPos++] = '+';
- }
- break;
- case D:
- case N:
- refPos += elementLength;
- break;
- case M:
- for ( int jjj = 0; jjj < elementLength; jjj++ ) {
- alignment[alignPos] = ref[refPos];
- alignPos++;
- refPos++;
- }
- break;
- case H:
- case P:
- break;
- default:
- throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
- }
- }
- return alignment;
- }
-
- public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) {
-
- boolean atDeletion = false;
- if(pileupOffset == -1) {
- atDeletion = true;
- pileupOffset = refLocus - alignmentStart;
- final CigarElement ce = cigar.getCigarElement(0);
- if( ce.getOperator() == CigarOperator.S ) {
- pileupOffset += ce.getLength();
- }
- }
- int pos = 0;
- int alignmentPos = 0;
- for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
-
- final CigarElement ce = cigar.getCigarElement(iii);
- final int elementLength = ce.getLength();
-
- switch( ce.getOperator() ) {
- case I:
- case S:
- pos += elementLength;
- if( pos >= pileupOffset ) {
- return alignmentPos;
- }
- break;
- case D:
- case N:
- if(!atDeletion) {
- alignmentPos += elementLength;
- } else {
- if( pos + elementLength - 1 >= pileupOffset ) {
- return alignmentPos + (pileupOffset - pos);
- } else {
- pos += elementLength;
- alignmentPos += elementLength;
- }
- }
- break;
- case M:
- if( pos + elementLength - 1 >= pileupOffset ) {
- return alignmentPos + (pileupOffset - pos);
- } else {
- pos += elementLength;
- alignmentPos += elementLength;
- }
- break;
- case H:
- case P:
- break;
- default:
- throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
- }
- }
- return alignmentPos;
- }
-
- public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) {
-
- int alignmentLength = 0;
- for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
-
- final CigarElement ce = cigar.getCigarElement(iii);
- final int elementLength = ce.getLength();
-
- switch( ce.getOperator() ) {
- case I:
- case S:
- break;
- case D:
- case N:
- alignmentLength += elementLength;
- break;
- case M:
- alignmentLength += elementLength;
- break;
- case H:
- case P:
- break;
- default:
- throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
- }
- }
-
- final byte[] alignment = new byte[alignmentLength];
- int alignPos = 0;
- int readPos = 0;
- for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
-
- final CigarElement ce = cigar.getCigarElement(iii);
- final int elementLength = ce.getLength();
-
- switch( ce.getOperator() ) {
- case I:
- if( alignPos > 0 ) {
- if( alignment[alignPos-1] == BaseUtils.A ) { alignment[alignPos-1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; }
- else if( alignment[alignPos-1] == BaseUtils.C ) { alignment[alignPos-1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; }
- else if( alignment[alignPos-1] == BaseUtils.T ) { alignment[alignPos-1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; }
- else if( alignment[alignPos-1] == BaseUtils.G ) { alignment[alignPos-1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; }
- }
- case S:
- for ( int jjj = 0; jjj < elementLength; jjj++ ) {
- readPos++;
- }
- break;
- case D:
- case N:
- for ( int jjj = 0; jjj < elementLength; jjj++ ) {
- alignment[alignPos] = PileupElement.DELETION_BASE;
- alignPos++;
- }
- break;
- case M:
- for ( int jjj = 0; jjj < elementLength; jjj++ ) {
- alignment[alignPos] = read[readPos];
- alignPos++;
- readPos++;
- }
- break;
- case H:
- case P:
- break;
- default:
- throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
- }
- }
- return alignment;
- }
-
- /**
- * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
- * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
- * alignment reference index/start.
- * @param r record
- * @return true if read is unmapped
- */
- public static boolean isReadUnmapped(final SAMRecord r) {
- if ( r.getReadUnmappedFlag() ) return true;
-
- // our life would be so much easier if all sam files followed the specs. In reality,
- // sam files (including those generated by maq or bwa) miss headers altogether. When
- // reading such a SAM file, reference name is set, but since there is no sequence dictionary,
- // null is always returned for referenceIndex. Let's be paranoid here, and make sure that
- // we do not call the read "unmapped" when it has only reference name set with ref. index missing
- // or vice versa.
- if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
- || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) )
- && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
- return true;
- }
-
- /**
- * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format
- * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and
- * alignment reference index/start of the mate.
- * @param r sam record for the read
- * @return true if read's mate is unmapped
- */
- public static boolean isMateUnmapped(final SAMRecord r) {
- if ( r.getMateUnmappedFlag() ) return true;
-
- // our life would be so much easier if all sam files followed the specs. In reality,
- // sam files (including those generated by maq or bwa) miss headers altogether. When
- // reading such a SAM file, reference name is set, but since there is no sequence dictionary,
- // null is always returned for referenceIndex. Let's be paranoid here, and make sure that
- // we do not call the read "unmapped" when it has only reference name set with ref. index missing
- // or vice versa.
- if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
- || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) )
- && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
- return true;
- }
-
- /** Returns true is read is mapped and mapped uniquely (Q>0).
- *
- * @param read
- * @return
- */
- public static boolean isReadUniquelyMapped(SAMRecord read) {
- return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0;
- }
-
- /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
- * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
- * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
- * of read's base qualitites is inverted (in this case new array is allocated and returned).
- * @param read
- * @return
- */
- public static byte [] getQualsInCycleOrder(SAMRecord read) {
- if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities();
-
- return Utils.reverse(read.getBaseQualities());
- }
-
- /** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from
- * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
- * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
- * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities
- * are available this method will throw a runtime exception.
- * @param read
- * @return
- */
- public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) {
- if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities();
-
- return Utils.reverse(read.getOriginalBaseQualities());
- }
-
- /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq
- * starting at 0-based position refIndex on the refSeq and specified by its cigar.
- * The last argument readIndex specifies 0-based position on the read where the alignment described by the
- * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
- * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
- * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
- * always the positions where the cigar starts on the ref and on the read, respectively.
- *
- * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
- * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
- * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
- * is not repeated), the original cigar is returned.
- * @param cigar structure of the original alignment
- * @param refSeq reference sequence the read is aligned to
- * @param readSeq read sequence
- * @param refIndex 0-based alignment start position on ref
- * @param readIndex 0-based alignment start position on read
- * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
- */
- public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
-
- int indexOfIndel = -1;
- for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
- CigarElement ce = cigar.getCigarElement(i);
- if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
- // if there is more than 1 indel, don't left align
- if ( indexOfIndel != -1 )
- return cigar;
- indexOfIndel = i;
- }
- }
-
- // if there is no indel or if the alignment starts with an insertion (so that there
- // is no place on the read to move that insertion further left), we are done
- if ( indexOfIndel < 1 ) return cigar;
-
- final int indelLength = cigar.getCigarElement(indexOfIndel).getLength();
-
- byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
- if ( altString == null )
- return cigar;
-
- Cigar newCigar = cigar;
- for ( int i = 0; i < indelLength; i++ ) {
- newCigar = moveCigarLeft(newCigar, indexOfIndel);
- byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
-
- // check to make sure we haven't run off the end of the read
- boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar);
-
- if ( Arrays.equals(altString, newAltString) ) {
- cigar = newCigar;
- i = -1;
- if ( reachedEndOfRead )
- cigar = cleanUpCigar(cigar);
- }
-
- if ( reachedEndOfRead )
- break;
- }
-
- return cigar;
- }
-
- private static boolean cigarHasZeroSizeElement(Cigar c) {
- for ( CigarElement ce : c.getCigarElements() ) {
- if ( ce.getLength() == 0 )
- return true;
- }
- return false;
- }
-
- private static Cigar cleanUpCigar(Cigar c) {
- ArrayList elements = new ArrayList(c.numCigarElements()-1);
- for ( CigarElement ce : c.getCigarElements() ) {
- if ( ce.getLength() != 0 &&
- (elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) {
- elements.add(ce);
- }
- }
- return new Cigar(elements);
- }
-
- private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) {
- // get the first few elements
- ArrayList elements = new ArrayList(cigar.numCigarElements());
- for ( int i = 0; i < indexOfIndel - 1; i++)
- elements.add(cigar.getCigarElement(i));
-
- // get the indel element and move it left one base
- CigarElement ce = cigar.getCigarElement(indexOfIndel-1);
- elements.add(new CigarElement(ce.getLength()-1, ce.getOperator()));
- elements.add(cigar.getCigarElement(indexOfIndel));
- if ( indexOfIndel+1 < cigar.numCigarElements() ) {
- ce = cigar.getCigarElement(indexOfIndel+1);
- elements.add(new CigarElement(ce.getLength()+1, ce.getOperator()));
- } else {
- elements.add(new CigarElement(1, CigarOperator.M));
- }
-
- // get the last few elements
- for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++)
- elements.add(cigar.getCigarElement(i));
- return new Cigar(elements);
- }
-
- private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
- CigarElement indel = cigar.getCigarElement(indexOfIndel);
- int indelLength = indel.getLength();
-
- int totalRefBases = 0;
- for ( int i = 0; i < indexOfIndel; i++ ) {
- CigarElement ce = cigar.getCigarElement(i);
- int length = ce.getLength();
-
- switch( ce.getOperator() ) {
- case M:
- readIndex += length;
- refIndex += length;
- totalRefBases += length;
- break;
- case S:
- readIndex += length;
- break;
- case N:
- refIndex += length;
- totalRefBases += length;
- break;
- default:
- break;
- }
- }
-
- // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them
- if ( totalRefBases + indelLength > refSeq.length )
- indelLength -= (totalRefBases + indelLength - refSeq.length);
-
- // the indel-based reference string
- byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))];
-
- // add the bases before the indel, making sure it's not aligned off the end of the reference
- if ( refIndex > alt.length || refIndex > refSeq.length )
- return null;
- System.arraycopy(refSeq, 0, alt, 0, refIndex);
- int currentPos = refIndex;
-
- // take care of the indel
- if ( indel.getOperator() == CigarOperator.D ) {
- refIndex += indelLength;
- } else {
- System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength);
- currentPos += indelLength;
- }
-
- // add the bases after the indel, making sure it's not aligned off the end of the reference
- if ( refSeq.length - refIndex > alt.length - currentPos )
- return null;
- System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex);
-
- return alt;
- }
-}
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.sam;
+
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+
+
+public class AlignmentUtils {
+
+ public static class MismatchCount {
+ public int numMismatches = 0;
+ public long mismatchQualities = 0;
+ }
+
+ public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
+ return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
+ }
+
+ public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
+ return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
+ }
+
+ // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
+ // todo -- high performance implementation. We can do a lot better than this right now
+ public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
+ MismatchCount mc = new MismatchCount();
+
+ int readIdx = 0;
+ int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count
+ byte[] readSeq = r.getReadBases();
+ Cigar c = r.getCigar();
+ for (int i = 0 ; i < c.numCigarElements() ; i++) {
+
+ if ( readIdx > endOnRead ) break;
+
+ CigarElement ce = c.getCigarElement(i);
+ switch ( ce.getOperator() ) {
+ case M:
+ for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
+ if ( refIndex >= refSeq.length )
+ continue;
+ if ( readIdx < startOnRead ) continue;
+ if ( readIdx > endOnRead ) break;
+ byte refChr = refSeq[refIndex];
+ byte readChr = readSeq[readIdx];
+ // Note: we need to count X/N's as mismatches because that's what SAM requires
+ //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
+ // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
+ // continue; // do not count Ns/Xs/etc ?
+ if ( readChr != refChr ) {
+ mc.numMismatches++;
+ mc.mismatchQualities += r.getBaseQualities()[readIdx];
+ }
+ }
+ break;
+ case I:
+ case S:
+ readIdx += ce.getLength();
+ break;
+ case D:
+ case N:
+ refIndex += ce.getLength();
+ break;
+ case H:
+ case P:
+ break;
+ default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported");
+ }
+
+ }
+ return mc;
+ }
+
+ /** Returns the number of mismatches in the pileup within the given reference context.
+ *
+ * @param pileup the pileup with reads
+ * @param ref the reference context
+ * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
+ * @return the number of mismatches
+ */
+ public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
+ int mismatches = 0;
+ for ( PileupElement p : pileup )
+ mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite);
+ return mismatches;
+ }
+
+ /** Returns the number of mismatches in the pileup element within the given reference context.
+ *
+ * @param p the pileup element
+ * @param ref the reference context
+ * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
+ * @return the number of mismatches
+ */
+ public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
+ return mismatchesInRefWindow(p, ref, ignoreTargetSite, false);
+ }
+
+ /** Returns the number of mismatches in the pileup element within the given reference context.
+ *
+ * @param p the pileup element
+ * @param ref the reference context
+ * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
+ * @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count
+ * @return the number of mismatches
+ */
+ public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) {
+ int sum = 0;
+
+ int windowStart = ref.getWindow().getStart();
+ int windowStop = ref.getWindow().getStop();
+ byte[] refBases = ref.getBases();
+ byte[] readBases = p.getRead().getReadBases();
+ byte[] readQualities = p.getRead().getBaseQualities();
+ Cigar c = p.getRead().getCigar();
+
+ int readIndex = 0;
+ int currentPos = p.getRead().getAlignmentStart();
+ int refIndex = Math.max(0, currentPos - windowStart);
+
+ for (int i = 0 ; i < c.numCigarElements() ; i++) {
+ CigarElement ce = c.getCigarElement(i);
+ int cigarElementLength = ce.getLength();
+ switch ( ce.getOperator() ) {
+ case M:
+ for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) {
+ // are we past the ref window?
+ if ( currentPos > windowStop )
+ break;
+
+ // are we before the ref window?
+ if ( currentPos < windowStart )
+ continue;
+
+ byte refChr = refBases[refIndex++];
+
+ // do we need to skip the target site?
+ if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
+ continue;
+
+ byte readChr = readBases[readIndex];
+ if ( readChr != refChr )
+ sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1;
+ }
+ break;
+ case I:
+ case S:
+ readIndex += cigarElementLength;
+ break;
+ case D:
+ case N:
+ currentPos += cigarElementLength;
+ if ( currentPos > windowStart )
+ refIndex += Math.min(cigarElementLength, currentPos - windowStart);
+ break;
+ case H:
+ case P:
+ break;
+ }
+ }
+
+ return sum;
+ }
+
+ /** Returns the number of mismatches in the pileup element within the given reference context.
+ *
+ * @param read the SAMRecord
+ * @param ref the reference context
+ * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
+ * @param windowSize window size (on each side) to test
+ * @return a bitset representing which bases are good
+ */
+ public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) {
+ // first determine the positions with mismatches
+ int readLength = read.getReadLength();
+ BitSet mismatches = new BitSet(readLength);
+
+ // it's possible we aren't starting at the beginning of a read,
+ // and we don't need to look at any of the previous context outside our window
+ // (although we do need future context)
+ int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize);
+ int currentReadPos = read.getAlignmentStart();
+
+ byte[] refBases = ref.getBases();
+ int refIndex = readStartPos - ref.getWindow().getStart();
+ if ( refIndex < 0 ) {
+ throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus());
+ }
+
+ byte[] readBases = read.getReadBases();
+ int readIndex = 0;
+
+ Cigar c = read.getCigar();
+
+ for (int i = 0 ; i < c.numCigarElements() ; i++) {
+ CigarElement ce = c.getCigarElement(i);
+ int cigarElementLength = ce.getLength();
+ switch ( ce.getOperator() ) {
+ case M:
+ for (int j = 0; j < cigarElementLength; j++, readIndex++) {
+ // skip over unwanted bases
+ if ( currentReadPos++ < readStartPos )
+ continue;
+
+ // this is possible if reads extend beyond the contig end
+ if ( refIndex >= refBases.length )
+ break;
+
+ byte refChr = refBases[refIndex];
+ byte readChr = readBases[readIndex];
+ if ( readChr != refChr )
+ mismatches.set(readIndex);
+
+ refIndex++;
+ }
+ break;
+ case I:
+ case S:
+ readIndex += cigarElementLength;
+ break;
+ case D:
+ case N:
+ if ( currentReadPos >= readStartPos )
+ refIndex += cigarElementLength;
+ currentReadPos += cigarElementLength;
+ break;
+ case H:
+ case P:
+ break;
+ }
+ }
+
+ // all bits are set to false by default
+ BitSet result = new BitSet(readLength);
+
+ int currentPos = 0, leftPos = 0, rightPos;
+ int mismatchCount = 0;
+
+ // calculate how many mismatches exist in the windows to the left/right
+ for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) {
+ if ( mismatches.get(rightPos) )
+ mismatchCount++;
+ }
+ if ( mismatchCount <= maxMismatches )
+ result.set(currentPos);
+
+ // now, traverse over the read positions
+ while ( currentPos < readLength ) {
+ // add a new rightmost position
+ if ( rightPos < readLength && mismatches.get(rightPos++) )
+ mismatchCount++;
+ // re-penalize the previous position
+ if ( mismatches.get(currentPos++) )
+ mismatchCount++;
+ // don't penalize the current position
+ if ( mismatches.get(currentPos) )
+ mismatchCount--;
+ // subtract the leftmost position
+ if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) )
+ mismatchCount--;
+
+ if ( mismatchCount <= maxMismatches )
+ result.set(currentPos);
+ }
+
+ return result;
+ }
+ /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
+ * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
+ * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
+ * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed.
+ * Formally, this method simply returns the number of M elements in the cigar.
+ * @param r alignment
+ * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
+ */
+ public static int getNumAlignmentBlocks(final SAMRecord r) {
+ int n = 0;
+ final Cigar cigar = r.getCigar();
+ if (cigar == null) return 0;
+
+ for (final CigarElement e : cigar.getCigarElements()) {
+ if (e.getOperator() == CigarOperator.M ) n++;
+ }
+
+ return n;
+ }
+
+ public static int getNumAlignedBases(final SAMRecord r) {
+ int n = 0;
+ final Cigar cigar = r.getCigar();
+ if (cigar == null) return 0;
+
+ for (final CigarElement e : cigar.getCigarElements()) {
+ if (e.getOperator() == CigarOperator.M ) { n += e.getLength(); }
+ }
+
+ return n;
+ }
+
+ public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) {
+
+ final byte[] alignment = new byte[read.length];
+ int refPos = 0;
+ int alignPos = 0;
+
+ for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
+
+ final CigarElement ce = cigar.getCigarElement(iii);
+ final int elementLength = ce.getLength();
+
+ switch( ce.getOperator() ) {
+ case I:
+ case S:
+ for ( int jjj = 0; jjj < elementLength; jjj++ ) {
+ alignment[alignPos++] = '+';
+ }
+ break;
+ case D:
+ case N:
+ refPos += elementLength;
+ break;
+ case M:
+ for ( int jjj = 0; jjj < elementLength; jjj++ ) {
+ alignment[alignPos] = ref[refPos];
+ alignPos++;
+ refPos++;
+ }
+ break;
+ case H:
+ case P:
+ break;
+ default:
+ throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
+ }
+ }
+ return alignment;
+ }
+
+ public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) {
+
+ boolean atDeletion = false;
+ if(pileupOffset == -1) {
+ atDeletion = true;
+ pileupOffset = refLocus - alignmentStart;
+ final CigarElement ce = cigar.getCigarElement(0);
+ if( ce.getOperator() == CigarOperator.S ) {
+ pileupOffset += ce.getLength();
+ }
+ }
+ int pos = 0;
+ int alignmentPos = 0;
+ for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
+
+ final CigarElement ce = cigar.getCigarElement(iii);
+ final int elementLength = ce.getLength();
+
+ switch( ce.getOperator() ) {
+ case I:
+ case S:
+ pos += elementLength;
+ if( pos >= pileupOffset ) {
+ return alignmentPos;
+ }
+ break;
+ case D:
+ case N:
+ if(!atDeletion) {
+ alignmentPos += elementLength;
+ } else {
+ if( pos + elementLength - 1 >= pileupOffset ) {
+ return alignmentPos + (pileupOffset - pos);
+ } else {
+ pos += elementLength;
+ alignmentPos += elementLength;
+ }
+ }
+ break;
+ case M:
+ if( pos + elementLength - 1 >= pileupOffset ) {
+ return alignmentPos + (pileupOffset - pos);
+ } else {
+ pos += elementLength;
+ alignmentPos += elementLength;
+ }
+ break;
+ case H:
+ case P:
+ break;
+ default:
+ throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
+ }
+ }
+ return alignmentPos;
+ }
+
+ public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) {
+
+ int alignmentLength = 0;
+ for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
+
+ final CigarElement ce = cigar.getCigarElement(iii);
+ final int elementLength = ce.getLength();
+
+ switch( ce.getOperator() ) {
+ case I:
+ case S:
+ break;
+ case D:
+ case N:
+ alignmentLength += elementLength;
+ break;
+ case M:
+ alignmentLength += elementLength;
+ break;
+ case H:
+ case P:
+ break;
+ default:
+ throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
+ }
+ }
+
+ final byte[] alignment = new byte[alignmentLength];
+ int alignPos = 0;
+ int readPos = 0;
+ for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
+
+ final CigarElement ce = cigar.getCigarElement(iii);
+ final int elementLength = ce.getLength();
+
+ switch( ce.getOperator() ) {
+ case I:
+ if( alignPos > 0 ) {
+ if( alignment[alignPos-1] == BaseUtils.A ) { alignment[alignPos-1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; }
+ else if( alignment[alignPos-1] == BaseUtils.C ) { alignment[alignPos-1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; }
+ else if( alignment[alignPos-1] == BaseUtils.T ) { alignment[alignPos-1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; }
+ else if( alignment[alignPos-1] == BaseUtils.G ) { alignment[alignPos-1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; }
+ }
+ case S:
+ for ( int jjj = 0; jjj < elementLength; jjj++ ) {
+ readPos++;
+ }
+ break;
+ case D:
+ case N:
+ for ( int jjj = 0; jjj < elementLength; jjj++ ) {
+ alignment[alignPos] = PileupElement.DELETION_BASE;
+ alignPos++;
+ }
+ break;
+ case M:
+ for ( int jjj = 0; jjj < elementLength; jjj++ ) {
+ alignment[alignPos] = read[readPos];
+ alignPos++;
+ readPos++;
+ }
+ break;
+ case H:
+ case P:
+ break;
+ default:
+ throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
+ }
+ }
+ return alignment;
+ }
+
+ /**
+ * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
+ * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
+ * alignment reference index/start.
+ * @param r record
+ * @return true if read is unmapped
+ */
+ public static boolean isReadUnmapped(final SAMRecord r) {
+ if ( r.getReadUnmappedFlag() ) return true;
+
+ // our life would be so much easier if all sam files followed the specs. In reality,
+ // sam files (including those generated by maq or bwa) miss headers altogether. When
+ // reading such a SAM file, reference name is set, but since there is no sequence dictionary,
+ // null is always returned for referenceIndex. Let's be paranoid here, and make sure that
+ // we do not call the read "unmapped" when it has only reference name set with ref. index missing
+ // or vice versa.
+ if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
+ || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) )
+ && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
+ return true;
+ }
+
+ /**
+ * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format
+ * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and
+ * alignment reference index/start of the mate.
+ * @param r sam record for the read
+ * @return true if read's mate is unmapped
+ */
+ public static boolean isMateUnmapped(final SAMRecord r) {
+ if ( r.getMateUnmappedFlag() ) return true;
+
+ // our life would be so much easier if all sam files followed the specs. In reality,
+ // sam files (including those generated by maq or bwa) miss headers altogether. When
+ // reading such a SAM file, reference name is set, but since there is no sequence dictionary,
+ // null is always returned for referenceIndex. Let's be paranoid here, and make sure that
+ // we do not call the read "unmapped" when it has only reference name set with ref. index missing
+ // or vice versa.
+ if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
+ || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) )
+ && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
+ return true;
+ }
+
+ /** Returns true is read is mapped and mapped uniquely (Q>0).
+ *
+ * @param read
+ * @return
+ */
+ public static boolean isReadUniquelyMapped(SAMRecord read) {
+ return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0;
+ }
+
+ /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
+ * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
+ * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
+ * of read's base qualitites is inverted (in this case new array is allocated and returned).
+ * @param read
+ * @return
+ */
+ public static byte [] getQualsInCycleOrder(SAMRecord read) {
+ if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities();
+
+ return Utils.reverse(read.getBaseQualities());
+ }
+
+ /** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from
+ * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
+ * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
+ * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities
+ * are available this method will throw a runtime exception.
+ * @param read
+ * @return
+ */
+ public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) {
+ if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities();
+
+ return Utils.reverse(read.getOriginalBaseQualities());
+ }
+
+ /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq
+ * starting at 0-based position refIndex on the refSeq and specified by its cigar.
+ * The last argument readIndex specifies 0-based position on the read where the alignment described by the
+ * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
+ * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
+ * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
+ * always the positions where the cigar starts on the ref and on the read, respectively.
+ *
+ * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
+ * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
+ * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
+ * is not repeated), the original cigar is returned.
+ * @param cigar structure of the original alignment
+ * @param refSeq reference sequence the read is aligned to
+ * @param readSeq read sequence
+ * @param refIndex 0-based alignment start position on ref
+ * @param readIndex 0-based alignment start position on read
+ * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
+ */
+ public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
+
+ int indexOfIndel = -1;
+ for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
+ CigarElement ce = cigar.getCigarElement(i);
+ if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
+ // if there is more than 1 indel, don't left align
+ if ( indexOfIndel != -1 )
+ return cigar;
+ indexOfIndel = i;
+ }
+ }
+
+ // if there is no indel or if the alignment starts with an insertion (so that there
+ // is no place on the read to move that insertion further left), we are done
+ if ( indexOfIndel < 1 ) return cigar;
+
+ final int indelLength = cigar.getCigarElement(indexOfIndel).getLength();
+
+ byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
+ if ( altString == null )
+ return cigar;
+
+ Cigar newCigar = cigar;
+ for ( int i = 0; i < indelLength; i++ ) {
+ newCigar = moveCigarLeft(newCigar, indexOfIndel);
+ byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
+
+ // check to make sure we haven't run off the end of the read
+ boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar);
+
+ if ( Arrays.equals(altString, newAltString) ) {
+ cigar = newCigar;
+ i = -1;
+ if ( reachedEndOfRead )
+ cigar = cleanUpCigar(cigar);
+ }
+
+ if ( reachedEndOfRead )
+ break;
+ }
+
+ return cigar;
+ }
+
+ private static boolean cigarHasZeroSizeElement(Cigar c) {
+ for ( CigarElement ce : c.getCigarElements() ) {
+ if ( ce.getLength() == 0 )
+ return true;
+ }
+ return false;
+ }
+
+ private static Cigar cleanUpCigar(Cigar c) {
+ ArrayList elements = new ArrayList(c.numCigarElements()-1);
+ for ( CigarElement ce : c.getCigarElements() ) {
+ if ( ce.getLength() != 0 &&
+ (elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) {
+ elements.add(ce);
+ }
+ }
+ return new Cigar(elements);
+ }
+
+ private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) {
+ // get the first few elements
+ ArrayList elements = new ArrayList(cigar.numCigarElements());
+ for ( int i = 0; i < indexOfIndel - 1; i++)
+ elements.add(cigar.getCigarElement(i));
+
+ // get the indel element and move it left one base
+ CigarElement ce = cigar.getCigarElement(indexOfIndel-1);
+ elements.add(new CigarElement(ce.getLength()-1, ce.getOperator()));
+ elements.add(cigar.getCigarElement(indexOfIndel));
+ if ( indexOfIndel+1 < cigar.numCigarElements() ) {
+ ce = cigar.getCigarElement(indexOfIndel+1);
+ elements.add(new CigarElement(ce.getLength()+1, ce.getOperator()));
+ } else {
+ elements.add(new CigarElement(1, CigarOperator.M));
+ }
+
+ // get the last few elements
+ for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++)
+ elements.add(cigar.getCigarElement(i));
+ return new Cigar(elements);
+ }
+
+ private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
+ CigarElement indel = cigar.getCigarElement(indexOfIndel);
+ int indelLength = indel.getLength();
+
+ int totalRefBases = 0;
+ for ( int i = 0; i < indexOfIndel; i++ ) {
+ CigarElement ce = cigar.getCigarElement(i);
+ int length = ce.getLength();
+
+ switch( ce.getOperator() ) {
+ case M:
+ readIndex += length;
+ refIndex += length;
+ totalRefBases += length;
+ break;
+ case S:
+ readIndex += length;
+ break;
+ case N:
+ refIndex += length;
+ totalRefBases += length;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them
+ if ( totalRefBases + indelLength > refSeq.length )
+ indelLength -= (totalRefBases + indelLength - refSeq.length);
+
+ // the indel-based reference string
+ byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))];
+
+ // add the bases before the indel, making sure it's not aligned off the end of the reference
+ if ( refIndex > alt.length || refIndex > refSeq.length )
+ return null;
+ System.arraycopy(refSeq, 0, alt, 0, refIndex);
+ int currentPos = refIndex;
+
+ // take care of the indel
+ if ( indel.getOperator() == CigarOperator.D ) {
+ refIndex += indelLength;
+ } else {
+ System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength);
+ currentPos += indelLength;
+ }
+
+ // add the bases after the indel, making sure it's not aligned off the end of the reference
+ if ( refSeq.length - refIndex > alt.length - currentPos )
+ return null;
+ System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex);
+
+ return alt;
+ }
+}