/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.Utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
public class AlignmentUtils {
public static class MismatchCount {
public int numMismatches = 0;
public long mismatchQualities = 0;
}
/** Returns number of mismatches in the alignment r to the reference sequence
* refSeq assuming the alignment starts at (ZERO-based) position refIndex on the
* specified reference sequence; in other words, refIndex is used in place of alignment's own
* getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment r
* (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected.
*
* THIS CODE ASSUMES THAT ALL BYTES COME FROM UPPERCASED CHARS.
*
* @param r alignment
* @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of
* the reference string, IndexOutOfBound exception will be thrown at runtime).
* @param refIndex zero-based position, at which the alignment starts on the specified reference string.
* @return the number of mismatches
*/
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
return getMismatchCount(r, refSeq, refIndex).numMismatches;
}
/** Same as #numMismatches(SAMRecord, byte[], refIndex), but counts mismatches only along the partial stretch
* on the read of length nReadBases starting at (0-based) position readIndex.
* @param r Aligned read to count mismatches for
* @param refSeq Chunk of reference sequence that subsumes the alignment
* @param refIndex Zero-based position on refSeq where the alighnment for the whole read starts
* @param readIndex Zero-based position on the read, the mismatches will be counted only from this position on
* @param nReadBases Length of continuous stretch on the read, along which mismatches will be counted
* @return
*/
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) {
if ( r.getReadUnmappedFlag() ) return 1000000;
return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).numMismatches;
}
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) {
return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).mismatchQualities;
}
@Deprecated
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
if ( r.getReadUnmappedFlag() ) return 1000000;
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
}
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
}
@Deprecated
public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) {
if ( r.getReadUnmappedFlag() ) return 1000000;
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
}
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
}
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
// todo -- high performance implementation. We can do a lot better than this right now
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
MismatchCount mc = new MismatchCount();
int readIdx = 0;
int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count
byte[] readSeq = r.getReadBases();
Cigar c = r.getCigar();
for (int i = 0 ; i < c.numCigarElements() ; i++) {
if ( readIdx > endOnRead ) break;
CigarElement ce = c.getCigarElement(i);
switch ( ce.getOperator() ) {
case M:
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
if ( refIndex >= refSeq.length )
continue;
if ( readIdx < startOnRead ) continue;
if ( readIdx > endOnRead ) break;
byte refChr = refSeq[refIndex];
byte readChr = readSeq[readIdx];
// Note: we need to count X/N's as mismatches because that's what SAM requires
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
// continue; // do not count Ns/Xs/etc ?
if ( readChr != refChr ) {
mc.numMismatches++;
mc.mismatchQualities += r.getBaseQualities()[readIdx];
}
}
break;
case I:
case S:
readIdx += ce.getLength();
break;
case D:
case N:
refIndex += ce.getLength();
break;
case H:
case P:
break;
default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported");
}
}
return mc;
}
/** Returns the number of mismatches in the pileup within the given reference context.
*
* @param pileup the pileup with reads
* @param ref the reference context
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
* @return the number of mismatches
*/
public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
int mismatches = 0;
for ( PileupElement p : pileup )
mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite);
return mismatches;
}
/** Returns the number of mismatches in the pileup element within the given reference context.
*
* @param p the pileup element
* @param ref the reference context
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
* @return the number of mismatches
*/
public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
return mismatchesInRefWindow(p, ref, ignoreTargetSite, false);
}
/** Returns the number of mismatches in the pileup element within the given reference context.
*
* @param p the pileup element
* @param ref the reference context
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
* @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count
* @return the number of mismatches
*/
public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) {
int sum = 0;
int windowStart = (int)ref.getWindow().getStart();
int windowStop = (int)ref.getWindow().getStop();
byte[] refBases = ref.getBases();
byte[] readBases = p.getRead().getReadBases();
byte[] readQualities = p.getRead().getBaseQualities();
Cigar c = p.getRead().getCigar();
int readIndex = 0;
int currentPos = p.getRead().getAlignmentStart();
int refIndex = Math.max(0, currentPos - windowStart);
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
int cigarElementLength = ce.getLength();
switch ( ce.getOperator() ) {
case M:
for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) {
// are we past the ref window?
if ( currentPos > windowStop )
break;
// are we before the ref window?
if ( currentPos < windowStart )
continue;
byte refChr = refBases[refIndex++];
// do we need to skip the target site?
if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
continue;
byte readChr = readBases[readIndex];
if ( readChr != refChr )
sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1;
}
break;
case I:
case S:
readIndex += cigarElementLength;
break;
case D:
case N:
currentPos += cigarElementLength;
if ( currentPos > windowStart )
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
break;
case H:
case P:
break;
}
}
return sum;
}
/** Returns the number of mismatches in the pileup element within the given reference context.
*
* @param read the SAMRecord
* @param ref the reference context
* @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
* @param windowSize window size (on each side) to test
* @return a bitset representing which bases are good
*/
public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) {
// first determine the positions with mismatches
int readLength = read.getReadLength();
BitSet mismatches = new BitSet(readLength);
// it's possible we aren't starting at the beginning of a read,
// and we don't need to look at any of the previous context outside our window
// (although we do need future context)
int readStartPos = Math.max(read.getAlignmentStart(), (int)ref.getLocus().getStart() - windowSize);
int currentReadPos = read.getAlignmentStart();
byte[] refBases = ref.getBases();
int refIndex = readStartPos - (int)ref.getWindow().getStart();
if ( refIndex < 0 ) {
throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus());
}
byte[] readBases = read.getReadBases();
int readIndex = 0;
Cigar c = read.getCigar();
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
int cigarElementLength = ce.getLength();
switch ( ce.getOperator() ) {
case M:
for (int j = 0; j < cigarElementLength; j++, readIndex++) {
// skip over unwanted bases
if ( currentReadPos++ < readStartPos )
continue;
// this is possible if reads extend beyond the contig end
if ( refIndex >= refBases.length )
break;
byte refChr = refBases[refIndex];
byte readChr = readBases[readIndex];
if ( readChr != refChr )
mismatches.set(readIndex);
refIndex++;
}
break;
case I:
case S:
readIndex += cigarElementLength;
break;
case D:
case N:
if ( currentReadPos >= readStartPos )
refIndex += cigarElementLength;
currentReadPos += cigarElementLength;
break;
case H:
case P:
break;
}
}
// all bits are set to false by default
BitSet result = new BitSet(readLength);
int currentPos = 0, leftPos = 0, rightPos;
int mismatchCount = 0;
// calculate how many mismatches exist in the windows to the left/right
for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) {
if ( mismatches.get(rightPos) )
mismatchCount++;
}
if ( mismatchCount <= maxMismatches )
result.set(currentPos);
// now, traverse over the read positions
while ( currentPos < readLength ) {
// add a new rightmost position
if ( rightPos < readLength && mismatches.get(rightPos++) )
mismatchCount++;
// re-penalize the previous position
if ( mismatches.get(currentPos++) )
mismatchCount++;
// don't penalize the current position
if ( mismatches.get(currentPos) )
mismatchCount--;
// subtract the leftmost position
if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) )
mismatchCount--;
if ( mismatchCount <= maxMismatches )
result.set(currentPos);
}
return result;
}
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
* a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed.
* Formally, this method simply returns the number of M elements in the cigar.
* @param r alignment
* @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
*/
public static int getNumAlignmentBlocks(final SAMRecord r) {
int n = 0;
final Cigar cigar = r.getCigar();
if (cigar == null) return 0;
for (final CigarElement e : cigar.getCigarElements()) {
if (e.getOperator() == CigarOperator.M ) n++;
}
return n;
}
@Deprecated
public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) {
final char[] alignment = new char[read.length];
int refPos = 0;
int alignPos = 0;
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
alignment[alignPos++] = '+';
}
break;
case D:
case N:
refPos++;
break;
case M:
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
alignment[alignPos] = ref[refPos];
alignPos++;
refPos++;
}
break;
case H:
case P:
break;
default:
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
}
}
return alignment;
}
public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) {
final byte[] alignment = new byte[read.length];
int refPos = 0;
int alignPos = 0;
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignment[alignPos++] = '+';
}
break;
case D:
case N:
refPos++;
break;
case M:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignment[alignPos] = ref[refPos];
alignPos++;
refPos++;
}
break;
case H:
case P:
break;
default:
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
}
}
return alignment;
}
public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) {
boolean atDeletion = false;
if(pileupOffset == -1) {
atDeletion = true;
pileupOffset = refLocus - alignmentStart;
final CigarElement ce = cigar.getCigarElement(0);
if( ce.getOperator() == CigarOperator.S ) {
pileupOffset += ce.getLength();
}
}
int pos = 0;
int alignmentPos = 0;
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
if( pos == pileupOffset ) {
return alignmentPos;
}
pos++;
}
break;
case D:
case N:
if(!atDeletion) {
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignmentPos++;
}
} else {
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
if( pos == pileupOffset ) {
return alignmentPos;
}
pos++;
alignmentPos++;
}
}
break;
case M:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
if( pos == pileupOffset ) {
return alignmentPos;
}
pos++;
alignmentPos++;
}
break;
case H:
case P:
break;
default:
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
}
}
return alignmentPos;
}
public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) {
int alignmentLength = 0;
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
case S:
break;
case D:
case N:
alignmentLength += elementLength;
break;
case M:
alignmentLength += elementLength;
break;
case H:
case P:
break;
default:
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
}
}
final byte[] alignment = new byte[alignmentLength];
int alignPos = 0;
int readPos = 0;
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
readPos++;
}
break;
case D:
case N:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignment[alignPos] = PileupElement.DELETION_BASE;
alignPos++;
}
break;
case M:
for ( int jjj = 0; jjj < elementLength; jjj++ ) {
alignment[alignPos] = read[readPos];
alignPos++;
readPos++;
}
break;
case H:
case P:
break;
default:
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
}
}
return alignment;
}
/**
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
* alignment reference index/start.
* @param r record
* @return true if read is unmapped
*/
public static boolean isReadUnmapped(final SAMRecord r) {
if ( r.getReadUnmappedFlag() ) return true;
// our life would be so much easier if all sam files followed the specs. In reality,
// sam files (including those generated by maq or bwa) miss headers alltogether. When
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
// or vice versa.
if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
|| r.getReferenceName() != null && r.getReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME )
&& r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
return true;
}
/**
* Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format
* specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and
* alignment reference index/start of the mate.
* @param r sam record for the read
* @return true if read's mate is unmapped
*/
public static boolean isMateUnmapped(final SAMRecord r) {
if ( r.getMateUnmappedFlag() ) return true;
// our life would be so much easier if all sam files followed the specs. In reality,
// sam files (including those generated by maq or bwa) miss headers alltogether. When
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
// or vice versa.
if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
|| r.getMateReferenceName() != null && r.getMateReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME )
&& r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
return true;
}
/** Returns true is read is mapped and mapped uniquely (Q>0).
*
* @param read
* @return
*/
public static boolean isReadUniquelyMapped(SAMRecord read) {
return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0;
}
/** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
* of read's base qualitites is inverted (in this case new array is allocated and returned).
* @param read
* @return
*/
public static byte [] getQualsInCycleOrder(SAMRecord read) {
if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities();
return Utils.reverse(read.getBaseQualities());
}
/** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
* of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities
* are available this method will throw a runtime exception.
* @param read
* @return
*/
public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) {
if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities();
return Utils.reverse(read.getOriginalBaseQualities());
}
/** Takes the alignment of the read sequence readSeq to the reference sequence refSeq
* starting at 0-based position refIndex on the refSeq and specified by its cigar.
* The last argument readIndex specifies 0-based position on the read where the alignment described by the
* cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
* Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
* should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
* always the positions where the cigar starts on the ref and on the read, respectively.
*
* If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
* specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
* as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
* is not repeated), the original cigar is returned.
* @param cigar structure of the original alignment
* @param refSeq reference sequence the read is aligned to
* @param readSeq read sequence
* @param refIndex 0-based alignment start position on ref
* @param readIndex 0-based alignment start position on read
* @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
*/
public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
int indexOfIndel = -1;
for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
CigarElement ce = cigar.getCigarElement(i);
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
// if there is more than 1 indel, don't left align
if ( indexOfIndel != -1 )
return cigar;
indexOfIndel = i;
}
}
// if there is no indel or if the alignment starts with an insertion (so that there
// is no place on the read to move that insertion further left), we are done
if ( indexOfIndel < 1 ) return cigar;
final int indelLength = cigar.getCigarElement(indexOfIndel).getLength();
byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
if ( altString == null )
return cigar;
Cigar newCigar = cigar;
for ( int i = 0; i < indelLength; i++ ) {
newCigar = moveCigarLeft(newCigar, indexOfIndel);
byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
// check to make sure we haven't run off the end of the read
boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar);
if ( Arrays.equals(altString, newAltString) ) {
cigar = newCigar;
i = -1;
if ( reachedEndOfRead )
cigar = cleanUpCigar(cigar);
}
if ( reachedEndOfRead )
break;
}
return cigar;
}
private static boolean cigarHasZeroSizeElement(Cigar c) {
for ( CigarElement ce : c.getCigarElements() ) {
if ( ce.getLength() == 0 )
return true;
}
return false;
}
private static Cigar cleanUpCigar(Cigar c) {
ArrayList elements = new ArrayList(c.numCigarElements()-1);
for ( CigarElement ce : c.getCigarElements() ) {
if ( ce.getLength() != 0 &&
(elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) {
elements.add(ce);
}
}
return new Cigar(elements);
}
private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) {
// get the first few elements
ArrayList elements = new ArrayList(cigar.numCigarElements());
for ( int i = 0; i < indexOfIndel - 1; i++)
elements.add(cigar.getCigarElement(i));
// get the indel element and move it left one base
CigarElement ce = cigar.getCigarElement(indexOfIndel-1);
elements.add(new CigarElement(ce.getLength()-1, ce.getOperator()));
elements.add(cigar.getCigarElement(indexOfIndel));
if ( indexOfIndel+1 < cigar.numCigarElements() ) {
ce = cigar.getCigarElement(indexOfIndel+1);
elements.add(new CigarElement(ce.getLength()+1, ce.getOperator()));
} else {
elements.add(new CigarElement(1, CigarOperator.M));
}
// get the last few elements
for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++)
elements.add(cigar.getCigarElement(i));
return new Cigar(elements);
}
private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
CigarElement indel = cigar.getCigarElement(indexOfIndel);
int indelLength = indel.getLength();
int totalRefBases = 0;
for ( int i = 0; i < indexOfIndel; i++ ) {
CigarElement ce = cigar.getCigarElement(i);
int length = ce.getLength();
switch( ce.getOperator() ) {
case M:
readIndex += length;
refIndex += length;
totalRefBases += length;
break;
case S:
readIndex += length;
break;
case N:
refIndex += length;
totalRefBases += length;
break;
default:
break;
}
}
// sometimes, when there are very large known indels, we won't have enough reference sequence to cover them
if ( totalRefBases + indelLength > refSeq.length )
indelLength -= (totalRefBases + indelLength - refSeq.length);
// the indel-based reference string
byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))];
// add the bases before the indel, making sure it's not aligned off the end of the reference
if ( refIndex > alt.length )
return null;
System.arraycopy(refSeq, 0, alt, 0, refIndex);
int currentPos = refIndex;
// take care of the indel
if ( indel.getOperator() == CigarOperator.D ) {
refIndex += indelLength;
} else {
System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength);
currentPos += indelLength;
}
// add the bases after the indel, making sure it's not aligned off the end of the reference
if ( refSeq.length - refIndex > alt.length - currentPos )
return null;
System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex);
return alt;
}
}