2010-04-20 07:00:08 +08:00
/ *
* Copyright ( c ) 2010 The Broad Institute
2010-04-20 23:26:32 +08:00
*
2010-04-20 07:00:08 +08:00
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
2010-04-20 23:26:32 +08:00
* files ( the "Software" ) , to deal in the Software without
2010-04-20 07:00:08 +08:00
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
2010-04-20 23:26:32 +08:00
*
2010-04-20 07:00:08 +08:00
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
2010-04-20 23:26:32 +08:00
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
2010-04-20 07:00:08 +08:00
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE .
* /
package org.broadinstitute.sting.utils.sam ;
2009-03-25 13:48:10 +08:00
2009-06-07 03:35:21 +08:00
import net.sf.samtools.CigarOperator ;
2009-03-25 13:48:10 +08:00
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.Cigar ;
import net.sf.samtools.CigarElement ;
2010-01-28 05:36:42 +08:00
import net.sf.samtools.util.StringUtil ;
2009-12-13 10:52:18 +08:00
import org.broadinstitute.sting.gatk.contexts.ReferenceContext ;
2011-02-25 00:09:03 +08:00
import org.broadinstitute.sting.utils.BaseUtils ;
2010-09-12 23:07:38 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
2009-12-13 10:52:18 +08:00
import org.broadinstitute.sting.utils.pileup.* ;
2010-05-20 22:05:13 +08:00
import org.broadinstitute.sting.utils.Utils ;
2009-03-25 13:48:10 +08:00
2010-05-26 04:04:33 +08:00
import java.util.ArrayList ;
import java.util.Arrays ;
2010-10-11 10:19:51 +08:00
import java.util.BitSet ;
2010-05-26 04:04:33 +08:00
2009-03-25 13:48:10 +08:00
public class AlignmentUtils {
2010-10-08 23:13:01 +08:00
public static class MismatchCount {
2010-03-24 23:21:58 +08:00
public int numMismatches = 0 ;
public long mismatchQualities = 0 ;
2009-05-12 06:45:11 +08:00
}
2009-06-07 02:07:48 +08:00
/ * * Returns number of mismatches in the alignment < code > r < / code > to the reference sequence
* < code > refSeq < / code > assuming the alignment starts at ( ZERO - based ) position < code > refIndex < / code > on the
* specified reference sequence ; in other words , < code > refIndex < / code > is used in place of alignment ' s own
* getAlignmentStart ( ) coordinate and the latter is never used . However , the structure of the alignment < code > r < / code >
* ( i . e . it ' s cigar string with all the insertions / deletions it may specify ) is fully respected .
2010-01-28 05:36:42 +08:00
*
* THIS CODE ASSUMES THAT ALL BYTES COME FROM UPPERCASED CHARS .
2009-06-07 02:07:48 +08:00
*
* @param r alignment
* @param refSeq chunk of reference sequence that subsumes the alignment completely ( if alignment runs out of
* the reference string , IndexOutOfBound exception will be thrown at runtime ) .
* @param refIndex zero - based position , at which the alignment starts on the specified reference string .
2009-12-17 01:28:09 +08:00
* @return the number of mismatches
2009-06-07 02:07:48 +08:00
* /
2010-01-28 05:36:42 +08:00
public static int numMismatches ( SAMRecord r , byte [ ] refSeq , int refIndex ) {
2010-03-24 23:21:58 +08:00
return getMismatchCount ( r , refSeq , refIndex ) . numMismatches ;
}
2010-08-04 23:27:35 +08:00
/ * * Same as # numMismatches ( SAMRecord , byte [ ] , refIndex ) , but counts mismatches only along the partial stretch
* on the read of length < code > nReadBases < / code > starting at ( 0 - based ) position < code > readIndex < / code > .
* @param r Aligned read to count mismatches for
* @param refSeq Chunk of reference sequence that subsumes the alignment
* @param refIndex Zero - based position on < code > refSeq < / code > where the alighnment for the whole read starts
* @param readIndex Zero - based position on the read , the mismatches will be counted only from this position on
* @param nReadBases Length of continuous stretch on the read , along which mismatches will be counted
* @return
* /
public static int numMismatches ( SAMRecord r , byte [ ] refSeq , int refIndex , int readIndex , int nReadBases ) {
if ( r . getReadUnmappedFlag ( ) ) return 1000000 ;
return getMismatchCount ( r , refSeq , refIndex , readIndex , nReadBases ) . numMismatches ;
}
public static long mismatchingQualities ( SAMRecord r , byte [ ] refSeq , int refIndex , int readIndex , int nReadBases ) {
return getMismatchCount ( r , refSeq , refIndex , readIndex , nReadBases ) . mismatchQualities ;
}
2010-05-26 22:10:26 +08:00
@Deprecated
2010-03-24 23:21:58 +08:00
public static int numMismatches ( SAMRecord r , String refSeq , int refIndex ) {
if ( r . getReadUnmappedFlag ( ) ) return 1000000 ;
return numMismatches ( r , StringUtil . stringToBytes ( refSeq ) , refIndex ) ;
}
public static long mismatchingQualities ( SAMRecord r , byte [ ] refSeq , int refIndex ) {
return getMismatchCount ( r , refSeq , refIndex ) . mismatchQualities ;
}
2010-05-26 22:10:26 +08:00
@Deprecated
2010-03-24 23:21:58 +08:00
public static long mismatchingQualities ( SAMRecord r , String refSeq , int refIndex ) {
if ( r . getReadUnmappedFlag ( ) ) return 1000000 ;
return numMismatches ( r , StringUtil . stringToBytes ( refSeq ) , refIndex ) ;
}
2010-10-08 23:13:01 +08:00
public static MismatchCount getMismatchCount ( SAMRecord r , byte [ ] refSeq , int refIndex ) {
2010-08-04 23:27:35 +08:00
return getMismatchCount ( r , refSeq , refIndex , 0 , r . getReadLength ( ) ) ;
}
2010-05-26 22:10:26 +08:00
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
// todo -- high performance implementation. We can do a lot better than this right now
2010-10-08 23:13:01 +08:00
public static MismatchCount getMismatchCount ( SAMRecord r , byte [ ] refSeq , int refIndex , int startOnRead , int nReadBases ) {
2010-03-24 23:21:58 +08:00
MismatchCount mc = new MismatchCount ( ) ;
2009-06-07 02:07:48 +08:00
int readIdx = 0 ;
2010-08-04 23:27:35 +08:00
int endOnRead = startOnRead + nReadBases - 1 ; // index of the last base on read we want to count
2009-12-13 10:52:18 +08:00
byte [ ] readSeq = r . getReadBases ( ) ;
2009-03-25 13:48:10 +08:00
Cigar c = r . getCigar ( ) ;
2009-06-07 02:07:48 +08:00
for ( int i = 0 ; i < c . numCigarElements ( ) ; i + + ) {
2010-08-04 23:27:35 +08:00
if ( readIdx > endOnRead ) break ;
2009-06-07 02:07:48 +08:00
CigarElement ce = c . getCigarElement ( i ) ;
switch ( ce . getOperator ( ) ) {
2009-03-25 13:48:10 +08:00
case M :
2009-06-07 02:07:48 +08:00
for ( int j = 0 ; j < ce . getLength ( ) ; j + + , refIndex + + , readIdx + + ) {
2010-01-28 05:36:42 +08:00
if ( refIndex > = refSeq . length )
2009-06-30 00:50:05 +08:00
continue ;
2010-08-04 23:27:35 +08:00
if ( readIdx < startOnRead ) continue ;
if ( readIdx > endOnRead ) break ;
2010-01-28 05:36:42 +08:00
byte refChr = refSeq [ refIndex ] ;
byte readChr = readSeq [ readIdx ] ;
2009-07-01 00:08:55 +08:00
// Note: we need to count X/N's as mismatches because that's what SAM requires
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
// continue; // do not count Ns/Xs/etc ?
2010-03-24 23:21:58 +08:00
if ( readChr ! = refChr ) {
mc . numMismatches + + ;
mc . mismatchQualities + = r . getBaseQualities ( ) [ readIdx ] ;
}
2009-03-25 13:48:10 +08:00
}
break ;
2009-06-07 02:07:48 +08:00
case I :
2010-01-14 11:45:42 +08:00
case S :
2009-06-07 02:07:48 +08:00
readIdx + = ce . getLength ( ) ;
break ;
case D :
2010-01-14 11:45:42 +08:00
case N :
2009-06-07 02:07:48 +08:00
refIndex + = ce . getLength ( ) ;
break ;
2010-05-05 21:41:05 +08:00
case H :
case P :
break ;
2010-09-12 23:07:38 +08:00
default : throw new ReviewedStingException ( "The " + ce . getOperator ( ) + " cigar element is not currently supported" ) ;
2009-03-25 13:48:10 +08:00
}
}
2010-03-24 23:21:58 +08:00
return mc ;
2009-03-25 13:48:10 +08:00
}
2009-12-13 10:52:18 +08:00
/ * * Returns the number of mismatches in the pileup within the given reference context .
*
* @param pileup the pileup with reads
* @param ref the reference context
* @param ignoreTargetSite if true , ignore mismatches at the target locus ( i . e . the center of the window )
2009-12-17 01:28:09 +08:00
* @return the number of mismatches
2009-12-13 10:52:18 +08:00
* /
2009-12-17 01:28:09 +08:00
public static int mismatchesInRefWindow ( ReadBackedPileup pileup , ReferenceContext ref , boolean ignoreTargetSite ) {
int mismatches = 0 ;
for ( PileupElement p : pileup )
mismatches + = mismatchesInRefWindow ( p , ref , ignoreTargetSite ) ;
2009-12-13 10:52:18 +08:00
return mismatches ;
}
/ * * Returns the number of mismatches in the pileup element within the given reference context .
*
* @param p the pileup element
* @param ref the reference context
* @param ignoreTargetSite if true , ignore mismatches at the target locus ( i . e . the center of the window )
2009-12-17 01:28:09 +08:00
* @return the number of mismatches
2009-12-13 10:52:18 +08:00
* /
2009-12-17 01:28:09 +08:00
public static int mismatchesInRefWindow ( PileupElement p , ReferenceContext ref , boolean ignoreTargetSite ) {
2010-03-03 00:40:03 +08:00
return mismatchesInRefWindow ( p , ref , ignoreTargetSite , false ) ;
}
2009-12-13 10:52:18 +08:00
2010-03-03 00:40:03 +08:00
/ * * Returns the number of mismatches in the pileup element within the given reference context .
*
* @param p the pileup element
* @param ref the reference context
* @param ignoreTargetSite if true , ignore mismatches at the target locus ( i . e . the center of the window )
* @param qualitySumInsteadOfMismatchCount if true , return the quality score sum of the mismatches rather than the count
* @return the number of mismatches
* /
public static int mismatchesInRefWindow ( PileupElement p , ReferenceContext ref , boolean ignoreTargetSite , boolean qualitySumInsteadOfMismatchCount ) {
int sum = 0 ;
2009-12-13 10:52:18 +08:00
2009-12-17 05:39:58 +08:00
int windowStart = ( int ) ref . getWindow ( ) . getStart ( ) ;
int windowStop = ( int ) ref . getWindow ( ) . getStop ( ) ;
2010-05-20 07:27:55 +08:00
byte [ ] refBases = ref . getBases ( ) ;
2009-12-13 10:52:18 +08:00
byte [ ] readBases = p . getRead ( ) . getReadBases ( ) ;
2010-03-03 00:40:03 +08:00
byte [ ] readQualities = p . getRead ( ) . getBaseQualities ( ) ;
2009-12-13 10:52:18 +08:00
Cigar c = p . getRead ( ) . getCigar ( ) ;
int readIndex = 0 ;
int currentPos = p . getRead ( ) . getAlignmentStart ( ) ;
2009-12-17 05:39:58 +08:00
int refIndex = Math . max ( 0 , currentPos - windowStart ) ;
2009-12-13 10:52:18 +08:00
for ( int i = 0 ; i < c . numCigarElements ( ) ; i + + ) {
CigarElement ce = c . getCigarElement ( i ) ;
2009-12-17 05:39:58 +08:00
int cigarElementLength = ce . getLength ( ) ;
2009-12-13 10:52:18 +08:00
switch ( ce . getOperator ( ) ) {
case M :
2009-12-17 05:39:58 +08:00
for ( int j = 0 ; j < cigarElementLength ; j + + , readIndex + + , currentPos + + ) {
2009-12-13 10:52:18 +08:00
// are we past the ref window?
2009-12-17 05:39:58 +08:00
if ( currentPos > windowStop )
2009-12-13 10:52:18 +08:00
break ;
// are we before the ref window?
2009-12-17 05:39:58 +08:00
if ( currentPos < windowStart )
2009-12-13 10:52:18 +08:00
continue ;
2010-05-20 07:27:55 +08:00
byte refChr = refBases [ refIndex + + ] ;
2009-12-13 10:52:18 +08:00
// do we need to skip the target site?
if ( ignoreTargetSite & & ref . getLocus ( ) . getStart ( ) = = currentPos )
continue ;
2010-05-26 22:10:26 +08:00
byte readChr = readBases [ readIndex ] ;
if ( readChr ! = refChr )
2010-03-03 00:40:03 +08:00
sum + = ( qualitySumInsteadOfMismatchCount ) ? readQualities [ readIndex ] : 1 ;
2009-12-13 10:52:18 +08:00
}
break ;
case I :
2010-01-14 11:45:42 +08:00
case S :
2009-12-17 05:39:58 +08:00
readIndex + = cigarElementLength ;
2009-12-13 10:52:18 +08:00
break ;
case D :
2010-01-14 11:45:42 +08:00
case N :
2009-12-17 05:39:58 +08:00
currentPos + = cigarElementLength ;
if ( currentPos > windowStart )
refIndex + = Math . min ( cigarElementLength , currentPos - windowStart ) ;
2009-12-13 10:52:18 +08:00
break ;
2010-10-11 10:19:51 +08:00
case H :
case P :
break ;
2009-12-13 10:52:18 +08:00
}
}
2010-03-03 00:40:03 +08:00
return sum ;
2009-12-13 10:52:18 +08:00
}
2010-10-11 10:19:51 +08:00
/ * * Returns the number of mismatches in the pileup element within the given reference context .
*
* @param read the SAMRecord
* @param ref the reference context
* @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
* @param windowSize window size ( on each side ) to test
* @return a bitset representing which bases are good
* /
public static BitSet mismatchesInRefWindow ( SAMRecord read , ReferenceContext ref , int maxMismatches , int windowSize ) {
// first determine the positions with mismatches
int readLength = read . getReadLength ( ) ;
BitSet mismatches = new BitSet ( readLength ) ;
2010-10-14 13:04:28 +08:00
// it's possible we aren't starting at the beginning of a read,
// and we don't need to look at any of the previous context outside our window
// (although we do need future context)
int readStartPos = Math . max ( read . getAlignmentStart ( ) , ( int ) ref . getLocus ( ) . getStart ( ) - windowSize ) ;
int currentReadPos = read . getAlignmentStart ( ) ;
2010-10-11 10:19:51 +08:00
byte [ ] refBases = ref . getBases ( ) ;
2010-10-14 13:04:28 +08:00
int refIndex = readStartPos - ( int ) ref . getWindow ( ) . getStart ( ) ;
2010-10-11 10:19:51 +08:00
if ( refIndex < 0 ) {
2010-10-14 13:04:28 +08:00
throw new IllegalStateException ( "When calculating mismatches, we somehow don't have enough previous reference context for read " + read . getReadName ( ) + " at position " + ref . getLocus ( ) ) ;
2010-10-11 10:19:51 +08:00
}
byte [ ] readBases = read . getReadBases ( ) ;
int readIndex = 0 ;
Cigar c = read . getCigar ( ) ;
for ( int i = 0 ; i < c . numCigarElements ( ) ; i + + ) {
CigarElement ce = c . getCigarElement ( i ) ;
int cigarElementLength = ce . getLength ( ) ;
switch ( ce . getOperator ( ) ) {
case M :
2010-10-14 13:04:28 +08:00
for ( int j = 0 ; j < cigarElementLength ; j + + , readIndex + + ) {
// skip over unwanted bases
if ( currentReadPos + + < readStartPos )
continue ;
2010-10-22 22:38:26 +08:00
// this is possible if reads extend beyond the contig end
if ( refIndex > = refBases . length )
break ;
2010-10-14 13:04:28 +08:00
2010-10-11 10:19:51 +08:00
byte refChr = refBases [ refIndex ] ;
byte readChr = readBases [ readIndex ] ;
if ( readChr ! = refChr )
mismatches . set ( readIndex ) ;
2010-10-14 13:04:28 +08:00
refIndex + + ;
2010-10-11 10:19:51 +08:00
}
break ;
case I :
case S :
readIndex + = cigarElementLength ;
break ;
case D :
case N :
2010-10-14 13:04:28 +08:00
if ( currentReadPos > = readStartPos )
refIndex + = cigarElementLength ;
currentReadPos + = cigarElementLength ;
2010-10-11 10:19:51 +08:00
break ;
case H :
case P :
break ;
}
}
// all bits are set to false by default
BitSet result = new BitSet ( readLength ) ;
int currentPos = 0 , leftPos = 0 , rightPos ;
int mismatchCount = 0 ;
// calculate how many mismatches exist in the windows to the left/right
for ( rightPos = 1 ; rightPos < = windowSize & & rightPos < readLength ; rightPos + + ) {
if ( mismatches . get ( rightPos ) )
mismatchCount + + ;
}
if ( mismatchCount < = maxMismatches )
result . set ( currentPos ) ;
// now, traverse over the read positions
while ( currentPos < readLength ) {
// add a new rightmost position
if ( rightPos < readLength & & mismatches . get ( rightPos + + ) )
mismatchCount + + ;
// re-penalize the previous position
if ( mismatches . get ( currentPos + + ) )
mismatchCount + + ;
// don't penalize the current position
if ( mismatches . get ( currentPos ) )
mismatchCount - - ;
// subtract the leftmost position
if ( leftPos < currentPos - windowSize & & mismatches . get ( leftPos + + ) )
mismatchCount - - ;
if ( mismatchCount < = maxMismatches )
result . set ( currentPos ) ;
}
return result ;
}
2009-06-07 03:35:21 +08:00
/ * * Returns number of alignment blocks ( continuous stretches of aligned bases ) in the specified alignment .
* This method follows closely the SAMRecord : : getAlignmentBlocks ( ) implemented in samtools library , but
* it only counts blocks without actually allocating and filling the list of blocks themselves . Hence , this method is
* a much more efficient alternative to r . getAlignmentBlocks . size ( ) in the situations when this number is all that is needed .
* Formally , this method simply returns the number of M elements in the cigar .
* @param r alignment
* @return number of continuous alignment blocks ( i . e . 'M' elements of the cigar ; all indel and clipping elements are ignored ) .
* /
public static int getNumAlignmentBlocks ( final SAMRecord r ) {
int n = 0 ;
final Cigar cigar = r . getCigar ( ) ;
if ( cigar = = null ) return 0 ;
for ( final CigarElement e : cigar . getCigarElements ( ) ) {
if ( e . getOperator ( ) = = CigarOperator . M ) n + + ;
}
return n ;
}
2009-03-25 13:48:10 +08:00
2011-02-25 00:09:03 +08:00
public static int getNumAlignedBases ( final SAMRecord r ) {
int n = 0 ;
final Cigar cigar = r . getCigar ( ) ;
if ( cigar = = null ) return 0 ;
for ( final CigarElement e : cigar . getCigarElements ( ) ) {
if ( e . getOperator ( ) = = CigarOperator . M ) { n + = e . getLength ( ) ; }
}
return n ;
}
2010-07-20 03:10:29 +08:00
@Deprecated
2010-01-15 23:36:59 +08:00
public static char [ ] alignmentToCharArray ( final Cigar cigar , final char [ ] read , final char [ ] ref ) {
final char [ ] alignment = new char [ read . length ] ;
int refPos = 0 ;
int alignPos = 0 ;
2010-07-20 03:10:29 +08:00
for ( int iii = 0 ; iii < cigar . numCigarElements ( ) ; iii + + ) {
final CigarElement ce = cigar . getCigarElement ( iii ) ;
final int elementLength = ce . getLength ( ) ;
switch ( ce . getOperator ( ) ) {
case I :
case S :
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
alignment [ alignPos + + ] = '+' ;
}
break ;
case D :
case N :
refPos + + ;
break ;
case M :
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
alignment [ alignPos ] = ref [ refPos ] ;
alignPos + + ;
refPos + + ;
}
break ;
case H :
case P :
break ;
default :
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Unsupported cigar operator: " + ce . getOperator ( ) ) ;
2010-07-20 03:10:29 +08:00
}
}
return alignment ;
}
public static byte [ ] alignmentToByteArray ( final Cigar cigar , final byte [ ] read , final byte [ ] ref ) {
final byte [ ] alignment = new byte [ read . length ] ;
int refPos = 0 ;
int alignPos = 0 ;
2010-01-15 23:36:59 +08:00
for ( int iii = 0 ; iii < cigar . numCigarElements ( ) ; iii + + ) {
final CigarElement ce = cigar . getCigarElement ( iii ) ;
2010-05-26 04:04:33 +08:00
final int elementLength = ce . getLength ( ) ;
2010-01-15 23:36:59 +08:00
switch ( ce . getOperator ( ) ) {
case I :
case S :
2011-01-05 08:28:05 +08:00
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
2010-01-15 23:36:59 +08:00
alignment [ alignPos + + ] = '+' ;
}
break ;
case D :
case N :
refPos + + ;
break ;
case M :
2011-01-05 08:28:05 +08:00
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
2010-01-15 23:36:59 +08:00
alignment [ alignPos ] = ref [ refPos ] ;
alignPos + + ;
refPos + + ;
}
break ;
2010-05-26 04:04:33 +08:00
case H :
case P :
break ;
2010-01-15 23:36:59 +08:00
default :
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Unsupported cigar operator: " + ce . getOperator ( ) ) ;
2010-01-15 23:36:59 +08:00
}
}
return alignment ;
}
2011-01-05 08:28:05 +08:00
public static int calcAlignmentByteArrayOffset ( final Cigar cigar , int pileupOffset , final int alignmentStart , final int refLocus ) {
boolean atDeletion = false ;
if ( pileupOffset = = - 1 ) {
atDeletion = true ;
pileupOffset = refLocus - alignmentStart ;
final CigarElement ce = cigar . getCigarElement ( 0 ) ;
if ( ce . getOperator ( ) = = CigarOperator . S ) {
pileupOffset + = ce . getLength ( ) ;
}
}
int pos = 0 ;
int alignmentPos = 0 ;
for ( int iii = 0 ; iii < cigar . numCigarElements ( ) ; iii + + ) {
final CigarElement ce = cigar . getCigarElement ( iii ) ;
final int elementLength = ce . getLength ( ) ;
switch ( ce . getOperator ( ) ) {
case I :
case S :
2011-02-25 00:09:03 +08:00
pos + = elementLength ;
if ( pos > = pileupOffset ) {
return alignmentPos ;
2011-01-05 08:28:05 +08:00
}
break ;
case D :
case N :
if ( ! atDeletion ) {
2011-02-25 00:09:03 +08:00
alignmentPos + = elementLength ;
2011-01-05 08:28:05 +08:00
} else {
2011-02-25 00:09:03 +08:00
if ( pos + elementLength > = pileupOffset ) {
return alignmentPos + ( pileupOffset - pos ) ;
} else {
pos + = elementLength ;
alignmentPos + = elementLength ;
2011-01-05 08:28:05 +08:00
}
}
break ;
case M :
2011-02-25 00:09:03 +08:00
if ( pos + elementLength > = pileupOffset ) {
return alignmentPos + ( pileupOffset - pos ) ;
} else {
pos + = elementLength ;
alignmentPos + = elementLength ;
2011-01-05 08:28:05 +08:00
}
break ;
case H :
case P :
break ;
default :
throw new ReviewedStingException ( "Unsupported cigar operator: " + ce . getOperator ( ) ) ;
}
}
return alignmentPos ;
}
public static byte [ ] readToAlignmentByteArray ( final Cigar cigar , final byte [ ] read ) {
int alignmentLength = 0 ;
for ( int iii = 0 ; iii < cigar . numCigarElements ( ) ; iii + + ) {
final CigarElement ce = cigar . getCigarElement ( iii ) ;
final int elementLength = ce . getLength ( ) ;
switch ( ce . getOperator ( ) ) {
case I :
case S :
break ;
case D :
case N :
alignmentLength + = elementLength ;
break ;
case M :
alignmentLength + = elementLength ;
break ;
case H :
case P :
break ;
default :
throw new ReviewedStingException ( "Unsupported cigar operator: " + ce . getOperator ( ) ) ;
}
}
final byte [ ] alignment = new byte [ alignmentLength ] ;
int alignPos = 0 ;
int readPos = 0 ;
for ( int iii = 0 ; iii < cigar . numCigarElements ( ) ; iii + + ) {
final CigarElement ce = cigar . getCigarElement ( iii ) ;
final int elementLength = ce . getLength ( ) ;
switch ( ce . getOperator ( ) ) {
2011-02-25 00:09:03 +08:00
case I :
/ *
if ( alignPos > 0 ) {
if ( alignment [ alignPos - 1 ] = = BaseUtils . A ) { alignment [ alignPos - 1 ] = PileupElement . A_FOLLOWED_BY_INSERTION_BASE ; }
else if ( alignment [ alignPos - 1 ] = = BaseUtils . C ) { alignment [ alignPos - 1 ] = PileupElement . C_FOLLOWED_BY_INSERTION_BASE ; }
else if ( alignment [ alignPos - 1 ] = = BaseUtils . T ) { alignment [ alignPos - 1 ] = PileupElement . T_FOLLOWED_BY_INSERTION_BASE ; }
else if ( alignment [ alignPos - 1 ] = = BaseUtils . G ) { alignment [ alignPos - 1 ] = PileupElement . G_FOLLOWED_BY_INSERTION_BASE ; }
}
* /
2011-01-05 08:28:05 +08:00
case S :
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
readPos + + ;
}
break ;
case D :
case N :
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
alignment [ alignPos ] = PileupElement . DELETION_BASE ;
alignPos + + ;
}
break ;
case M :
for ( int jjj = 0 ; jjj < elementLength ; jjj + + ) {
alignment [ alignPos ] = read [ readPos ] ;
alignPos + + ;
readPos + + ;
}
break ;
case H :
case P :
break ;
default :
throw new ReviewedStingException ( "Unsupported cigar operator: " + ce . getOperator ( ) ) ;
}
}
return alignment ;
}
2009-09-03 01:00:39 +08:00
/ * *
* Due to ( unfortunate ) multiple ways to indicate that read is unmapped allowed by SAM format
* specification , one may need this convenience shortcut . Checks both ' read unmapped ' flag and
* alignment reference index / start .
2010-03-24 23:21:58 +08:00
* @param r record
* @return true if read is unmapped
2009-09-03 01:00:39 +08:00
* /
public static boolean isReadUnmapped ( final SAMRecord r ) {
if ( r . getReadUnmappedFlag ( ) ) return true ;
2009-09-15 00:04:19 +08:00
// our life would be so much easier if all sam files followed the specs. In reality,
// sam files (including those generated by maq or bwa) miss headers alltogether. When
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
// or vice versa.
if ( ( r . getReferenceIndex ( ) ! = null & & r . getReferenceIndex ( ) ! = SAMRecord . NO_ALIGNMENT_REFERENCE_INDEX
| | r . getReferenceName ( ) ! = null & & r . getReferenceName ( ) ! = SAMRecord . NO_ALIGNMENT_REFERENCE_NAME )
& & r . getAlignmentStart ( ) ! = SAMRecord . NO_ALIGNMENT_START ) return false ;
return true ;
2009-09-03 01:00:39 +08:00
}
2010-04-16 08:15:57 +08:00
2010-08-04 23:27:35 +08:00
/ * *
* Due to ( unfortunate ) multiple ways to indicate that read / mate is unmapped allowed by SAM format
* specification , one may need this convenience shortcut . Checks both ' mate unmapped ' flag and
* alignment reference index / start of the mate .
* @param r sam record for the read
* @return true if read ' s mate is unmapped
* /
public static boolean isMateUnmapped ( final SAMRecord r ) {
if ( r . getMateUnmappedFlag ( ) ) return true ;
// our life would be so much easier if all sam files followed the specs. In reality,
// sam files (including those generated by maq or bwa) miss headers alltogether. When
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
// or vice versa.
if ( ( r . getMateReferenceIndex ( ) ! = null & & r . getMateReferenceIndex ( ) ! = SAMRecord . NO_ALIGNMENT_REFERENCE_INDEX
| | r . getMateReferenceName ( ) ! = null & & r . getMateReferenceName ( ) ! = SAMRecord . NO_ALIGNMENT_REFERENCE_NAME )
& & r . getMateAlignmentStart ( ) ! = SAMRecord . NO_ALIGNMENT_START ) return false ;
return true ;
}
2010-10-08 23:13:01 +08:00
/ * * Returns true is read is mapped and mapped uniquely ( Q > 0 ) .
*
* @param read
* @return
* /
public static boolean isReadUniquelyMapped ( SAMRecord read ) {
return ( ! AlignmentUtils . isReadUnmapped ( read ) ) & & read . getMappingQuality ( ) > 0 ;
}
2010-04-16 08:15:57 +08:00
/ * * Returns the array of base qualitites in the order the bases were read on the machine ( i . e . always starting from
* cycle 1 ) . In other words , if the read is unmapped or aligned in the forward direction , the read ' s own base
* qualities are returned as stored in the SAM record ; if the read is aligned in the reverse direction , the array
* of read ' s base qualitites is inverted ( in this case new array is allocated and returned ) .
* @param read
* @return
* /
public static byte [ ] getQualsInCycleOrder ( SAMRecord read ) {
if ( isReadUnmapped ( read ) | | ! read . getReadNegativeStrandFlag ( ) ) return read . getBaseQualities ( ) ;
2010-05-20 22:05:13 +08:00
return Utils . reverse ( read . getBaseQualities ( ) ) ;
2010-04-16 08:15:57 +08:00
}
2010-04-24 00:12:34 +08:00
2010-07-22 23:44:25 +08:00
/ * * Returns the array of original base qualitites ( before recalibration ) in the order the bases were read on the machine ( i . e . always starting from
* cycle 1 ) . In other words , if the read is unmapped or aligned in the forward direction , the read ' s own base
* qualities are returned as stored in the SAM record ; if the read is aligned in the reverse direction , the array
* of read ' s base qualitites is inverted ( in this case new array is allocated and returned ) . If no original base qualities
* are available this method will throw a runtime exception .
* @param read
* @return
* /
public static byte [ ] getOriginalQualsInCycleOrder ( SAMRecord read ) {
if ( isReadUnmapped ( read ) | | ! read . getReadNegativeStrandFlag ( ) ) return read . getOriginalBaseQualities ( ) ;
return Utils . reverse ( read . getOriginalBaseQualities ( ) ) ;
}
2010-04-24 00:12:34 +08:00
/ * * Takes the alignment of the read sequence < code > readSeq < / code > to the reference sequence < code > refSeq < / code >
* starting at 0 - based position < code > refIndex < / code > on the < code > refSeq < / code > and specified by its < code > cigar < / code > .
* The last argument < code > readIndex < / code > specifies 0 - based position on the read where the alignment described by the
* < code > cigar < / code > starts . Usually cigars specify alignments of the whole read to the ref , so that readIndex is normally 0.
* Use non - zero readIndex only when the alignment cigar represents alignment of a part of the read . The refIndex in this case
* should be the position where the alignment of that part of the read starts at . In other words , both refIndex and readIndex are
* always the positions where the cigar starts on the ref and on the read , respectively .
*
* If the alignment has an indel , then this method attempts moving this indel left across a stretch of repetitive bases . For instance , if the original cigar
* specifies that ( any ) one AT is deleted from a repeat sequence TATATATA , the output cigar will always mark the leftmost AT
* as deleted . If there is no indel in the original cigar , or the indel position is determined unambiguously ( i . e . inserted / deleted sequence
* is not repeated ) , the original cigar is returned .
* @param cigar structure of the original alignment
* @param refSeq reference sequence the read is aligned to
* @param readSeq read sequence
* @param refIndex 0 - based alignment start position on ref
* @param readIndex 0 - based alignment start position on read
* @return a cigar , in which indel is guaranteed to be placed at the leftmost possible position across a repeat ( if any )
* /
public static Cigar leftAlignIndel ( Cigar cigar , final byte [ ] refSeq , final byte [ ] readSeq , final int refIndex , final int readIndex ) {
2010-05-26 04:04:33 +08:00
int indexOfIndel = - 1 ;
for ( int i = 0 ; i < cigar . numCigarElements ( ) ; i + + ) {
CigarElement ce = cigar . getCigarElement ( i ) ;
if ( ce . getOperator ( ) = = CigarOperator . D | | ce . getOperator ( ) = = CigarOperator . I ) {
// if there is more than 1 indel, don't left align
if ( indexOfIndel ! = - 1 )
return cigar ;
indexOfIndel = i ;
}
}
// if there is no indel or if the alignment starts with an insertion (so that there
// is no place on the read to move that insertion further left), we are done
if ( indexOfIndel < 1 ) return cigar ;
final int indelLength = cigar . getCigarElement ( indexOfIndel ) . getLength ( ) ;
byte [ ] altString = createIndelString ( cigar , indexOfIndel , refSeq , readSeq , refIndex , readIndex ) ;
2010-12-15 01:12:25 +08:00
if ( altString = = null )
return cigar ;
2010-05-26 04:04:33 +08:00
Cigar newCigar = cigar ;
for ( int i = 0 ; i < indelLength ; i + + ) {
newCigar = moveCigarLeft ( newCigar , indexOfIndel ) ;
byte [ ] newAltString = createIndelString ( newCigar , indexOfIndel , refSeq , readSeq , refIndex , readIndex ) ;
// check to make sure we haven't run off the end of the read
boolean reachedEndOfRead = cigarHasZeroSizeElement ( newCigar ) ;
if ( Arrays . equals ( altString , newAltString ) ) {
cigar = newCigar ;
i = - 1 ;
if ( reachedEndOfRead )
cigar = cleanUpCigar ( cigar ) ;
}
if ( reachedEndOfRead )
break ;
}
return cigar ;
}
private static boolean cigarHasZeroSizeElement ( Cigar c ) {
for ( CigarElement ce : c . getCigarElements ( ) ) {
if ( ce . getLength ( ) = = 0 )
return true ;
}
return false ;
}
private static Cigar cleanUpCigar ( Cigar c ) {
ArrayList < CigarElement > elements = new ArrayList < CigarElement > ( c . numCigarElements ( ) - 1 ) ;
for ( CigarElement ce : c . getCigarElements ( ) ) {
if ( ce . getLength ( ) ! = 0 & &
( elements . size ( ) ! = 0 | | ce . getOperator ( ) ! = CigarOperator . D ) ) {
elements . add ( ce ) ;
}
}
return new Cigar ( elements ) ;
}
private static Cigar moveCigarLeft ( Cigar cigar , int indexOfIndel ) {
// get the first few elements
ArrayList < CigarElement > elements = new ArrayList < CigarElement > ( cigar . numCigarElements ( ) ) ;
for ( int i = 0 ; i < indexOfIndel - 1 ; i + + )
elements . add ( cigar . getCigarElement ( i ) ) ;
// get the indel element and move it left one base
CigarElement ce = cigar . getCigarElement ( indexOfIndel - 1 ) ;
elements . add ( new CigarElement ( ce . getLength ( ) - 1 , ce . getOperator ( ) ) ) ;
elements . add ( cigar . getCigarElement ( indexOfIndel ) ) ;
2010-07-05 01:26:38 +08:00
if ( indexOfIndel + 1 < cigar . numCigarElements ( ) ) {
ce = cigar . getCigarElement ( indexOfIndel + 1 ) ;
elements . add ( new CigarElement ( ce . getLength ( ) + 1 , ce . getOperator ( ) ) ) ;
} else {
elements . add ( new CigarElement ( 1 , CigarOperator . M ) ) ;
}
2010-05-26 04:04:33 +08:00
// get the last few elements
for ( int i = indexOfIndel + 2 ; i < cigar . numCigarElements ( ) ; i + + )
elements . add ( cigar . getCigarElement ( i ) ) ;
return new Cigar ( elements ) ;
}
private static byte [ ] createIndelString ( final Cigar cigar , final int indexOfIndel , final byte [ ] refSeq , final byte [ ] readSeq , int refIndex , int readIndex ) {
CigarElement indel = cigar . getCigarElement ( indexOfIndel ) ;
int indelLength = indel . getLength ( ) ;
2010-07-09 04:05:16 +08:00
int totalRefBases = 0 ;
2010-05-26 04:04:33 +08:00
for ( int i = 0 ; i < indexOfIndel ; i + + ) {
CigarElement ce = cigar . getCigarElement ( i ) ;
int length = ce . getLength ( ) ;
switch ( ce . getOperator ( ) ) {
2010-07-08 12:42:32 +08:00
case M :
readIndex + = length ;
refIndex + = length ;
2010-07-09 04:05:16 +08:00
totalRefBases + = length ;
2010-07-08 12:42:32 +08:00
break ;
case S :
readIndex + = length ;
break ;
case N :
refIndex + = length ;
2010-07-09 04:05:16 +08:00
totalRefBases + = length ;
2010-07-08 12:42:32 +08:00
break ;
default :
break ;
2010-05-26 04:04:33 +08:00
}
}
2010-07-09 04:05:16 +08:00
// sometimes, when there are very large known indels, we won't have enough reference sequence to cover them
if ( totalRefBases + indelLength > refSeq . length )
indelLength - = ( totalRefBases + indelLength - refSeq . length ) ;
// the indel-based reference string
byte [ ] alt = new byte [ refSeq . length + ( indelLength * ( indel . getOperator ( ) = = CigarOperator . D ? - 1 : 1 ) ) ] ;
2010-12-15 01:12:25 +08:00
// add the bases before the indel, making sure it's not aligned off the end of the reference
if ( refIndex > alt . length )
return null ;
2010-05-26 04:04:33 +08:00
System . arraycopy ( refSeq , 0 , alt , 0 , refIndex ) ;
int currentPos = refIndex ;
// take care of the indel
if ( indel . getOperator ( ) = = CigarOperator . D ) {
refIndex + = indelLength ;
} else {
System . arraycopy ( readSeq , readIndex , alt , currentPos , indelLength ) ;
currentPos + = indelLength ;
}
2010-12-15 01:12:25 +08:00
// add the bases after the indel, making sure it's not aligned off the end of the reference
if ( refSeq . length - refIndex > alt . length - currentPos )
return null ;
2010-05-26 04:04:33 +08:00
System . arraycopy ( refSeq , refIndex , alt , currentPos , refSeq . length - refIndex ) ;
return alt ;
}
2009-03-25 13:48:10 +08:00
}