From dbd3d4473e4f10218ea65a1eb9ae90fbac7461b2 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 27 Jan 2010 20:44:15 +0000 Subject: [PATCH] Making a copy here before changing the live copy git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2709 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/oldindels/AlignmentUtils.java | 412 ++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java diff --git a/archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java b/archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java new file mode 100644 index 000000000..d8d263b9e --- /dev/null +++ b/archive/java/src/org/broadinstitute/sting/oldindels/AlignmentUtils.java @@ -0,0 +1,412 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.pileup.*; + + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Mar 25, 2009 + * Time: 12:15:38 AM + * To change this template use File | Settings | File Templates. + */ +public class AlignmentUtils { + + + /** Returns number of mismatches in the alignment r to the reference sequence + * refSeq. It is assumed that + * the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches + * are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do + * not contribute to the error count returned by this method. + * @param r aligned read + * @param refSeq reference sequence + * @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped) + */ + public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) { + byte[] ref = refSeq.getBases(); + if ( r.getReadUnmappedFlag() ) return 1000000; + int i_ref = r.getAlignmentStart()-1; // position on the ref + int i_read = 0; // position on the read + int mm_count = 0; // number of mismatches + Cigar c = r.getCigar(); + for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { + CigarElement ce = c.getCigarElement(k); + switch( ce.getOperator() ) { + case M: + for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { + char refChr = (char)ref[i_ref]; + char readChr = (char)r.getReadBases()[i_read]; + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mm_count++; + } + break; + case I: + case S: + i_read += ce.getLength(); + break; + case D: + case N: + i_ref += ce.getLength(); + break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + + } + return mm_count; + } + + /** + * mhanna - 11 May 2009 - stubbed out competing method that works with partial references. + * Computes number of mismatches in the read alignment to the refence ref + * specified in the record r. Indels are completely ignored by this method: + * only base mismatches in the alignment segments where both sequences are present are counted. + * @param r + * @return + */ + public static int numMismatches(SAMRecord r, char[] ref) { + if ( r.getReadUnmappedFlag() ) return 1000000; + int i_ref = 0; // position on the ref + int i_read = 0; // position on the read + int mm_count = 0; // number of mismatches + Cigar c = r.getCigar(); + for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { + CigarElement ce = c.getCigarElement(k); + switch( ce.getOperator() ) { + case M: + for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { + char refChr = ref[i_ref]; + char readChr = (char)r.getReadBases()[i_read]; + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mm_count++; + } + break; + case I: + case S: + i_read += ce.getLength(); + break; + case D: + case N: + i_ref += ce.getLength(); + break; + default: throw new RuntimeException("Unrecognized cigar element: " + ce.getOperator()); + } + + } + return mm_count; + } + + // IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED + // TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO + // PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY... + /** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality + * for reference sequence specified as conventional java string (of bases). By default, it is assumed that + * the alignment starts at (1-based) position r.getAlignmentStart() on the reference refSeq. + * See {@link #numMismatches(SAMRecord, String, int)} if this is not the case. + */ + public static int numMismatches(SAMRecord r, String refSeq ) { + if ( r.getReadUnmappedFlag() ) return 1000000; + return numMismatches(r, refSeq, r.getAlignmentStart()-1); + } + + /** Returns number of mismatches in the alignment r to the reference sequence + * refSeq assuming the alignment starts at (ZERO-based) position refIndex on the + * specified reference sequence; in other words, refIndex is used in place of alignment's own + * getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment r + * (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected. + * + * @param r alignment + * @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of + * the reference string, IndexOutOfBound exception will be thrown at runtime). + * @param refIndex zero-based position, at which the alignment starts on the specified reference string. + * @return the number of mismatches + */ + public static int numMismatches(SAMRecord r, String refSeq, int refIndex) { + int readIdx = 0; + int mismatches = 0; + byte[] readSeq = r.getReadBases(); + Cigar c = r.getCigar(); + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + switch ( ce.getOperator() ) { + case M: + for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { + if ( refIndex >= refSeq.length() ) + continue; + char refChr = refSeq.charAt(refIndex); + char readChr = (char)readSeq[readIdx]; + // Note: we need to count X/N's as mismatches because that's what SAM requires + //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + // continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mismatches++; + } + break; + case I: + case S: + readIdx += ce.getLength(); + break; + case D: + case N: + refIndex += ce.getLength(); + break; + default: throw new StingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + + } + return mismatches; + } + + /** Returns the number of mismatches in the pileup within the given reference context. + * + * @param pileup the pileup with reads + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { + int mismatches = 0; + for ( PileupElement p : pileup ) + mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); + return mismatches; + } + + /** Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { + + int mismatches = 0; + + int windowStart = (int)ref.getWindow().getStart(); + int windowStop = (int)ref.getWindow().getStop(); + char[] refBases = ref.getBases(); + byte[] readBases = p.getRead().getReadBases(); + Cigar c = p.getRead().getCigar(); + + int readIndex = 0; + int currentPos = p.getRead().getAlignmentStart(); + int refIndex = Math.max(0, currentPos - windowStart); + + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { + // are we past the ref window? + if ( currentPos > windowStop ) + break; + + // are we before the ref window? + if ( currentPos < windowStart ) + continue; + + char refChr = refBases[refIndex++]; + + // do we need to skip the target site? + if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) + continue; + + char readChr = (char)readBases[readIndex]; + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mismatches++; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + currentPos += cigarElementLength; + if ( currentPos > windowStart ) + refIndex += Math.min(cigarElementLength, currentPos - windowStart); + break; + default: + // fail silently + return 0; + } + + } + + return mismatches; + } + + /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. + * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but + * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is + * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. + * Formally, this method simply returns the number of M elements in the cigar. + * @param r alignment + * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). + */ + public static int getNumAlignmentBlocks(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) { + if (e.getOperator() == CigarOperator.M ) n++; + } + + return n; + } + + public static String toString(Cigar cig) { + StringBuilder b = new StringBuilder(); + + for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) { + char c='?'; + switch ( cig.getCigarElement(i).getOperator() ) { + case M : c = 'M'; break; + case D : c = 'D'; break; + case I : c = 'I'; break; + } + b.append(cig.getCigarElement(i).getLength()); + b.append(c); + } + return b.toString(); + } + + + public static String alignmentToString(final Cigar cigar,final String seq, final String ref, final int posOnRef ) { + return alignmentToString( cigar, seq, ref, posOnRef, 0 ); + } + + public static String cigarToString(Cigar cig) { + if ( cig == null ) + return "null"; + + StringBuilder b = new StringBuilder(); + + for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) { + char c='?'; + switch ( cig.getCigarElement(i).getOperator() ) { + case M : c = 'M'; break; + case D : c = 'D'; break; + case I : c = 'I'; break; + } + b.append(cig.getCigarElement(i).getLength()); + b.append(c); + } + return b.toString(); + } + + public static String alignmentToString(final Cigar cigar,final String seq, final String ref, final int posOnRef, final int posOnRead ) { + int readPos = posOnRead; + int refPos = posOnRef; + + StringBuilder refLine = new StringBuilder(); + StringBuilder readLine = new StringBuilder(); + + for ( int i = 0 ; i < posOnRead ; i++ ) { + refLine.append( ref.charAt( refPos - readPos + i ) ); + readLine.append( seq.charAt(i) ) ; + } + + for ( int i = 0 ; i < cigar.numCigarElements() ; i++ ) { + + final CigarElement ce = cigar.getCigarElement(i); + + switch(ce.getOperator()) { + case I: + for ( int j = 0 ; j < ce.getLength(); j++ ) { + refLine.append('+'); + readLine.append( seq.charAt( readPos++ ) ); + } + break; + case D: + for ( int j = 0 ; j < ce.getLength(); j++ ) { + readLine.append('*'); + refLine.append( ref.charAt( refPos++ ) ); + } + break; + case M: + for ( int j = 0 ; j < ce.getLength(); j++ ) { + refLine.append(ref.charAt( refPos++ ) ); + readLine.append( seq.charAt( readPos++ ) ); + } + break; + default: throw new StingException("Unsupported cigar operator: "+ce.getOperator() ); + } + } + refLine.append('\n'); + refLine.append(readLine); + refLine.append('\n'); + return refLine.toString(); + } + + public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) { + + final char[] alignment = new char[read.length]; + int refPos = 0; + int alignPos = 0; + + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + + switch( ce.getOperator() ) { + case I: + case S: + for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) { + alignment[alignPos++] = '+'; + } + break; + case D: + case N: + refPos++; + break; + case M: + for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) { + alignment[alignPos] = ref[refPos]; + alignPos++; + refPos++; + } + break; + default: + throw new StingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignment; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and + * alignment reference index/start. + * @param r + * @return + */ + public static boolean isReadUnmapped(final SAMRecord r) { + if ( r.getReadUnmappedFlag() ) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers alltogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. + if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getReferenceName() != null && r.getReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME ) + && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; + return true; + } +}