From 92b054b71b080c4c9ea2842f09a1ebdf4fe6467b Mon Sep 17 00:00:00 2001 From: asivache Date: Sat, 6 Jun 2009 18:07:48 +0000 Subject: [PATCH] moved another variant of numMismatches to AlignmentUtils git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@922 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/indels/AlignmentUtils.java | 75 +++++++++++++------ 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java index dcf25b03f..b98fc801b 100644 --- a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java @@ -5,6 +5,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.playground.utils.CountedObject; +import org.broadinstitute.sting.utils.StingException; import java.util.*; @@ -17,15 +18,18 @@ import java.util.*; */ public class AlignmentUtils { - /** Computes number of mismatches in the read alignment to the refence ref - * specified in the record r. Indels are completely ignored by this method: - * only base mismatches in the alignment segments where both sequences are present are counted. - * @param r - * @param refseq - * @return + + /** Returns number of mismatches in the alignment r to the reference sequence + * refSeq. It is assumed that + * the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches + * are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do + * not contribute to the error count returned by this method. + * @param r aligned read + * @param refSeq reference sequence + * @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped) */ - public static int numMismatches(SAMRecord r, ReferenceSequence refseq) { - byte[] ref = refseq.getBases(); + public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) { + byte[] ref = refSeq.getBases(); if ( r.getReadUnmappedFlag() ) return 1000000; int i_ref = r.getAlignmentStart()-1; // position on the ref int i_read = 0; // position on the read @@ -86,31 +90,56 @@ public class AlignmentUtils { // IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED // TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO // PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY... - public static int numMismatches(SAMRecord r, String ref ) { + /** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality + * for reference sequence specified as conventional java string (of bases). By default, it is assumed that + * the alignment starts at (1-based) position r.getAlignmentStart() on the reference refSeq. + * See {@link #numMismatches(SAMRecord, String, int)} if this is not the case. + */ + public static int numMismatches(SAMRecord r, String refSeq ) { if ( r.getReadUnmappedFlag() ) return 1000000; - int i_ref = r.getAlignmentStart()-1; // position on the ref - int i_read = 0; // position on the read - int mm_count = 0; // number of mismatches + return numMismatches(r, refSeq, r.getAlignmentStart()-1); + } + + /** Returns number of mismatches in the alignment r to the reference sequence + * refSeq assuming the alignment starts at (ZERO-based) position refIndex on the + * specified reference sequence; in other words, refIndex is used in place of alignment's own + * getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment r + * (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected. + * + * @param r alignment + * @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of + * the reference string, IndexOutOfBound exception will be thrown at runtime). + * @param refIndex zero-based position, at which the alignment starts on the specified reference string. + * @return + */ + public static int numMismatches(SAMRecord r, String refSeq, int refIndex) { + int readIdx = 0; + int mismatches = 0; + String readSeq = r.getReadString(); Cigar c = r.getCigar(); - for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { - CigarElement ce = c.getCigarElement(k); - switch( ce.getOperator() ) { + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + switch ( ce.getOperator() ) { case M: - for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ? - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != - Character.toUpperCase(ref.charAt(i_ref)) ) mm_count++; + for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { + if ( Character.toUpperCase(readSeq.charAt(readIdx)) != Character.toUpperCase(refSeq.charAt(refIndex)) ) + mismatches++; } break; - case I: i_read += ce.getLength(); break; - case D: i_ref += ce.getLength(); break; - default: throw new RuntimeException("Unrecognized cigar element"); + case I: + readIdx += ce.getLength(); + break; + case D: + refIndex += ce.getLength(); + break; + default: throw new StingException("Only M,I,D cigar elements are currently supported"); } } - return mm_count; + return mismatches; } + /** Reads through the alignment cigar and returns all indels found in the alignment as a collection * of Indel objects. * @param c