From 758db73b98bedae85e69107c6ae6359fe29811bf Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 17 Apr 2009 20:10:34 +0000 Subject: [PATCH] Fixed SLOWNESS issue. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@469 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/indels/AlignmentUtils.java | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java index 874940fdc..5db2e63cd 100644 --- a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java @@ -21,18 +21,11 @@ public class AlignmentUtils { * specified in the record r. Indels are completely ignored by this method: * only base mismatches in the alignment segments where both sequences are present are counted. * @param r - * @param ref + * @param refseq * @return */ - public static int numMismatches(SAMRecord r, ReferenceSequence ref) { - return numMismatches(r, ref.getBases()); - } - - public static int numMismatches(SAMRecord r, String ref ) { - return numMismatches(r,ref.getBytes()); - } - - public static int numMismatches(SAMRecord r, byte[] ref) { + public static int numMismatches(SAMRecord r, ReferenceSequence refseq) { + byte[] ref = refseq.getBases(); if ( r.getReadUnmappedFlag() ) return 1000000; int i_ref = r.getAlignmentStart()-1; // position on the ref int i_read = 0; // position on the read @@ -57,6 +50,34 @@ public class AlignmentUtils { return mm_count; } + // IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED + // TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO + // PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY... + public static int numMismatches(SAMRecord r, String ref ) { + if ( r.getReadUnmappedFlag() ) return 1000000; + int i_ref = r.getAlignmentStart()-1; // position on the ref + int i_read = 0; // position on the read + int mm_count = 0; // number of mismatches + Cigar c = r.getCigar(); + for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { + CigarElement ce = c.getCigarElement(k); + switch( ce.getOperator() ) { + case M: + for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ? + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != + Character.toUpperCase(ref.charAt(i_ref)) ) mm_count++; + } + break; + case I: i_read += ce.getLength(); break; + case D: i_ref += ce.getLength(); break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + + } + return mm_count; + } + /** Reads through the alignment cigar and returns all indels found in the alignment as a collection * of Indel objects. * @param c