Changed reads traversals from providing a LocusContext from which the reference sequence

could be extracted to a char[] containing the reference bases. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@657 348d0f76-0448-11de-a6fe-93d51630548a
2009-05-11 22:45:11 +00:00 · 2009-05-11 22:45:11 +00:00 · 23e9e29964
parent 052819bed5
commit 23e9e29964
21 changed files with 115 additions and 61 deletions
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceProvider.java
@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
 import edu.mit.broad.picard.reference.ReferenceSequence;
 import net.sf.samtools.util.StringUtil;
 import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMSequenceRecord;

 /**
 * Created by IntelliJ IDEA.
@ -54,6 +55,8 @@ public class ReferenceProvider {
     * Gets the bases of the reference that are aligned to the given read.
     * @param read the read for which to extract reference information.
     * @return The bases corresponding to this read, or null if the read is unmapped.
+     *         If the alignment goes off the end of the contig, return just the portion
+     *         mapped to the reference.
     */
    public char[] getReferenceBases( SAMRecord read ) {
        if( read.getReadUnmappedFlag() )
@ -63,6 +66,10 @@ public class ReferenceProvider {
        int start = read.getAlignmentStart();
        int stop = read.getAlignmentEnd();

+        SAMSequenceRecord sequenceRecord = sequenceFile.getSequenceDictionary().getSequence(contig);
+        if( stop > sequenceRecord.getSequenceLength() )
+            stop = sequenceRecord.getSequenceLength();
+
        ReferenceSequence alignmentToReference = sequenceFile.getSubsequenceAt( contig, start, stop );
        return StringUtil.bytesToString(alignmentToReference.getBases()).toCharArray();
    }
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java
@ -17,6 +17,7 @@ import java.util.LinkedList;
 import java.io.File;

 import net.sf.samtools.SAMRecord;
+import net.sf.samtools.util.StringUtil;

 /**
 * Created by IntelliJ IDEA.
@ -84,11 +85,21 @@ public class TraverseByReads extends TraversalEngine {
            final List<SAMRecord> reads = Arrays.asList(read);
            GenomeLoc loc = new GenomeLoc(read);

+            char[] refBases = null;
+
            // Jump forward in the reference to this locus location
            LocusContext locus = new LocusContext(loc, reads, offsets);
            if (!loc.isUnmapped() && refIter != null) {
                final ReferenceIterator refSite = refIter.seekForward(loc);
                locus.setReferenceContig(refSite.getCurrentContig());
+
+                // MAQ alignments sometimes go off the end of the reference.  Take this
+                // into account by only returning a shortened version of the reference.                
+                int refStart = read.getAlignmentStart()-1;
+                int refLength = Math.min( read.getAlignmentEnd() - read.getAlignmentStart() + 1,
+                                          refSite.getCurrentContig().length() - read.getAlignmentStart() );
+
+                refBases = StringUtil.bytesToString( refSite.getCurrentContig().getBases(),refStart,refLength ).toCharArray();
            }

            GenomeLoc.removePastLocs(loc, notYetTraversedLocations);
@ -97,9 +108,9 @@ public class TraverseByReads extends TraversalEngine {
                //
                // execute the walker contact
                //
-                final boolean keepMeP = walker.filter(locus, read);
+                final boolean keepMeP = walker.filter(refBases, read);
                if (keepMeP) {
-                    M x = walker.map(locus, read);
+                    M x = walker.map(refBases, read);
                    sum = walker.reduce(x, sum);
                }

--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java
@ -112,9 +112,9 @@ public class TraverseReads extends TraversalEngine {
            // update the number of reads we've seen
            TraversalStatistics.nRecords++;

-            final boolean keepMeP = readWalker.filter(locus, read);
+            final boolean keepMeP = readWalker.filter(refSeq, read);
            if (keepMeP) {
-                M x = readWalker.map(locus, read);
+                M x = readWalker.map(refSeq, read);
                sum = readWalker.reduce(x, sum);
            }

--- a/java/src/org/broadinstitute/sting/gatk/walkers/CountReadsWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/CountReadsWalker.java
@ -4,7 +4,7 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.LocusContext;

 public class CountReadsWalker extends ReadWalker<Integer, Integer> {
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        //System.out.println(read.format());
        return 1;
    }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
@ -4,7 +4,7 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.LocusContext;

 public class PrintReadsWalker extends ReadWalker<Integer, Integer> {
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        out.println(read.format());
        return 1;
    }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java
@ -17,11 +17,11 @@ public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, Re
    /** Must return true for reads that need to be processed. Reads, for which this method return false will
     * be skipped by the engine and never passed to the walker.
     */
-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
        // We are keeping all the reads
        return true;
    }

    // Map over the org.broadinstitute.sting.gatk.LocusContext
-    public abstract MapType map(LocusContext context, SAMRecord read);
+    public abstract MapType map(char[] ref, SAMRecord read);
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlignedReadsHistoWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlignedReadsHistoWalker.java
@ -22,16 +22,14 @@ public class AlignedReadsHistoWalker extends ReadWalker<Integer, Integer> {
        }
    }

-    public String walkerType() { return "ByRead"; }
-
    // Do we actually want to operate on the context?
-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
 	    // we only want aligned reads
 	    return !read.getReadUnmappedFlag();
    }

    // Map over the org.broadinstitute.sting.atk.LocusContext
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        //System.out.println(read.getAttribute("NM"));
        int editDist = Integer.parseInt(read.getAttribute("NM").toString());
        if (editDist <= 50)
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityDumpWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityDumpWalker.java
@ -12,12 +12,12 @@ public class BaseQualityDumpWalker extends ReadWalker<Integer, Integer> {
    protected final int MAX_TARGET_EDIT_DISTANCE = 4; //10;

    // Do we actually want to operate on the context?
-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
 	    // we only want aligned reads
 	    return !read.getReadUnmappedFlag();
    }

-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {

        int editDist = Integer.parseInt(read.getAttribute("NM").toString());

--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityHistoWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityHistoWalker.java
@ -23,12 +23,12 @@ public class BaseQualityHistoWalker extends ReadWalker<Integer, Integer> {
    }

    // Do we actually want to operate on the context?
-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
        return true;    // We are keeping all the reads
    }

    // Map over the org.broadinstitute.sting.gatk.LocusContext
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        for ( byte qual : read.getBaseQualities() ) {
            //System.out.println(qual);
            this.qualCounts[qual]++;
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DisplayFourBaseReadWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DisplayFourBaseReadWalker.java
@ -6,7 +6,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
 import net.sf.samtools.SAMRecord;

 public class DisplayFourBaseReadWalker extends ReadWalker<Integer, Integer> {
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        String bases = read.getReadString();

        boolean displayed = false;
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java
@ -39,7 +39,7 @@ public class IOCrusherWalker extends ReadWalker<SAMRecord, ArrayList<SAMFileWrit
    /**
     *
     */
-    public SAMRecord map(LocusContext context, SAMRecord read) {
+    public SAMRecord map(char[] ref, SAMRecord read) {
        nReadsRead++;
        return read;
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IndelInspector.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IndelInspector.java
@ -75,7 +75,7 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
        }
    }

-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {

    	total_reads++;
 
@ -84,7 +84,8 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
    		ptWriter.receive(read); // make sure we still write the read to the output, we do not want to lose data!
    		return 0;
    	}
-    	
+
+        /* 11 May 2009 - Commented out because we no longer have the full reference sequence for sanity checking.
        ReferenceSequence contig_seq = context.getReferenceContig();
        if ( read.getReferenceName() != cur_contig) {
            cur_contig = read.getReferenceName();
@ -94,6 +95,7 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
                pileBuilder.setReferenceSequence(refstr);
            out.println("loaded contig "+cur_contig+" (index="+read.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
        }
+        */

        //    if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now

@ -138,10 +140,10 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
 */

        if ( ERR_MODE != null ) {
-            if ( ERR_MODE.equals("MM"))       err = numMismatches(read,contig_seq);
-            else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(read,contig_seq);
-            else if ( ERR_MODE.equals("ERR")) err = numErrors(read,contig_seq);
-            else if ( ERR_MODE.equals("MG"))  err = numMismatchesGaps(read,contig_seq);
+            if ( ERR_MODE.equals("MM"))       err = numMismatches(read,ref);
+            else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(read,ref);
+            else if ( ERR_MODE.equals("ERR")) err = numErrors(read,ref);
+            else if ( ERR_MODE.equals("MG"))  err = numMismatchesGaps(read,ref);
            if ( err > MAX_ERRS ) {
            	ptWriter.receive(read);
            	discarded_maxerr_count++;
@ -159,10 +161,10 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
 	 * @return number of errors (number of mismatches plus total length of all insertions/deletions
 	 * @throws RuntimeException
 	 */
-    private static int numErrors(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
+    private static int numErrors(SAMRecord r, char[] ref) throws RuntimeException {

 		// NM currently stores the total number of mismatches in all blocks + 1
-        int errs = numMismatches(r,refseq);
+        int errs = numMismatches(r,ref);

 		// now we have to add the total length of all indels:
 		Cigar c = r.getCigar();
@ -187,10 +189,10 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
 	 * deletion will be counted as a single error regardless of the length)
 	 * @throws RuntimeException
 	 */
-    private static int numMismatchesGaps(SAMRecord r,ReferenceSequence refseq) throws RuntimeException {
+    private static int numMismatchesGaps(SAMRecord r,char[] ref) throws RuntimeException {

 		// NM currently stores the total number of mismatches in all blocks + 1
-        int errs = numMismatches(r,refseq);
+        int errs = numMismatches(r,ref);

 		// now we have to add the total length of all indels:
 		Cigar c = r.getCigar();
@ -209,7 +211,7 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
 	}

    /** This method is a HACK: it is designed to work around the current bug in NM tags created  at CRD */
-    private static int numMismatches(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
+    private static int numMismatches(SAMRecord r, char[] refseq) throws RuntimeException {

        // NM currently stores the total number of mismatches in all blocks + 1
        Integer i = (Integer)r.getAttribute("NM");
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IndelIntervalWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IndelIntervalWalker.java
@ -24,18 +24,21 @@ public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval

    public void initialize() {}

-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
        return ( !read.getReadUnmappedFlag() &&            // mapped
                 read.getReadLength() <= maxReadLength &&  // not too big
                 read.getAlignmentBlocks().size() > 1);    // indel
    }

-    public Interval map(LocusContext context, SAMRecord read) {
+    public Interval map(char[] ref, SAMRecord read) {
        List<AlignmentBlock> blocks = read.getAlignmentBlocks();
        long indelLeftEdge = read.getAlignmentStart() + blocks.get(0).getLength() - 1;
        long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1;
-        GenomeLoc indelLoc = new GenomeLoc(context.getLocation().getContigIndex(), indelLeftEdge, indelRightEdge);
-        return new Interval(context.getLocation(), indelLoc);
+
+        GenomeLoc indelLoc = new GenomeLoc(read.getReferenceIndex(), indelLeftEdge, indelRightEdge);
+        GenomeLoc refLoc = new GenomeLoc(read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd());
+
+        return new Interval(refLoc, indelLoc);
    }

    public Interval reduceInit() {
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java
@ -90,7 +90,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
    }


-    public SAMRecord map(LocusContext context, SAMRecord read) {
+    public SAMRecord map(char[] ref, SAMRecord read) {
        SAMRecord recalRead = read;
        byte[] bases = read.getReadBases();
        byte[] quals = read.getBaseQualities();
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchCounterWalker.java
@ -13,20 +13,18 @@ import java.util.List;

@WalkerName("CountMismatches")
 public class MismatchCounterWalker extends ReadWalker<Integer, Integer> {
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        int nMismatches = 0;
        
-        ReferenceSequence refseq = context.getReferenceContig();
-
        int start = read.getAlignmentStart()-1;
        int stop  = read.getAlignmentEnd();
        // sometimes BWA outputs screwy reads
-        if ( stop >= context.getReferenceContig().getBases().length )
+        if ( stop - start > ref.length )
            return 0;

        if ( read.getAlignmentBlocks().size() == 1 ) {
            // No indels
-            List<Byte> refSeq = Utils.subseq(context.getReferenceContig().getBases(), start, stop);
+            List<Byte> refSeq = Utils.subseq(ref);
            List<Byte> readBases = Utils.subseq(read.getReadBases());

            assert(refSeq.size() == readBases.size());
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchHistoWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchHistoWalker.java
@ -19,12 +19,12 @@ public class MismatchHistoWalker extends ReadWalker<Integer, Integer> {
    protected final int MAX_TARGET_EDIT_DISTANCE = 10;

    // Do we actually want to operate on the context?
-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
 	    // we only want aligned reads
 	    return !read.getReadUnmappedFlag();
    }

-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {

        int editDist = Integer.parseInt(read.getAttribute("NM").toString());

@ -33,20 +33,18 @@ public class MismatchHistoWalker extends ReadWalker<Integer, Integer> {
             editDist >= MIN_TARGET_EDIT_DISTANCE &&
             editDist <= MAX_TARGET_EDIT_DISTANCE ) {

-            ReferenceSequence refseq = context.getReferenceContig();
-
            int start = read.getAlignmentStart()-1;
            int stop  = read.getAlignmentEnd();
            // sometimes BWA outputs screwy reads
-            if ( stop >= context.getReferenceContig().getBases().length )
+            if ( stop - start > ref.length )
                return 0;

-            List<Byte> refSeq = Utils.subseq(context.getReferenceContig().getBases(), start, stop);
+            List<Byte> refSeq = Utils.subseq(ref);
            List<Byte> readBases = Utils.subseq(read.getReadBases());
            assert(refSeq.size() == readBases.size());

            // it's actually faster to reallocate a resized array than to use ArrayLists...
-            if ( refSeq.size() > mismatchCounts.length ) {
+            if ( ref.length > mismatchCounts.length ) {
                int oldLength = mismatchCounts.length;
                mismatchCounts = (long[])resizeArray(mismatchCounts, refSeq.size());
                for ( int i = oldLength; i < refSeq.size(); i++ )
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadErrorRateWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadErrorRateWalker.java
@ -27,12 +27,11 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
    /**
     * Ignore reads with indels or clipping
     *
-     * @param context  the reference context
     * @param read     the read to assess
     * @return true if the read can be processed, false if it should be ignored
     */
-    public boolean filter(LocusContext context, SAMRecord read) {
-        return (read.getCigar().numCigarElements() == 1 && read.getAlignmentStart() + read.getReadLength() < context.getReferenceContig().length());
+    public boolean filter(char[] ref, SAMRecord read) {
+        return (read.getCigar().numCigarElements() == 1 && read.getReadLength() > ref.length);
    }

    /**
@ -41,17 +40,15 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
     * of this array is always "true" so that we can figure out how many reads we
     * processed in a thread-safe manner.
     * 
-     * @param context  the reference context
     * @param read     the read to assess
     * @return An array of length (read_length + 1) indicating where the mismatches occur.
     *         Last element is for internal use so the reduce() function can figure out how
     *         many reads we processed.
     */
-    public boolean[] map(LocusContext context, SAMRecord read) {
+    public boolean[] map(char[] ref, SAMRecord read) {
        boolean[] errorsPerCycle = new boolean[read.getReadLength() + 1];

        byte[] bases  = read.getReadBases();
-        byte[] contig = context.getReferenceContig().getBases();
        byte[] sq     = (byte[]) read.getAttribute("SQ");

        if (printVisualHits) {
@ -61,16 +58,16 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
            }
            System.out.println();

-            for (int cycle = 0, offset = (int) context.getPosition() - 1; cycle < bases.length; cycle++, offset++) {
-                byte compBase = convertIUPACBaseToSimpleBase(contig[offset]);
+            for (int cycle = 0; cycle < bases.length; cycle++) {
+                byte compBase = convertIUPACBaseToSimpleBase((byte)ref[cycle]);

                System.out.print((char) compBase);
            }
            System.out.println("\n");
        }

-        for (int cycle = 0, offset = (int) context.getPosition() - 1; cycle < bases.length; cycle++, offset++) {
-            byte compBase = convertIUPACBaseToSimpleBase(contig[offset]);
+        for (int cycle = 0; cycle < bases.length; cycle++) {
+            byte compBase = convertIUPACBaseToSimpleBase((byte)ref[cycle]);

            if (compBase != '.') {
                if (useNextBestBase || useNextRandomBase) {
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadFilterWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReadFilterWalker.java
@ -23,12 +23,12 @@ public class ReadFilterWalker extends ReadWalker<Integer,Integer> {
    }

 	@Override
-    public boolean filter(LocusContext context, SAMRecord read) {
+    public boolean filter(char[] ref, SAMRecord read) {
 		return read.getReadLength() <= max_len;
 	}

 	@Override
-	public Integer map(LocusContext context, SAMRecord read) {
+	public Integer map(char[] ref, SAMRecord read) {
 		writer.addAlignment(read);
 		return 1;
 	}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SAMToFastqAndSqWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SAMToFastqAndSqWalker.java
@ -19,7 +19,7 @@ public class SAMToFastqAndSqWalker extends ReadWalker<Integer, Integer> {

    private boolean ready = false;
    
-    public Integer map(LocusContext context, SAMRecord read) {
+    public Integer map(char[] ref, SAMRecord read) {
        if (!ready) {
            try {
                fastqbw = new PrintWriter(FASTQFILE);
--- a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java
@ -50,6 +50,39 @@ public class AlignmentUtils {
        return mm_count;
    }

+    /**
+     * mhanna - 11 May 2009 - stubbed out competing method that works with partial references. 
+     * Computes number of mismatches in the read alignment to the refence <code>ref</code>
+     * specified in the record <code>r</code>. Indels are completely <i>ignored</i> by this method:
+     * only base mismatches in the alignment segments where both sequences are present are counted.
+     * @param r
+     * @return
+     */
+    public static int numMismatches(SAMRecord r, char[] ref) {
+        if ( r.getReadUnmappedFlag() ) return 1000000;
+        int i_ref = 0; // position on the ref
+        int i_read = 0;                    // position on the read
+        int mm_count = 0; // number of mismatches
+        Cigar c = r.getCigar();
+        for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
+            CigarElement ce = c.getCigarElement(k);
+            switch( ce.getOperator() ) {
+                case M:
+                    for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
+                        if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ?
+                        if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
+                             Character.toUpperCase(ref[i_ref]) ) mm_count++;
+                    }
+                    break;
+                case I: i_read += ce.getLength(); break;
+                case D: i_ref += ce.getLength(); break;
+                default: throw new RuntimeException("Unrecognized cigar element");
+            }
+
+        }
+        return mm_count;
+    }
+
    // IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED
    // TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO
    // PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY...
--- a/java/src/org/broadinstitute/sting/utils/Utils.java
+++ b/java/src/org/broadinstitute/sting/utils/Utils.java
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils;
 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMSequenceRecord;
 import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.util.StringUtil;
 import edu.mit.broad.picard.reference.ReferenceSequenceFile;

 import java.util.*;
@ -119,14 +120,20 @@ public class Utils {
        return c;
    }

+    public static ArrayList<Byte> subseq(char[] fullArray) {
+        byte[] fullByteArray = new byte[fullArray.length];
+        StringUtil.charsToBytes(fullArray,0,fullArray.length,fullByteArray,0);
+        return subseq(fullByteArray);
+    }
+
    public static ArrayList<Byte> subseq(byte[] fullArray) {
-        return subseq(fullArray, 0, fullArray.length);
+        return subseq(fullArray, 0, fullArray.length-1);
    }

    public static ArrayList<Byte> subseq(byte[] fullArray, int start, int end) {
        assert end < fullArray.length;
        ArrayList<Byte> dest = new ArrayList<Byte>(end - start + 1);
-        for (int i = start; i < end; i++) {
+        for (int i = start; i <= end; i++) {
            dest.add(fullArray[i]);
        }
        return dest;