From fddca032bbbbf4759eb1403c8e7e5a9ae49fa220 Mon Sep 17 00:00:00 2001
From: ebanks <ebanks@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Wed, 27 Jan 2010 21:36:42 +0000
Subject: [PATCH] Initial commit of v2.0 of the cleaner.  DO NOT USE. (this
 means you, Chris)

Cleaned up SW code and started moving over everything to use byte[] instead of String or char[].

Added a wrapper class for SAMFileWriter that allows for adding reads out of order.

Not even close to done, but I need to commit now to sync up with Andrey.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2712 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../gatk/walkers/indels/IndelRealigner.java   | 1037 +++++++++++++++++
 .../walkers/indels/IntervalCleanerWalker.java |    7 +-
 .../walkers/indels/SortingSAMFileWriter.java  |  118 ++
 .../sting/utils/AlignmentUtils.java           |   17 +-
 .../sting/utils/SWPairwiseAlignment.java      |  597 +---------
 5 files changed, 1189 insertions(+), 587 deletions(-)
 create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
 create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java

diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
new file mode 100755
index 000000000..953ec070a
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@@ -0,0 +1,1037 @@
+package org.broadinstitute.sting.gatk.walkers.indels;
+
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+
+import net.sf.samtools.*;
+import net.sf.samtools.util.StringUtil;
+
+import java.util.*;
+import java.io.File;
+import java.io.FileWriter;
+
+/**
+ * Performs local realignment of reads based on misalignments due to the presence of indels.
+ * Unlike most mappers, this walker uses the full alignment context to determine whether an
+ * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly.
+ */
+public class IndelRealigner extends ReadWalker<Integer, Integer> {
+
+    @Argument(fullName="intervals", shortName="intervals", doc="intervals file output from RealignerTargetCreator", required=true)
+    protected String intervalsFile = null;
+
+    @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
+    protected double LOD_THRESHOLD = 5.0;
+    @Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
+    protected double MISMATCH_THRESHOLD = 0.15;
+
+    @Argument(fullName="OutputBam", shortName="O", required=false, doc="Output bam")
+    protected SAMFileWriter baseWriter = null;
+
+
+    @Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found")
+    String OUT_INDELS = null;
+    @Argument(fullName="OutputCleanedReadsOnly", shortName="cleanedOnly", doc="print out cleaned reads only (otherwise, all reads within the intervals)", required=false)
+    boolean cleanedReadsOnly = false;
+    @Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false)
+    String OUT_STATS = null;
+    @Argument(fullName="SNPsFile", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out", required=false)
+    String OUT_SNPS = null;
+    @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false)
+    int MAX_CONSENSUSES = 30;
+    @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
+    int MAX_READS_FOR_CONSENSUSES = 120;
+
+    @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false)
+    int MAX_READS = 20000;
+
+
+    // the intervals input by the user
+    private Iterator<GenomeLoc> intervals = null;
+
+    // the current interval in the list
+    private GenomeLoc currentInterval = null;
+
+    // the reads that fall into the current interval
+    private ReadBin readsToClean = new ReadBin();
+
+    // the wrapper around the SAM writer
+    private SortingSAMFileWriter writer = null;
+
+    // random number generator
+    private Random generator;
+
+    public static final int MAX_QUAL = 99;
+    public static final long RANDOM_SEED = 1252863495;
+
+    // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned
+    private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75;
+
+    private static final double SW_MATCH = 30.0;      // 1.0;
+    private static final double SW_MISMATCH = -10.0;  //-1.0/3.0;
+    private static final double SW_GAP = -10.0;       //-1.0-1.0/3.0;
+    private static final double SW_GAP_EXTEND = -2.0; //-1.0/.0;
+
+    // other output files
+    private FileWriter indelOutput = null;
+    private FileWriter statsOutput = null;
+    private FileWriter snpsOutput = null;
+
+    public void initialize() {
+
+        if ( LOD_THRESHOLD < 0.0 )
+            throw new RuntimeException("LOD threshold cannot be a negative number");
+        if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 )
+            throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
+
+        List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile));
+        intervals = GenomeLocSortedSet.createSetFromList(locs).iterator();
+        currentInterval = intervals.hasNext() ? intervals.next() : null;
+
+        if ( baseWriter != null )
+            writer = new SortingSAMFileWriter(baseWriter, 50);
+
+        generator = new Random(RANDOM_SEED);
+
+        if ( OUT_INDELS != null ) {
+            try {
+                indelOutput = new FileWriter(new File(OUT_INDELS));
+            } catch (Exception e) {
+                logger.warn("Failed to create output file "+ OUT_INDELS+". Indel output will be suppressed");
+                err.println(e.getMessage());
+                indelOutput = null;
+            }
+        }
+        if ( OUT_STATS != null ) {
+            try {
+                statsOutput = new FileWriter(new File(OUT_STATS));
+            } catch (Exception e) {
+                logger.warn("Failed to create output file "+ OUT_STATS+". Cleaning stats output will be suppressed");
+                err.println(e.getMessage());
+                statsOutput = null;
+            }
+        }
+        if ( OUT_SNPS != null ) {
+            try {
+                snpsOutput = new FileWriter(new File(OUT_SNPS));
+            } catch (Exception e) {
+                logger.warn("Failed to create output file "+ OUT_SNPS+". Cleaning snps output will be suppressed");
+                err.println(e.getMessage());
+                snpsOutput = null;
+            }
+        }
+    }
+
+    private void emit(SAMRecord read) {
+        if ( writer != null )
+            writer.addAlignment(read);
+    }
+
+    private void emit(List<SAMRecord> reads) {
+        if ( writer != null )
+            writer.addAlignments(reads);
+    }
+
+    public Integer map(char[] ref, SAMRecord read) {
+        if ( currentInterval == null ) {
+            emit(read);
+            return 0;
+        }
+
+        GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
+
+        if ( readLoc.isBefore(currentInterval) ) {
+            emit(read);
+            return 0;
+        }
+        else if ( readLoc.overlapsP(currentInterval) ) {
+            if ( read.getReadUnmappedFlag() ||
+                 read.getDuplicateReadFlag() ||
+                 read.getNotPrimaryAlignmentFlag() ||
+                 read.getMappingQuality() == 0 ||
+                 read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
+		         Utils.is454Read(read) ) {
+                emit(read);
+            }
+
+            readsToClean.add(read, ref);
+            if ( readsToClean.size() >= MAX_READS ) {
+                emit(readsToClean.getReads());
+                readsToClean.clear();
+                currentInterval = intervals.hasNext() ? intervals.next() : null;
+            }
+        }
+        else {  // the read is past the current interval
+            clean(readsToClean);
+            emit(readsToClean.getReads());
+            readsToClean.clear();
+            currentInterval = intervals.hasNext() ? intervals.next() : null;
+
+            // call back into map now that the state has been updated
+            map(ref, read);
+        }
+
+        return 0;
+    }
+
+    public Integer reduceInit() {
+        return 0;
+    }
+
+    public Integer reduce(Integer value, Integer sum) {
+        return sum + value;
+    }
+
+    public void onTraversalDone(Integer result) {
+        if ( readsToClean.size() > 0 ) {
+            clean(readsToClean);
+            emit(readsToClean.getReads());
+        }
+
+        writer.close();
+
+        if ( OUT_INDELS != null ) {
+            try {
+                indelOutput.close();
+            } catch (Exception e) {
+                logger.error("Failed to close "+OUT_INDELS+" gracefully. Data may be corrupt.");
+            }
+        }
+        if ( OUT_STATS != null ) {
+            try {
+                statsOutput.close();
+            } catch (Exception e) {
+                logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt.");
+            }
+        }
+        if ( OUT_SNPS != null ) {
+            try {
+                snpsOutput.close();
+            } catch (Exception e) {
+                logger.error("Failed to close "+OUT_SNPS+" gracefully. Data may be corrupt.");
+            }
+        }
+    }
+
+
+    private static int mismatchQualitySumIgnoreCigar(AlignedRead aRead, byte[] refSeq, int refIndex) {
+        byte[] readSeq = aRead.getRead().getReadBases();
+        byte[] quals = aRead.getRead().getBaseQualities();
+        int sum = 0;
+        for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) {
+            if ( refIndex >= refSeq.length )
+                sum += MAX_QUAL;
+            else {
+                byte refChr = refSeq[refIndex];
+                byte readChr = readSeq[readIndex];
+                if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
+                     BaseUtils.simpleBaseToBaseIndex(refChr)  == -1 )
+                    continue; // do not count Ns/Xs/etc ?
+                if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
+                    sum += (int)quals[readIndex];
+            }
+        }
+        return sum;
+    }
+
+    private static boolean readIsClipped(SAMRecord read) {
+        final Cigar c = read.getCigar();
+        final int n = c.numCigarElements();
+        if ( c.getCigarElement(n-1).getOperator() == CigarOperator.S ||
+             c.getCigarElement(0).getOperator() == CigarOperator.S) return true;
+        return false;
+    }
+
+    private void clean(ReadBin readsToClean) {
+
+        List<SAMRecord> reads = readsToClean.getReads();
+        byte[] reference = readsToClean.getRereference();
+        long leftmostIndex = readsToClean.getLocation().getStart();
+
+        ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();                   // reads that perfectly match ref
+        ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();               // reads that don't perfectly match
+        LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>();  // should we try to make an alt consensus from the read?
+        ArrayList<AlignedRead> leftMovedIndels = new ArrayList<AlignedRead>();
+        Set<Consensus> altConsenses = new LinkedHashSet<Consensus>();                   // list of alt consenses
+        int totalMismatchSum = 0;
+
+
+        // decide which reads potentially need to be cleaned
+        for ( SAMRecord read : reads ) {
+
+            //            if ( debugOn ) {
+            //                System.out.println(read.getReadName()+" "+read.getCigarString()+" "+read.getAlignmentStart()+"-"+read.getAlignmentEnd());
+            //                System.out.println(reference.substring((int)(read.getAlignmentStart()-leftmostIndex),(int)(read.getAlignmentEnd()-leftmostIndex)));
+            //                System.out.println(read.getReadString());
+            //            }
+
+            // we currently can not deal with clipped reads correctly (or screwy record)
+            if ( read.getCigar().numCigarElements() == 0 || readIsClipped(read) ) {
+                refReads.add(read);
+                continue;
+            }
+
+            AlignedRead aRead = new AlignedRead(read);
+
+            // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
+            int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
+            if ( numBlocks == 2 ) {
+                Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
+                if ( aRead.setCigar(newCigar) ) {
+                    leftMovedIndels.add(aRead);
+                }
+            }
+
+            int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
+            //            if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
+
+            // if this doesn't match perfectly to the reference, let's try to clean it
+            if ( mismatchScore > 0 ) {
+                altReads.add(aRead);
+                if ( !read.getDuplicateReadFlag() )
+                    totalMismatchSum += mismatchScore;
+                aRead.setMismatchScoreToReference(mismatchScore);
+                // if it has an indel, let's see if that's the best consensus
+                if ( numBlocks == 2 )  {
+                    Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases());
+                    if ( c==null) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
+                    else altConsenses.add(c);
+                }
+                else {
+                    //                    if ( debugOn ) System.out.println("Going to test...");
+                    altAlignmentsToTest.add(aRead);
+                }
+            }
+            // otherwise, we can emit it as is
+            else {
+                // if ( debugOn ) System.out.println("Emitting as is...");
+                refReads.add(read);
+            }
+        }
+
+        // choose alternate consensuses
+        if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) {
+            for ( AlignedRead aRead : altAlignmentsToTest ) {
+                // do a pairwise alignment against the reference
+                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
+                Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
+                if ( c != null) {
+                    //                    if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ;
+                    altConsenses.add(c);
+                } else {
+                    //   if ( debugOn ) System.out.println("FAILED to create Alt consensus from SW");
+                }
+            }
+        } else {
+            // choose alternate consenses randomly
+            int readsSeen = 0;
+            while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsenses.size() <= MAX_CONSENSUSES) {
+                int index = generator.nextInt(altAlignmentsToTest.size());
+                AlignedRead aRead = altAlignmentsToTest.remove(index);
+                // do a pairwise alignment against the reference
+                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
+                Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
+                if ( c != null)
+                    altConsenses.add(c);
+            }
+        }
+
+        Consensus bestConsensus = null;
+        Iterator<Consensus> iter = altConsenses.iterator();
+
+        // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n");
+
+        while ( iter.hasNext() ) {
+            Consensus consensus = iter.next();
+
+            // if ( debugOn ) System.out.println("Consensus: "+consensus.str);
+
+            for ( int j = 0; j < altReads.size(); j++ ) {
+                AlignedRead toTest = altReads.get(j);
+                Pair<Integer, Integer> altAlignment = findBestOffset(consensus.str, toTest);
+
+                // the mismatch score is the min of its alignment vs. the reference and vs. the alternate
+                int myScore = altAlignment.second;
+                if ( myScore >= toTest.getMismatchScoreToReference() )
+                    myScore = toTest.getMismatchScoreToReference();
+                // keep track of reads that align better to the alternate consensus.
+                // By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het)
+                else
+                    consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
+
+                //logger.debug(consensus.str +  " vs. " + toTest.getRead().getReadString() + " => " + myScore + " - " + altAlignment.first);
+                if ( !toTest.getRead().getDuplicateReadFlag() )
+                    consensus.mismatchSum += myScore;
+            }
+
+            //logger.debug(consensus.str +  " " + consensus.mismatchSum);
+            if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
+                bestConsensus = consensus;
+                //logger.debug(consensus.str +  " " + consensus.mismatchSum);
+            }
+        }
+
+        // if the best alternate consensus has a smaller sum of quality score mismatches (more than
+        // the LOD threshold), and it didn't just move around the mismatching columns, then clean!
+        double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0);
+        if ( improvement >= LOD_THRESHOLD ) {
+
+            bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
+
+           // start cleaning the appropriate reads
+            for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
+                AlignedRead aRead = altReads.get(indexPair.first);
+                updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, (int)leftmostIndex);
+            }
+            if ( !alternateReducesEntropy(altReads, reference, leftmostIndex) ) {
+                if ( statsOutput != null ) {
+                    try {
+                        statsOutput.write(readsToClean.getLocation().toString());
+                        statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist)
+                        statsOutput.write(Double.toString(improvement));
+                        statsOutput.write("\n");
+                        statsOutput.flush();
+                    } catch (Exception e) {}
+                }
+            } else {
+                //logger.debug("CLEAN: " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " " + bestConsensus.str );
+                if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
+                    // NOTE: indels are printed out in the format specified for the low-coverage pilot1
+                    //  indel calls (tab-delimited): chr position size type sequence
+                    StringBuilder str = new StringBuilder();
+                    str.append(reads.get(0).getReferenceName());
+                    int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength();
+                    str.append("\t" + (leftmostIndex + position - 1));
+                    CigarElement ce = bestConsensus.cigar.getCigarElement(1);
+                    str.append("\t" + ce.getLength() + "\t" + ce.getOperator() + "\t");
+                    int length = ce.getLength();
+                    if ( ce.getOperator() == CigarOperator.D ) {
+                        for ( int i = 0; i < length; i++)
+                            str.append(reference[position+i]);
+                    } else {
+                        for ( int i = 0; i < length; i++)
+                            str.append(bestConsensus.str[position+i]);
+                    }
+                    str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
+                    try {
+                        indelOutput.write(str.toString());
+                        indelOutput.flush();
+                    } catch (Exception e) {}
+                }
+                if ( statsOutput != null ) {
+                    try {
+                        statsOutput.write(readsToClean.getLocation().toString());
+                        statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced
+                        if ( bestConsensus.cigar.numCigarElements() > 1 )
+                            statsOutput.write(" (found indel)");
+                        statsOutput.write("\t");
+                        statsOutput.write(Double.toString(improvement));
+                        statsOutput.write("\n");
+                        statsOutput.flush();
+                    } catch (Exception e) {}
+                }
+
+                // We need to update the mapping quality score of the cleaned reads;
+                // however we don't have enough info to use the proper MAQ scoring system.
+                // For now, we'll use a heuristic:
+                // the mapping quality score is improved by the LOD difference in mismatching
+                // bases between the reference and alternate consensus (divided by 10)
+
+                // finish cleaning the appropriate reads
+                for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
+                    AlignedRead aRead = altReads.get(indexPair.first);
+                    if ( aRead.finalizeUpdate() ) {
+                        aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)(improvement/10.0), 255));
+                        aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
+                    }
+                }
+            }
+
+            // END IF ( improvement >= LOD_THRESHOLD )
+
+        } else if ( statsOutput != null ) {
+            try {
+                statsOutput.write(readsToClean.getLocation().toString());
+                statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
+                statsOutput.write(Double.toString(improvement));
+                statsOutput.write("\n");
+                statsOutput.flush();
+            } catch (Exception e) {}
+        }
+    }
+
+    private Consensus createAlternateConsensus(int indexOnRef, Cigar c, byte[] reference, byte[] readStr) {
+        if ( indexOnRef < 0 )
+            return null;
+
+        // create the new consensus
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < indexOnRef; i++)
+            sb.append(reference[i]);        
+        //logger.debug("CIGAR = " + AlignmentUtils.cigarToString(c));
+
+        int indelCount = 0;
+        int altIdx = 0;
+        int refIdx = indexOnRef;
+        boolean ok_flag = true;
+        for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+            CigarElement ce = c.getCigarElement(i);
+            int elementLength = ce.getLength();
+            switch( ce.getOperator() ) {
+            case D:
+                indelCount++;
+                refIdx += elementLength;
+                break;
+            case M:
+                if ( reference.length < refIdx + elementLength )
+                    ok_flag = false;
+                else  {
+                    for (int j = 0; j < elementLength; j++)
+                        sb.append(reference[refIdx+j]);
+                }
+                refIdx += elementLength;
+                altIdx += elementLength;
+                break;
+            case I:
+                for (int j = 0; j < elementLength; j++)
+                    sb.append((char)readStr[altIdx + j]);
+                altIdx += elementLength;
+                indelCount++;
+                break;
+            }
+        }
+        // make sure that there is at most only a single indel and it aligns appropriately!
+        if ( !ok_flag || indelCount != 1 || reference.length < refIdx )
+            return null;
+
+        for (int i = refIdx; i < reference.length; i++)
+            sb.append(reference[i]);
+        byte[] altConsensus =  StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the cuurent read
+
+        // if ( debugOn ) System.out.println("Alt consensus generated: "+altConsensus);
+
+        return new Consensus(altConsensus, c, indexOnRef);
+    }
+
+    private Pair<Integer, Integer> findBestOffset(byte[] ref, AlignedRead read) {
+        int attempts = ref.length - read.getReadLength() + 1;
+        int bestScore = mismatchQualitySumIgnoreCigar(read, ref, 0);
+        int bestIndex = 0;
+        for ( int i = 1; i < attempts; i++ ) {
+            // we can't get better than 0!
+            if ( bestScore == 0 )
+                return new Pair<Integer, Integer>(bestIndex, 0);
+            int score = mismatchQualitySumIgnoreCigar(read, ref, i);
+            if ( score < bestScore ) {
+                bestScore = score;
+                bestIndex = i;
+            }
+        }
+        return new Pair<Integer, Integer>(bestIndex, bestScore);
+    }
+
+
+    private void updateRead(Cigar altCigar, int altPosOnRef, int myPosOnAlt, AlignedRead aRead, int leftmostIndex) {
+        Cigar readCigar = new Cigar();
+
+        // special case: there is no indel
+        if ( altCigar.getCigarElements().size() == 1 ) {
+            aRead.setAlignmentStart(leftmostIndex + myPosOnAlt);
+            readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M));
+            aRead.setCigar(readCigar);
+            return;
+        }
+
+        CigarElement altCE1 = altCigar.getCigarElement(0);
+        CigarElement altCE2 = altCigar.getCigarElement(1);
+
+        // the easiest thing to do is to take each case separately
+        int endOfFirstBlock = altPosOnRef + altCE1.getLength();
+        boolean sawAlignmentStart = false;
+
+        // for reads starting before the indel
+        if ( myPosOnAlt < endOfFirstBlock) {
+            aRead.setAlignmentStart(leftmostIndex + myPosOnAlt);
+            sawAlignmentStart = true;
+
+            // for reads ending before the indel
+            if ( myPosOnAlt + aRead.getReadLength() <= endOfFirstBlock) {
+                readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M));
+                aRead.setCigar(readCigar);
+                return;
+            }
+            readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M));
+        }
+
+        int indelOffsetOnRef = 0, indelOffsetOnRead = 0;
+        // forward along the indel
+        if ( altCE2.getOperator() == CigarOperator.I ) {
+            // for reads that end in an insertion
+            if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + altCE2.getLength() ) {
+                readCigar.add(new CigarElement(myPosOnAlt + aRead.getReadLength() - endOfFirstBlock, CigarOperator.I));
+                aRead.setCigar(readCigar);
+                return;
+            }
+
+            // for reads that start in an insertion
+            if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + altCE2.getLength() ) {
+                aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock);
+                readCigar.add(new CigarElement(altCE2.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I));
+                indelOffsetOnRead = myPosOnAlt - endOfFirstBlock;
+                sawAlignmentStart = true;
+            } else if ( sawAlignmentStart ) {
+                readCigar.add(altCE2);
+                indelOffsetOnRead = altCE2.getLength();
+            }
+        } else if ( altCE2.getOperator() == CigarOperator.D ) {
+            if ( sawAlignmentStart )
+                readCigar.add(altCE2);
+            indelOffsetOnRef = altCE2.getLength();
+        } else {
+            throw new RuntimeException("Operator of middle block is not I or D: " + altCE2.getOperator());
+        }
+
+        // for reads that start after the indel
+        if ( !sawAlignmentStart ) {
+            aRead.setAlignmentStart(leftmostIndex + myPosOnAlt + indelOffsetOnRef - indelOffsetOnRead);
+            readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M));
+            aRead.setCigar(readCigar);
+            return;
+        }
+
+        int readRemaining = aRead.getReadLength();
+        for ( CigarElement ce : readCigar.getCigarElements() ) {
+            if ( ce.getOperator() != CigarOperator.D )
+                readRemaining -= ce.getLength();
+        }
+        if ( readRemaining > 0 )
+            readCigar.add(new CigarElement(readRemaining, CigarOperator.M));
+        aRead.setCigar(readCigar);
+    }
+
+    private boolean alternateReducesEntropy(List<AlignedRead> reads, byte[] reference, long leftmostIndex) {
+        int[] originalMismatchBases = new int[reference.length];
+        int[] cleanedMismatchBases = new int[reference.length];
+        int[] totalOriginalBases = new int[reference.length];
+        int[] totalCleanedBases = new int[reference.length];
+
+        // set to 1 to prevent dividing by zero
+        for ( int i=0; i < reference.length; i++ )
+            originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 1;
+
+        for (int i=0; i < reads.size(); i++) {
+            AlignedRead read = reads.get(i);
+            if ( read.getRead().getAlignmentBlocks().size() > 1 )
+                 continue;
+
+            int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex;
+            byte[] readStr = read.getRead().getReadBases();
+            byte[] quals = read.getRead().getBaseQualities();
+
+            for (int j=0; j < readStr.length; j++, refIdx++ ) {
+                if ( refIdx < 0 || refIdx >= reference.length ) {
+                    //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() );
+                    //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() );
+                    break;
+                }
+                totalOriginalBases[refIdx] += quals[j];
+                if ( readStr[j] != reference[refIdx] )
+                    originalMismatchBases[refIdx] += quals[j];
+            }
+
+            // reset and now do the calculation based on the cleaning
+            refIdx = read.getAlignmentStart() - (int)leftmostIndex;
+            int altIdx = 0;
+            Cigar c = read.getCigar();
+            for (int j = 0 ; j < c.numCigarElements() ; j++) {
+                CigarElement ce = c.getCigarElement(j);
+                int elementLength = ce.getLength();
+                switch ( ce.getOperator() ) {
+                    case M:
+                        for (int k = 0 ; k < elementLength ; k++, refIdx++, altIdx++ ) {
+                            if ( refIdx >= reference.length )
+                                break;
+                            totalCleanedBases[refIdx] += quals[altIdx];
+                            if ( readStr[altIdx] != reference[refIdx] )
+                                cleanedMismatchBases[refIdx] += quals[altIdx];
+                        }
+                        break;
+                    case I:
+                        altIdx += elementLength;
+                        break;
+                    case D:
+                        refIdx += elementLength;
+                        break;
+                }
+
+            }
+        }
+
+        int originalMismatchColumns = 0, cleanedMismatchColumns = 0;
+        StringBuilder sb = new StringBuilder();
+        for ( int i=0; i < reference.length; i++ ) {
+            if ( cleanedMismatchBases[i] == originalMismatchBases[i] )
+                continue;
+            boolean didMismatch = false, stillMismatches = false;
+            if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD )  {
+                didMismatch = true;
+                originalMismatchColumns++;
+                if ( (cleanedMismatchBases[i] / totalCleanedBases[i]) > (originalMismatchBases[i] / totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) {
+                    stillMismatches = true;
+                    cleanedMismatchColumns++;
+                }
+            } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) {
+                cleanedMismatchColumns++;
+            }
+            if ( snpsOutput != null ) {
+                    if ( didMismatch ) {
+                        sb.append(reads.get(0).getRead().getReferenceName() + ":");
+                        sb.append(((int)leftmostIndex + i));
+                        if ( stillMismatches )
+                            sb.append(" SAME_SNP\n");
+                        else
+                            sb.append(" NOT_SNP\n");
+                    }
+            }
+        }
+
+        //logger.debug("Original mismatch columns = " + originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns);
+
+        boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns);
+        if ( reduces && snpsOutput != null ) {
+            try {
+                snpsOutput.write(sb.toString());
+                snpsOutput.flush();
+            } catch (Exception e) {}
+        }
+        return reduces;
+    }
+
+    /** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
+     * starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
+     * The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
+     * <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
+     * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
+     * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
+     * always the positions where the cigar starts on the ref and on the read, respectively.
+     *
+     * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
+     * specifies that (any) one AT  is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
+     * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
+     * is not repeated), the original cigar is returned.
+     * @param cigar structure of the original alignment
+     * @param refSeq reference sequence the read is aligned to
+     * @param readSeq read sequence
+     * @param refIndex 0-based alignment start position on ref
+     * @param readIndex 0-based alignment start position on read
+     * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
+     */
+    private Cigar indelRealignment(Cigar cigar, byte[] refSeq, byte[] readSeq, int refIndex, int readIndex) {
+        if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do
+
+        CigarElement ce1 = cigar.getCigarElement(0);
+        CigarElement ce2 = cigar.getCigarElement(1);
+
+        // we currently can not handle clipped reads; alternatively, if the alignment starts from insertion, there
+        // is no place on the read to move that insertion further left; so we are done:
+        if ( ce1.getOperator() != CigarOperator.M ) return cigar;
+
+        int difference = 0; // we can move indel 'difference' bases left
+        final int indel_length = ce2.getLength();
+
+        String indelString; // inserted or deleted sequence
+        int period = 0; // period of the inserted/deleted sequence
+        int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion)
+        int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion)
+
+        if ( ce2.getOperator() == CigarOperator.D )
+            indelString = new String(refSeq, indelIndexOnRef, ce2.getLength()).toUpperCase(); // deleted bases
+        else if ( ce2.getOperator() == CigarOperator.I )
+            indelString = new String(readSeq, indelIndexOnRead, ce2.getLength()).toUpperCase(); // get the inserted bases
+        else
+            // we can get here if there is soft clipping done at the beginning of the read
+            // for now, we'll just punt the issue and not try to realign these
+            return cigar;
+
+        // now we have to check all WHOLE periods of the indel sequence:
+        //  for instance, if
+        //   REF:   AGCTATATATAGCC
+        //   READ:   GCTAT***TAGCC
+        // the deleted sequence ATA does have period of 2, but deletion obviously can not be
+        // shifted left by 2 bases (length 3 does not contain whole number of periods of 2);
+        // however if 4 bases are deleted:
+        //   REF:   AGCTATATATAGCC
+        //   READ:   GCTA****TAGCC
+        // the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases!
+        //  Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest
+        // period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check
+        // any other periods.
+
+        // NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence
+        // repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion
+        // position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat
+        // on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach.
+
+        while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength
+
+                period = BaseUtils.sequencePeriod(indelString, period+1);
+
+                if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work
+
+                int newIndex = indelIndexOnRef;
+
+                while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted
+
+                    // lets check if bases [newIndex-period,newIndex) immediately preceding the indel on the ref
+                    // are the same as the currently checked period of the inserted sequence:
+
+                    boolean match = true;
+
+                    for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) {
+                        char indelChr = indelString.charAt(indelPos);
+                        if ( Character.toUpperCase(refSeq[testRefPos]) != indelChr || BaseUtils.simpleBaseToBaseIndex(indelChr) == -1 ) {
+                            match = false;
+                            break;
+                        }
+                    }
+                    if ( match )
+                        newIndex -= period; // yes, they are the same, we can move indel farther left by at least period bases, go check if we can do more...
+                    else break; // oops, no match, can not push indel farther left
+                }
+
+                final int newDifference = indelIndexOnRef - newIndex;
+                if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left
+
+                if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already
+                                          // got maximum possible shift after checking period=1 above.
+        }
+
+        //        if ( ce2.getLength() >= 2 )
+        //            System.out.println("-----------------------------------\n  FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0)));
+
+
+        if ( difference > 0 ) {
+
+            // The following if() statement: this should've never happened, unless the alignment is really screwed up.
+            // A real life example:
+            //
+            //   ref:    TTTTTTTTTTTTTTTTTT******TTTTTACTTATAGAAGAAAT...
+            //  read:       GTCTTTTTTTTTTTTTTTTTTTTTTTACTTATAGAAGAAAT...
+            //
+            //  i.e. the alignment claims 6 T's to be inserted. The alignment is clearly malformed/non-conforming since we could
+            // have just 3 T's inserted (so that the beginning of the read maps right onto the beginning of the
+            // reference fragment shown): that would leave us with same 2 mismatches at the beginning of the read
+            // (G and C) but lower gap penalty. Note that this has nothing to do with the alignment being "right" or "wrong"
+            // with respect to where on the DNA the read actually came from. It is the assumptions of *how* the alignments are
+            // built and represented that are broken here. While it is unclear how the alignment shown above could be generated
+            // in the first place, we are not in the business of fixing incorrect alignments in this method; all we are
+            // trying to do is to left-adjust correct ones. So if something like that happens, we refuse to change the cigar
+            // and bail out.
+            if ( ce1.getLength()-difference < 0 ) return cigar;
+
+            Cigar newCigar = new Cigar();
+            // do not add leading M cigar element if its length is zero (i.e. if we managed to left-shift the
+            // insertion all the way to the read start):
+            if ( ce1.getLength() - difference > 0 )
+                newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
+            newCigar.add(ce2);  // add the indel, now it's left shifted since we decreased the number of preceding matching bases
+
+            if ( cigar.numCigarElements() > 2 ) {
+                // if we got something following the indel element:
+
+                if ( cigar.getCigarElement(2).getOperator() == CigarOperator.M  ) {
+                    // if indel was followed by matching bases (that's the most common situation),
+                    // increase the length of the matching section after the indel by the amount of left shift
+                    // (matching bases that were on the left are now *after* the indel; we have also checked at the beginning
+                    // that the first cigar element was also M):
+                    newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M));
+                } else {
+                    // if the element after the indel was not M, we have to add just the matching bases that were on the left
+                    // and now appear after the indel after we performed the shift. Then add the original element that followed the indel.
+                    newCigar.add(new CigarElement(difference, CigarOperator.M));
+                    newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength(),cigar.getCigarElement(2).getOperator()));
+                }
+                // now add remaining (unchanged) cigar elements, if any:
+                for ( int i = 3 ; i < cigar.numCigarElements() ; i++ )  {
+                    newCigar.add(new CigarElement(cigar.getCigarElement(i).getLength(),cigar.getCigarElement(i).getOperator()));
+                }
+            }
+
+            //logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar));
+            cigar = newCigar;
+
+        }
+        return cigar;
+    }
+
+    private class AlignedRead {
+        private SAMRecord read;
+        private Cigar newCigar = null;
+        private int newStart = -1;
+        private int mismatchScoreToReference;
+        private boolean updated = false;
+
+        public AlignedRead(SAMRecord read) {
+            this.read = read;
+            mismatchScoreToReference = 0;
+        }
+
+        public SAMRecord getRead() {
+               return read;
+        }
+
+        public int getReadLength() {
+               return read.getReadLength();
+        }
+
+        public Cigar getCigar() {
+            return (newCigar != null ? newCigar : read.getCigar());
+        }
+
+        // tentatively sets the new Cigar, but it needs to be confirmed later
+        // returns true if the new cigar is a valid change (i.e. not same as original and doesn't remove indel)
+        public boolean setCigar(Cigar cigar) {
+            if ( getCigar().equals(cigar) )
+                return false;
+
+            String str = AlignmentUtils.cigarToString(cigar);
+            if ( !str.contains("D") && !str.contains("I") )
+                return false;
+
+            newCigar = cigar;
+            return true;
+        }
+
+        // tentatively sets the new start, but it needs to be confirmed later
+        public void setAlignmentStart(int start) {
+            newStart = start;
+        }
+
+        public int getAlignmentStart() {
+            return (newStart != -1 ? newStart : read.getAlignmentStart());
+        }
+
+        public int getOriginalAlignmentStart() {
+            return read.getAlignmentStart();
+        }
+
+        // finalizes the changes made.
+        // returns true if this record actually changes, false otherwise
+        public boolean finalizeUpdate() {
+            // if we haven't made any changes, don't do anything
+            if ( newCigar == null )
+                return false;
+            if ( newStart == -1 )
+                newStart = read.getAlignmentStart();
+
+            // if it's a paired end read, we need to update the insert size
+            if ( read.getReadPairedFlag() ) {
+                int insertSize = read.getInferredInsertSize();
+                if ( insertSize > 0 ) {
+                    read.setCigar(newCigar);
+                    read.setInferredInsertSize(insertSize + read.getAlignmentStart() - newStart);
+                    read.setAlignmentStart(newStart);
+                } else {
+                    // note that the correct order of actions is crucial here
+                    int oldEnd = read.getAlignmentEnd();
+                    read.setCigar(newCigar);
+                    read.setAlignmentStart(newStart);
+                    read.setInferredInsertSize(insertSize + oldEnd - read.getAlignmentEnd());
+                }
+            } else {
+                read.setCigar(newCigar);
+                read.setAlignmentStart(newStart);
+            }
+            updated = true;
+            return true;
+        }
+
+        public boolean wasUpdated() {
+            return updated;
+        }
+
+        public void setMismatchScoreToReference(int score) {
+            mismatchScoreToReference = score;
+        }
+
+        public int getMismatchScoreToReference() {
+            return mismatchScoreToReference;
+        }
+    }
+
+    private class Consensus {
+        public byte[] str;
+        public int mismatchSum;
+        public int positionOnReference;
+        public Cigar cigar;
+        public ArrayList<Pair<Integer, Integer>> readIndexes;
+
+        public Consensus(byte[] str, Cigar cigar, int positionOnReference) {
+            this.str = str;
+            this.cigar = cigar;
+            this.positionOnReference = positionOnReference;
+            mismatchSum = 0;
+            readIndexes = new ArrayList<Pair<Integer, Integer>>();
+        }
+
+        public boolean equals(Object o) {
+            return ( this == o || (o instanceof Consensus && this.str.equals(((Consensus)o).str)) );
+        }
+
+        public boolean equals(Consensus c) {
+            return ( this == c || this.str.equals(c.str) );
+        }
+    }
+
+    private class ReadBin {
+
+        private ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
+        private char[] reference = null;
+        private GenomeLoc loc = null;
+
+        public ReadBin() { }
+
+        public void add(SAMRecord read, char[] ref) {
+            reads.add(read);
+
+            // set up the reference
+            if ( reference == null ) {
+                reference = ref;
+                loc = GenomeLocParser.createGenomeLoc(read);
+            } else {
+                long lastPosWithRefBase = loc.getStart() + reference.length -1;
+                int neededBases = (int)(read.getAlignmentEnd() - lastPosWithRefBase);
+                char[] newReference = new char[reference.length + neededBases];
+                System.arraycopy(reference, 0, newReference, 0, reference.length);
+                System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases);
+                reference = newReference;
+            }
+        }
+
+        public List<SAMRecord> getReads() { return reads; }
+
+        public byte[] getRereference() {
+            // upper case it
+            for ( int i = 0; i < reference.length; i++ )
+                reference[i] = Character.toUpperCase(reference[i]);
+
+            // convert it to a byte array
+            byte[] refArray = new byte[reference.length];
+            StringUtil.charsToBytes(reference, 0, reference.length, refArray, 0);
+            return refArray;
+        }
+
+        public GenomeLoc getLocation() { return loc; }
+
+        public int size() { return reads.size(); }
+
+        public void clear() {
+            reads.clear();
+            reference = null;
+            loc = null;
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java
index 96159f9b6..0b63c5bb1 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java
@@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
 import org.broadinstitute.sting.utils.cmdLine.Argument;
 
 import net.sf.samtools.*;
+import net.sf.samtools.util.StringUtil;
 
 import java.util.*;
 import java.io.File;
@@ -279,7 +280,7 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
         if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) {
             for ( AlignedRead aRead : altAlignmentsToTest ) {
                 // do a pairwise alignment against the reference
-                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
+                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(StringUtil.stringToBytes(reference), aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
                 Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
                 if ( c != null) {
                     //                    if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ; 
@@ -295,7 +296,7 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
                 int index = generator.nextInt(altAlignmentsToTest.size());
                 AlignedRead aRead = altAlignmentsToTest.remove(index);
                 // do a pairwise alignment against the reference
-                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
+                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(StringUtil.stringToBytes(reference), aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
                 Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
                 if ( c != null)
                     altConsenses.add(c);
@@ -404,7 +405,7 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
                     AlignedRead aRead = altReads.get(indexPair.first);
                     if ( aRead.finalizeUpdate() ) {
                         aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)(improvement/10.0), 255));
-                        aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
+                        aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), StringUtil.stringToBytes(reference), aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
                     }
                 }
             }
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java
new file mode 100755
index 000000000..72bd87afe
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java
@@ -0,0 +1,118 @@
+package org.broadinstitute.sting.gatk.walkers.indels;
+
+import net.sf.samtools.*;
+
+import java.util.TreeSet;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * @author ebanks
+ * SortingSAMFileWriter
+ *
+ * this class extends the samtools SAMFileWriter class and caches reads for N loci so that reads
+ * can be emitted out of order (provided they are within the N-locus window)
+ *
+ */
+public class SortingSAMFileWriter implements SAMFileWriter {
+
+    // the base writer from Picard
+    private SAMFileWriter baseWriter;
+
+    // the window over which we agree to accumulate reads
+    private int window;
+
+    // the reads we are accumulating
+    private TreeSet<SAMRecord> cachedReads = new TreeSet<SAMRecord>(new SAMRecordCoordinateComparator());
+
+
+    /**
+     * Constructor
+     *
+     * @param baseWriter   the real SAMFileWriter
+     * @param window       the window over which we agree to store reads
+     */
+    public SortingSAMFileWriter(SAMFileWriter baseWriter, int window) {
+        this.baseWriter = baseWriter;
+        this.window = window;
+    }
+
+    /**
+     * Add a read to the writer for emission
+     *
+     * @param read   the read to emit
+     */
+    public void addAlignment(SAMRecord read) {
+        // at a new contig, clear the cache
+        if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < read.getReferenceIndex() )
+            clearCache();
+
+        long currentPos = read.getAlignmentStart();
+
+        Iterator<SAMRecord> iter = cachedReads.iterator();
+        while ( iter.hasNext() ) {
+            SAMRecord cachedRead = iter.next();
+            if ( currentPos - cachedRead.getAlignmentStart() >= window ) {
+                baseWriter.addAlignment(cachedRead);
+                iter.remove();                
+            } else {
+                break;
+            }
+        }
+
+        cachedReads.add(read);
+    }
+
+    /**
+     * Add a list of reads to the writer for emission; the reads do NOT need to be sorted
+     *
+     * @param reads   the reads to emit
+     */
+    public void addAlignments(List<SAMRecord> reads) {
+        if ( reads.size() == 0 )
+            return;
+
+        // at a new contig, clear the cache
+        if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < reads.get(0).getReferenceIndex() )
+            clearCache();
+
+        cachedReads.addAll(reads);
+
+        // get the last read in the cache
+        SAMRecord last = cachedReads.last();
+
+        long currentPos = last.getAlignmentStart();
+
+        Iterator<SAMRecord> iter = cachedReads.iterator();
+        while ( iter.hasNext() ) {
+            SAMRecord cachedRead = iter.next();
+            if ( currentPos - cachedRead.getAlignmentStart() >= window ) {
+                baseWriter.addAlignment(cachedRead);
+                iter.remove();
+            } else {
+                break;
+            }
+        }
+    }
+
+    /**
+     * get the SAM file header
+     */
+    public SAMFileHeader getFileHeader() {
+        return baseWriter.getFileHeader();
+    }
+
+    /**
+     * close this writer by clearing the cache
+     */
+    public void close() {
+        clearCache();
+    }
+
+    private void clearCache() {
+        Iterator<SAMRecord> iter = cachedReads.iterator();
+        while ( iter.hasNext() )
+            baseWriter.addAlignment(iter.next());
+        cachedReads.clear();
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java
index d8d263b9e..2499f58eb 100644
--- a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java
@@ -4,6 +4,7 @@ import net.sf.samtools.CigarOperator;
 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
+import net.sf.samtools.util.StringUtil;
 import net.sf.picard.reference.ReferenceSequence;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.utils.pileup.*;
@@ -113,11 +114,11 @@ public class AlignmentUtils {
     /** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality
      * for reference sequence specified as conventional java string (of bases). By default, it is assumed that
      * the alignment starts at (1-based) position r.getAlignmentStart() on the reference <code>refSeq</code>.
-     * See {@link #numMismatches(SAMRecord, String, int)} if this is not the case.
+     * See {@link #numMismatches(SAMRecord, byte[], int)} if this is not the case.
      */
     public static int numMismatches(SAMRecord r, String refSeq ) {
         if ( r.getReadUnmappedFlag() ) return 1000000;
-        return numMismatches(r, refSeq, r.getAlignmentStart()-1);
+        return numMismatches(r, StringUtil.stringToBytes(refSeq), r.getAlignmentStart()-1);
      }
 
     /** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
@@ -125,6 +126,8 @@ public class AlignmentUtils {
      * specified reference sequence; in other words, <code>refIndex</code> is used in place of alignment's own
      * getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment <code>r</code>
      * (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected.
+     *
+     * THIS CODE ASSUMES THAT ALL BYTES COME FROM UPPERCASED CHARS.
      * 
      * @param r alignment
      * @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of 
@@ -132,7 +135,7 @@ public class AlignmentUtils {
      * @param refIndex zero-based position, at which the alignment starts on the specified reference string. 
      * @return the number of mismatches
      */
-    public static int numMismatches(SAMRecord r, String refSeq, int refIndex) {
+    public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
         int readIdx = 0;
         int mismatches = 0;
         byte[] readSeq = r.getReadBases();
@@ -142,15 +145,15 @@ public class AlignmentUtils {
             switch ( ce.getOperator() ) {
                 case M:
                     for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
-                        if ( refIndex >= refSeq.length() )
+                        if ( refIndex >= refSeq.length )
                             continue;
-                        char refChr = refSeq.charAt(refIndex);
-                        char readChr = (char)readSeq[readIdx];
+                        byte refChr = refSeq[refIndex];
+                        byte readChr = readSeq[readIdx];
                         // Note: we need to count X/N's as mismatches because that's what SAM requires
                         //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
                         //     BaseUtils.simpleBaseToBaseIndex(refChr)  == -1 )
                         //    continue; // do not count Ns/Xs/etc ?
-                        if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
+                        if ( readChr != refChr )
                             mismatches++;
                     }
                     break;
diff --git a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java
index 2f31b4cc7..69ad52469 100755
--- a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java
+++ b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java
@@ -16,10 +16,6 @@ import java.util.Collections;
  * To change this template use File | Settings | File Templates.
  */
 public class SWPairwiseAlignment {
-    private String s1;
-    private String s2;
-    private int i1;
-    private int i2;
     private int alignment_offset; // offset of s2 w/respect to s1
     private Cigar alignmentCigar;
 
@@ -28,428 +24,48 @@ public class SWPairwiseAlignment {
     private double w_open;
     private double w_extend;
 
-    private int best_mm; // mismatch count
-    private static final int IMPOSSIBLE = 1000000000;
     private static final int MSTATE = 0;
     private static final int ISTATE = 1;
     private static final int DSTATE = 2;
 
-    public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2, double match, double mismatch, double open, double extend ) {
-        s1 = seq1;
-        s2 = seq2;
-        i1 = id1;
-        i2 = id2;
+    // ************************************************************************
+    // ****                    IMPORTANT NOTE:                             ****
+    // ****  This class assumes that all bytes come from UPPERCASED chars! ****
+    // ************************************************************************
+    public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) {
         w_match = match;
         w_mismatch = mismatch;
         w_open = open;
         w_extend = extend;
-        best_mm = IMPOSSIBLE;
-        //next_mm = IMPOSSIBLE;
-        align4(s1,s2);
+        align(seq1,seq2);
     }
 
-    public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2) {
-        this(seq1,seq2,id1,id2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3)
-    }
-
-    /** Initializes the alignment with pair of sequences (that will be immediately aligned) and
-     * sets their external ids to -1. Such un-annotated pairwise alignment can not be added to MultipleAlignment.
-     *
-     */
-    public SWPairwiseAlignment(String seq1, String seq2) {
-        this(seq1,seq2,-1,-1);
-    }
-
-    public SWPairwiseAlignment(String seq1, String seq2, double match, double mismatch, double open, double extend) {
-        this(seq1,seq2,-1,-1,match,mismatch,open, extend);
+    public SWPairwiseAlignment(byte[] seq1, byte[] seq2) {
+        this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3)
     }
 
     public Cigar getCigar() { return alignmentCigar ; }
 
     public int getAlignmentStart2wrt1() { return alignment_offset; }
 
-    public void align(String a, String b) {
-        int n = a.length();
-        int m = b.length();
-        int [][] sw = new int[n+1][m+1];
-
-        // build smith-waterman matrix:
-        for ( int i = 1 ; i < n+1 ; i++ ) {
-            char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos
-            for ( int j = 1 ; j < m+1 ; j++ ) {
-                char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos
-                int step_diag = sw[i-1][j-1] + w(a_base,b_base);
-                int step_down = sw[i-1][j]+w(a_base,'-');
-                int step_right = sw[i][j-1]+w('-',b_base);
-
-                sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
-            }
-        }
-
-//        print(sw,a,b);
-
-        PrimitivePair.Int p = new PrimitivePair.Int();
-        int maxscore = 0;
-        int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
-
-        // look for largest score. we use >= combined with the traversal direction
-        // to ensure that if two scores are equal, the one closer to diagonal gets picked
-        for ( int i = 1 ; i < n+1 ; i++ ) {
-            if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; }
-        }
-        for ( int j = 1 ; j < m+1 ; j++ ) {
-            if ( sw[n][j] > maxscore ||
-                  sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) {
-                p.first = n;
-                p.second = j ;
-                maxscore = sw[n][j];
-                segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment
-            }
-        }
-
-//        System.out.println("\ni="+p.first+"; j="+p.second);
-
-        // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
-
-
-        // we will be placing all insertions and deletions into sequence b, so the state are named w/regard
-        // to that sequence
-
-        int state = MSTATE;
-
-        int [] scores = new int[3];
-
-        List<CigarElement> lce = new ArrayList<CigarElement>(5);
-
-        do {
-            scores[ISTATE] = sw[p.first][p.second-1]; // moving left: same base on a, prev base on b = insertion on b
-            scores[DSTATE] = sw[p.first-1][p.second]; // moving up: same base on b, prev base on a = deletion on b
-            scores[MSTATE] = sw[p.first-1][p.second-1]; // moving diagonal : mathc/mismatch
-
-            int new_state = findMaxInd(scores,MSTATE);
-
-            // move to next best location in the sw matrix:
-            switch( new_state ) {
-                case MSTATE: p.first--; p.second--; break;
-                case ISTATE: p.second--; break;
-                case DSTATE: p.first--; break;
-            }
-
-            // now let's see if the state actually changed:
-            if ( new_state == state ) segment_length++;
-            else {
-                // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
-                CigarOperator o=null;
-                switch(state) {
-                    case MSTATE: o = CigarOperator.M; break;
-                    case ISTATE: o = CigarOperator.I; break;
-                    case DSTATE: o = CigarOperator.D; break;
-                }
-                CigarElement e = new CigarElement(segment_length,o);
-                lce.add(e);
-                segment_length = 1;
-                state = new_state;
-            }
-        } while ( scores[state] != 0 );
-
-        // post-process the last segment we are still keeping
-        CigarOperator o=null;
-
-        switch(state) {
-            case MSTATE: o = CigarOperator.M; break;
-            case ISTATE: o = CigarOperator.I; break;
-            case DSTATE: o = CigarOperator.D; break;
-        }
-
-        alignment_offset = p.first - p.second;
-        segment_length+=p.second;
-        CigarElement e = new CigarElement(segment_length,o);
-        lce.add(e);
-        Collections.reverse(lce);
-        alignmentCigar = new Cigar(lce);
-    }
-
-    /** Allows for separate gap opening end extension penalties, no direct backtracking.
-     *
-     * @param a
-     * @param b
-     */
-    public void align2(String a, String b) {
-        int n = a.length();
-        int m = b.length();
-        double [][] sw = new double[n+1][m+1];
-
-        // build smith-waterman matrix:
-        for ( int i = 1 ; i < n+1 ; i++ ) {
-            char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos
-            for ( int j = 1 ; j < m+1 ; j++ ) {
-                char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos
-                double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
-                double step_down = 0.0 ;
-                for ( int k = 1 ; k < i ; k++ ) step_down = Math.max(step_down,sw[i-k][j]+wk(a_base,'-',k));
-                    
-                double step_right = 0;
-                for ( int k = 1 ; k < j ; k++ ) step_right = Math.max(step_right,sw[i][j-k]+wk('-',b_base,k));
-
-                sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
-            }
-        }
-
-//        print(sw,a,b);
-
-        PrimitivePair.Int p = new PrimitivePair.Int();
-        double maxscore = 0.0;
-        int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
-
-        // look for largest score. we use >= combined with the traversal direction
-        // to ensure that if two scores are equal, the one closer to diagonal gets picked
-        for ( int i = 1 ; i < n+1 ; i++ ) {
-            if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; }
-        }
-        for ( int j = 1 ; j < m+1 ; j++ ) {
-            if ( sw[n][j] > maxscore ||
-                  sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) {
-                p.first = n;
-                p.second = j ;
-                maxscore = sw[n][j];
-                segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment
-            }
-        }
-
-//        System.out.println("\ni="+p.first+"; j="+p.second);
-
-        // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
-
-
-        // we will be placing all insertions and deletions into sequence b, so the state are named w/regard
-        // to that sequence
-
-        int state = MSTATE;
-
-        double [] scores = new double[3];
-
-        List<CigarElement> lce = new ArrayList<CigarElement>(5);
-
-        do {
-            // moving left: same base on a, prev base on b = insertion on b:
-            scores[ISTATE] = sw[p.first][p.second-1] ;
-            scores[DSTATE] = sw[p.first - 1][p.second];
-            scores[MSTATE] = sw[p.first-1][p.second-1]; // moving diagonal : match/mismatch
-
-            //            System.out.println("i = " + p.first + " ; j = " + p.second);
-            //            System.out.println("s(M)="+scores[MSTATE]+"; s(D)="+scores[DSTATE]+"; s(I)=" + scores[ISTATE]);
-            int new_state = findMaxInd(scores,MSTATE);
-
-            // move to next best location in the sw matrix:
-            switch( new_state ) {
-                case MSTATE: p.first--; p.second--; break;
-                case ISTATE: p.second--; break;
-                case DSTATE: p.first--; break;
-            }
-
-            // now let's see if the state actually changed:
-            if ( new_state == state ) segment_length++;
-            else {
-                // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
-                CigarOperator o=null;
-                switch(state) {
-                    case MSTATE: o = CigarOperator.M; break;
-                    case ISTATE: o = CigarOperator.I; break;
-                    case DSTATE: o = CigarOperator.D; break;
-                }
-                CigarElement e = new CigarElement(segment_length,o);
-                lce.add(e);
-                segment_length = 1;
-                state = new_state;
-            }
-        } while ( scores[state] != 0 );
-
-        // post-process the last segment we are still keeping
-        CigarOperator o=null;
-        switch(state) {
-            case MSTATE: o = CigarOperator.M; break;
-            case ISTATE: o = CigarOperator.I; break;
-            case DSTATE: o = CigarOperator.D; break;
-        }
-        alignment_offset = p.first - p.second;
-        segment_length+=p.second;
-        CigarElement e = new CigarElement(segment_length,o);
-        lce.add(e);
-        Collections.reverse(lce);
-        alignmentCigar = new Cigar(lce);
-    }
-
-
-
-
-    /** Allows for separate gap opening and extension penalties, with backtracking.
-     *
-     * @param a
-     * @param b
-     */
-public void align3(String a, String b) {
-        int n = a.length();
-        int m = b.length();
-        double [][] sw = new double[n+1][m+1];
-
-        int [][] btrack = new int[n+1][m+1];
-
-        // build smith-waterman matrix and keep backtrack info:
-        for ( int i = 1 ; i < n+1 ; i++ ) {
-            char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos
-            for ( int j = 1 ; j < m+1 ; j++ ) {
-                char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos
-                double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
-                double step_down = 0.0 ;
-                int kd = 0;
-                for ( int k = 1 ; k < i ; k++ ) {
-                    if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) {
-                        step_down=sw[i-k][j]+wk(a_base,'-',k);
-                        kd = k;
-                    }
-                }
-
-                double step_right = 0;
-                int ki = 0;
-                for ( int k = 1 ; k < j ; k++ ) {
-                    if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) {
-                        step_right=sw[i][j-k]+wk('-',b_base,k);
-                        ki = k;
-                    }
-                }
-
-                if ( step_down > step_right ) {
-                    if ( step_down > step_diag ) {
-                        sw[i][j] = Math.max(0,step_down);
-                        btrack[i][j] = kd; // positive=vertical
-                    }
-                    else {
-                        sw[i][j] = Math.max(0,step_diag);
-                        btrack[i][j] = 0; // 0 = diagonal
-                    }
-                } else {
-                    // step_down < step_right
-                    if ( step_right > step_diag ) {
-                        sw[i][j] = Math.max(0,step_right);
-                        btrack[i][j] = -ki; // negative = horizontal
-                    } else {
-                        sw[i][j] = Math.max(0,step_diag);
-                        btrack[i][j] = 0; // 0 = diagonal
-                    }
-                }
-                sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
-            }
-        }
-
-//        print(sw,a,b);
-
-        PrimitivePair.Int p = new PrimitivePair.Int();
-        double maxscore = 0.0;
-        int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
-
-        // look for largest score. we use >= combined with the traversal direction
-        // to ensure that if two scores are equal, the one closer to diagonal gets picked
-        for ( int i = 1 ; i < n+1 ; i++ ) {
-            if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; }
-        }
-        for ( int j = 1 ; j < m+1 ; j++ ) {
-            if ( sw[n][j] > maxscore ||
-                  sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) {
-                p.first = n;
-                p.second = j ;
-                maxscore = sw[n][j];
-                segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
-            }
-        }
-
-        System.out.println("\ni="+p.first+"; j="+p.second);
-
-        // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
-
-
-        // we will be placing all insertions and deletions into sequence b, so the state are named w/regard
-        // to that sequence
-
-        int state = MSTATE;
-
-        double [] scores = new double[3];
-
-        List<CigarElement> lce = new ArrayList<CigarElement>(5);
-
-        do {
-
-            int btr = btrack[p.first][p.second];
-            int step_left = ( btr < 0 ? -btr : 1);
-            int step_up = ( btr > 0 ? btr : 1 );
-
-            // moving left: same base on a, prev base on b = insertion on b:
-            scores[ISTATE] = sw[p.first][p.second-step_left] ;
-            scores[DSTATE] = sw[p.first - step_up][p.second];
-            scores[MSTATE] = sw[p.first-1][p.second-1]; // moving diagonal : match/mismatch
-
-            //            System.out.println("i = " + p.first + " ; j = " + p.second);
-            //            System.out.println("s(M)="+scores[MSTATE]+"; s(D)="+scores[DSTATE]+"; s(I)=" + scores[ISTATE]);
-            int new_state = findMaxInd(scores,MSTATE);
-
-            int step_length = 1;
-
-            // move to next best location in the sw matrix:
-            switch( new_state ) {
-                case MSTATE: p.first--; p.second--; break;
-                case ISTATE: p.second-=step_left; step_length = step_left; break;
-                case DSTATE: p.first-=step_up; step_length = step_up; break;
-            }
-
-            // now let's see if the state actually changed:
-            if ( new_state == state ) segment_length+=step_length;
-            else {
-                // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
-                CigarOperator o=null;
-                switch(state) {
-                    case MSTATE: o = CigarOperator.M; break;
-                    case ISTATE: o = CigarOperator.I; break;
-                    case DSTATE: o = CigarOperator.D; break;
-                }
-                CigarElement e = new CigarElement(segment_length,o);
-                lce.add(e);
-                segment_length = step_length;
-                state = new_state;
-            }
-        } while ( scores[state] != 0 );
-
-        // post-process the last segment we are still keeping
-        CigarOperator o=null;
-        switch(state) {
-            case MSTATE: o = CigarOperator.M; break;
-            case ISTATE: o = CigarOperator.I; break;
-            case DSTATE: o = CigarOperator.D; break;
-        }
-        alignment_offset = p.first - p.second;
-        segment_length+=p.second;
-        CigarElement e = new CigarElement(segment_length,o);
-        lce.add(e);
-        Collections.reverse(lce);
-        alignmentCigar = new Cigar(lce);
-    }
-
-    public void align4(String a, String b) {
-            int n = a.length();
-            int m = b.length();
+    public void align(final byte[] a, final byte[] b) {
+            int n = a.length;
+            int m = b.length;
             double [][] sw = new double[n+1][m+1];
 
             int [][] btrack = new int[n+1][m+1];
 
             // build smith-waterman matrix and keep backtrack info:
             for ( int i = 1 ; i < n+1 ; i++ ) {
-                char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos
+                byte a_base = a[i-1]; // letter in a at the current pos
                 for ( int j = 1 ; j < m+1 ; j++ ) {
-                    char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos
+                    byte b_base = b[j-1]; // letter in b at the current pos
                     double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
                     double step_down = 0.0 ;
                     int kd = 0;
                     for ( int k = 1 ; k < i ; k++ ) {
-                        if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) {
-                            step_down=sw[i-k][j]+wk(a_base,'-',k);
+                        if ( step_down < sw[i-k][j]+wk(k) ) {
+                            step_down=sw[i-k][j]+wk(k);
                             kd = k;
                         }
                     }
@@ -457,8 +73,8 @@ public void align3(String a, String b) {
                     double step_right = 0;
                     int ki = 0;
                     for ( int k = 1 ; k < j ; k++ ) {
-                        if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) {
-                            step_right=sw[i][j-k]+wk('-',b_base,k);
+                        if ( step_right < sw[i][j-k]+wk(k) ) {
+                            step_right=sw[i][j-k]+wk(k);
                             ki = k;
                         }
                     }
@@ -517,8 +133,6 @@ public void align3(String a, String b) {
 
             int state = MSTATE;
 
-            double [] scores = new double[3];
-
             List<CigarElement> lce = new ArrayList<CigarElement>(5);
 
             do {
@@ -527,7 +141,7 @@ public void align3(String a, String b) {
                 int step_left = ( btr < 0 ? -btr : 1);
                 int step_up = ( btr > 0 ? btr : 1 );
 
-                int new_state = -1;
+                int new_state;
                 if ( btr > 0 ) new_state = DSTATE;
                 else if ( btr < 0 ) new_state = ISTATE;
                 else new_state = MSTATE;
@@ -573,51 +187,13 @@ public void align3(String a, String b) {
             alignmentCigar = new Cigar(lce);
     }
 
-    private int w(char x, char y) {
-        if ( x == y ) return 2; // match
-        if ( x == '-' || y == '-' ) return -1; // gap
-        return -1; // mismatch
-    }
-
-    private double wd ( char x, char y ) {
+    private double wd(byte x, byte y) {
         if ( x== y ) return w_match;
         else return w_mismatch;
     }
 
-    private double wk(char x, char y, int k) {
+    private double wk(int k) {
         return w_open+(k-1)*w_extend; // gap
-        // return -1.0 ; // no extension penalty
-        // return -1.0-Math.log(k+1); // weak extension penalty
-    }
-
-    /** Returns index of the maximum element in array s. If there is a tie, and one of the tied indices is
-     * pref_id, then it will be preferred and returned.
-     * @param s
-     * @param pref_id
-     * @return
-     */
-    private int findMaxInd(int[] s, int pref_id) {
-        int imax = 0;
-        int maxval = s[0];
-        for ( int i = 1; i < s.length ; i++ ) {
-            if ( s[i] > maxval || i == pref_id && Math.abs(s[i] - maxval) < 0.0001 ) {
-                imax = i;
-                maxval = s[i];
-            }
-        }
-        return imax;
-    }
-
-    private int findMaxInd(double[] s, int pref_id) {
-        int imax = 0;
-        double maxval = s[0];
-        for ( int i = 1; i < s.length ; i++ ) {
-            if ( s[i] > maxval + 0.0001 || i == pref_id && Math.abs(s[i] - maxval) < 0.0001 ) {
-                imax = i;
-                maxval = s[i];
-            }
-        }
-        return imax;
     }
 
     private void print(int[][] s) {
@@ -673,137 +249,4 @@ public void align3(String a, String b) {
         }
     }
 
-    public String toString() {
-        StringBuilder bmm = new StringBuilder();
-        StringBuilder b1 = new StringBuilder();
-        StringBuilder b2 = new StringBuilder();
-
-        int pos1 = 0;
-        int pos2 = 0;
-        if ( alignment_offset < 0 ) {
-            for (  ; pos2 < -alignment_offset ; pos2++ ) {
-                b1.append(' ');
-                b2.append(s2.charAt(pos2));
-                bmm.append(' ');
-            }
-            // now pos2 = -alignment_offset;
-        } else {
-            for (  ; pos1 < alignment_offset ; pos1++ ) {
-                b2.append(' ');
-                b1.append(s1.charAt(pos1));
-                bmm.append(' ');
-            }
-            // now pos1 = alignment_offset
-        }
-/* debug prints: */
-//        System.out.println(AlignmentUtils.toString(getCigar()));
-//        System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length());
-//        System.out.println("offset="+alignment_offset);
-//        System.out.println("pos1="+pos1+"; pos2=" + pos2);
-/**/
-        for ( int i = 0 ; i < getCigar().numCigarElements() ; i++ ) {
-            CigarElement ce = getCigar().getCigarElement(i) ;
-            switch( ce.getOperator() ) {
-                case M:
-                    int z = ( i == 0 ? pos2 : 0); // if we are in the first element and seq overhangs to the left,
-                                                  // start inside the first segment, at the position where actual matches begin
-                    // check separately for pos1 < s1.length() since seq2 is allowed to overhang beyond seq1's end
-                    for (  ; z < ce.getLength() && pos1 < s1.length() ; z++ ) {
-//                        System.out.println("pos1="+pos1+"; pos2="+pos2+"; k="+z);
-                        if ( Character.toUpperCase(s1.charAt(pos1)) !=
-                                Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*');
-                        else bmm.append(' ');
-                        b1.append(s1.charAt(pos1++));
-                        b2.append(s2.charAt(pos2++));
-                    }
-                    break;
-                case I:
-                    for ( int k = 0 ; k < ce.getLength() ; k++ ) {
-                        b1.append('+');
-                        bmm.append('+');
-                        b2.append(s2.charAt(pos2++));
-                    }
-                    break;
-                case D:
-                    for ( int k = 0 ; k < ce.getLength() ; k++ ) {
-                        b1.append(s1.charAt(pos1++));
-                        bmm.append('-');
-                        b2.append('-');
-                    }
-                    break;
-            }
-        }
-
-        bmm.append('\n');
-        b1.append(s1,pos1,s1.length());
-        bmm.append(b1);
-        bmm.append('\n');
-        b2.append(s2,pos2,s2.length());
-        bmm.append(b2);
-        bmm.append('\n');
-
-        return bmm.toString();
-    }
-
-    public static void testMe() {
- /*       String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
-        String s2 =       "TGTATATAGGGTAAGG";
-
-        testMe(s1,s2);
-
-        s1 = "GGTAAGGC";
-        s2 = "GGTCTCAA";
-
-        testMe(s1,s2);
-
-        s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
-        s2 =       "TGTTAGGGTCTCAAGG";
-        testMe(s1,s2);
-
-
-        s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
-        s2 =             "TAGGGTAAGGCTGATCCATGTACCG" ;
-        testMe(s1,s2);
-
-        s1 =           "ACCTGGTGTATATAGGGTAAGGCTGAT";
-        s2 = "CCGTATCATTACCTGGTGTATATAGG";
-        testMe(s1,s2);
-
-        s1 = "GGTGTATATAGGGT"  ;
-        s2 = "TGTTAGGG";
-        testMe(s1,s2);
-
-        s1 = "AGACAGAGAGAAGG";
-        s2 = "AGACAGAGAAGG";
-        testMe(s1,s2);
-   */
- //       String s1 = "CCAGCACACAGGTATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA";
- //       String s2 = "CCAGCACACATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA";
-
-//        String s1 = "CCCATCTGTCTCCAATCTGCTGTTTTCCAAAAATTAGGGAACTTCAGTTTTCCCTTTGATACTCTGTATTTCTACCAACCACAACGCCAGGGCTGTCCTGCTTCTACAAGTGACAATGACAAATATAGGCCTGAAGGAAGATG";
-//        String s2 = "AAAATCTGTTTCCAATCTACTGTTTTCCAAAAATTAGGGAAGTTCAGTTTTCCCTTTGATACTCTGTTTCTACCAATCC";
-        String s1 = "CCCATCTGTCTCCAATCTGCTGTTTTCCAAAAATTAGGGAACTTCAGTTTTCCCTTTGATACTCTGTATTTCTACCAACCACAACGCCAGGGCTGTCCTGCTTCTACAAGTGACAATGACAAATATAGGCCTGAAGGAAGATG";
-        String s2 = "AAAATCTGTCTCCAATCTACTGTTTTCCAAAAATTAGGGAAGTTCAGTTTTCCCTTTGATACTCTGTTTCTACCAATCC";
-           testMe(s1,s2);
-    }
-
-    public static void testMe(String s1, String s2) {
-
-        SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2,3.0,-1.0,-4,-0.5);
-
-        System.out.println(AlignmentUtils.toString(swpa.getCigar()));
-//       SequencePile sp = new SequencePile(s1);
-//        sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1());
-//        System.out.println();
-//        System.out.println(sp.format());
-
-        System.out.println("--------\n"+swpa.toString());        
-
-        //sp.colorprint(false);
-    }
-
-    public static void main(String argv[]) {
-        if ( argv.length > 0 ) testMe(argv[0],argv[1]);
-        else testMe();
-    }
 }