From ec05ecef60c5332c75b4815af1f2dd3a54610642 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 11 Jan 2013 18:05:45 -0500 Subject: [PATCH 01/34] getAdaptorBoundary returns an int, not an Integer, as this was taking 30% of the allocation effort for LIBS --- .../sting/utils/clipping/ReadClipper.java | 4 ++-- .../sting/utils/sam/ReadUtils.java | 16 ++++++++------- .../sting/utils/sam/ReadUtilsUnitTest.java | 20 +++++++++---------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 524c29d64..87526545d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -381,9 +381,9 @@ public class ReadClipper { * @return a new read without adaptor sequence */ private GATKSAMRecord hardClipAdaptorSequence () { - final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read); - if (adaptorBoundary == null || !ReadUtils.isInsideRead(read, adaptorBoundary)) + if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY || !ReadUtils.isInsideRead(read, adaptorBoundary)) return read; return read.getReadNegativeStrandFlag() ? 
hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index b61628d4d..b43b590df 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -169,8 +169,8 @@ public class ReadUtils { * @return whether or not the base is in the adaptor */ public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { - Integer adaptorBoundary = getAdaptorBoundary(read); - if (adaptorBoundary == null || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + final int adaptorBoundary = getAdaptorBoundary(read); + if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) return false; return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; @@ -199,26 +199,28 @@ public class ReadUtils { * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * * @param read the read being tested for the adaptor boundary - * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. + * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. 
*/ - public static Integer getAdaptorBoundary(final SAMRecord read) { + public static int getAdaptorBoundary(final SAMRecord read) { final int MAXIMUM_ADAPTOR_LENGTH = 8; final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs - return null; + return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; - Integer adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) + int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) if (read.getReadNegativeStrandFlag()) adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) else adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header) if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) ) - adaptorBoundary = null; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor + adaptorBoundary = CANNOT_COMPUTE_ADAPTOR_BOUNDARY; // we are being conservative by not allowing the adaptor boundary to go beyond what we believe is the maximum size of an adaptor return adaptorBoundary; } + public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; /** * is the read a 454 read?
diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 71c7d1bb0..4194aa6d5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -40,7 +40,7 @@ public class ReadUtilsUnitTest extends BaseTest { final int mateStart = 1000; final int BEFORE = mateStart - 2; final int AFTER = mateStart + 2; - Integer myStart, boundary; + int myStart, boundary; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); read.setMateAlignmentStart(mateStart); @@ -51,43 +51,43 @@ public class ReadUtilsUnitTest extends BaseTest { read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 2: positive strand, second read myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 3: negative strand, second read myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), mateStart - 1); + Assert.assertEquals(boundary, mateStart - 1); // Test case 4: negative strand, first read myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), mateStart - 1); + Assert.assertEquals(boundary, mateStart - 1); // Test case 5: mate is mapped to another chromosome (test both strands) 
read.setInferredInsertSize(0); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setInferredInsertSize(10); // Test case 6: read is unmapped read.setReadUnmappedFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadUnmappedFlag(false); // Test case 7: reads don't overlap and look like this: @@ -99,7 +99,7 @@ public class ReadUtilsUnitTest extends BaseTest { read.setInferredInsertSize(20); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); // second read: myStart = 1000; @@ -107,6 +107,6 @@ public class ReadUtilsUnitTest extends BaseTest { read.setMateAlignmentStart(980); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); } } From 83fcc06e28b5d4d85e84183465f7c118536688f5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 12:41:13 -0500 Subject: [PATCH 02/34] LIBS optimizations and performance tools -- Made LIBSPerformance a full featured CommandLineProgram, and it can be used to assess the LIBS performance by reading a provided BAM -- ReadStateManager now provides a clean interface to iterate in sample order the per-sample read states, allowing us to avoid many map.get calls -- Moved updateReadStates to ReadStateManager -- Removed the unnecessary wrapping of an iterator in ReadStateManager -- readStatesBySample is 
now a LinkedHashMap so that iteration occurs in LIBS sample order, allowing us to avoid many unnecessary calls to map.get iterating over samples. Now those are just map native iterations -- Restructured collectPendingReads for simplicity, removing redundant and consolidating common range checks. The new piece of code is much clearer and avoids several unnecessary function calls --- .../locusiterator/AlignmentStateMachine.java | 10 ++ .../locusiterator/LocusIteratorByState.java | 40 ++--- .../utils/locusiterator/ReadStateManager.java | 144 ++++++++++-------- .../ReadStateManagerUnitTest.java | 2 +- 4 files changed, 99 insertions(+), 97 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java index 32e56866b..50bc9e25b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -113,6 +113,16 @@ public class AlignmentStateMachine { return read; } + /** + * Get the reference index of the underlying read + * + * @return the reference index of the read + */ + @Ensures("result == getRead().getReferenceIndex()") + public int getReferenceIndex() { + return getRead().getReferenceIndex(); + } + /** * Is this the left edge state? I.e., one that is before or after the current read?
* @return true if this state is an edge state, false otherwise diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 01c9e564e..9499bfa35 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -234,17 +233,16 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - // TODO: How can you determine here whether the current pileup has been downsampled? 
- boolean hasBeenSampled = false; - - for (final String sample : samples) { - final Iterator iterator = readStates.iterator(sample); - final List pile = new ArrayList(readStates.size(sample)); + for (final Map.Entry sampleStatePair : readStates ) { + final String sample = sampleStatePair.getKey(); + final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue(); + final Iterator iterator = readState.iterator(); + final List pile = new ArrayList(readState.size()); while (iterator.hasNext()) { // state object with the read/offset information final AlignmentStateMachine state = iterator.next(); - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); + final GATKSAMRecord read = state.getRead(); final CigarOperator op = state.getCigarOperator(); if (op == CigarOperator.N) // N's are never added to any pileup @@ -263,29 +261,9 @@ public class LocusIteratorByState extends LocusIterator { fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); } - updateReadStates(); // critical - must be called after we get the current state offsets and location + readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done - nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); - } - } - - /** - * Advances all fo the read states by one bp. After this call the read states are reflective - * of the next pileup. - */ - private void updateReadStates() { - for (final String sample : samples) { - Iterator it = readStates.iterator(sample); - while (it.hasNext()) { - AlignmentStateMachine state = it.next(); - CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. 
Keeping the read state that returned null upon stepForwardOnGenome() is safe - * as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 2dcf01d72..0a8d3a108 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,11 +49,18 @@ import java.util.*; * Date: 1/5/13 * Time: 2:02 PM */ -class ReadStateManager { +final class ReadStateManager implements Iterable> { private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + + /** + * A mapping from sample name -> the per sample read state manager that manages + * + * IT IS CRITICAL THAT THIS BE A LINKED HASH MAP, SO THAT THE ITERATION OF THE MAP OCCURS IN THE SAME + * ORDER AS THE ORIGINAL SAMPLES + */ + private final Map readStatesBySample = new LinkedHashMap(); private LinkedList submittedReads; private final boolean keepSubmittedReads; @@ -70,6 +78,7 @@ class ReadStateManager { this.submittedReads = new LinkedList(); for (final String sample :
samples) { + // because this is a linked hash map the order of iteration will be in sample order readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo)); } @@ -77,29 +86,16 @@ } /** - * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented - * for this iterator; if present, total read states will be decremented. + * Returns an iterator over all the sample -> per-sample read state managers with each sample in this read state manager. - * - * @param sample The sample. - * @return Iterator over the reads associated with that sample. + * The order of iteration is the same as the order of the samples provided upon construction to this + * ReadStateManager. + * + * @return Iterator over sample + per sample read state manager pairs for this read state manager. */ - public Iterator iterator(final String sample) { - // TODO -- why is this wrapped? - return new Iterator() { - private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public AlignmentStateMachine next() { - return wrappedIterator.next(); - } - - public void remove() { - wrappedIterator.remove(); - } - }; + @Override + public Iterator> iterator() { - return readStatesBySample.entrySet().iterator(); } public boolean isEmpty() { @@ -126,10 +122,9 @@ } public AlignmentStateMachine getFirst() { - for (final String sample : samples) { - PerSampleReadStateManager reads = readStatesBySample.get(sample); - if (!reads.isEmpty()) - return reads.peek(); + for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) { + if ( !
manager.isEmpty() ) + return manager.peek(); } return null; } @@ -138,51 +133,65 @@ class ReadStateManager { return totalReadStates > 0 || iterator.hasNext(); } - // fast testing of position - /** - * TODO -- this function needs to be optimized - * - * Notes: - * -- the only place where it's called is in a block where we know isEmpty is false - * -- getFirst() is quite expensive, and it seems that we could cache this value in the outer - * block, and then pass this in as an argument - * - * @param read - * @return + * Advances all of the read states by one bp. After this call the read states are reflective + * of the next pileup. */ - private boolean readIsPastCurrentPosition(GATKSAMRecord read) { - if (isEmpty()) - return false; - else { - final AlignmentStateMachine state = getFirst(); - final GATKSAMRecord ourRead = state.getRead(); - return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + public void updateReadStates() { + for (final PerSampleReadStateManager readStateManager : readStatesBySample.values() ) { + final Iterator it = readStateManager.iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } } } + /** + * Does read start at the same position as described by currentContigIndex and currentAlignmentStart?
+ * + * @param read the read we want to test + * @param currentContigIndex the contig index (from the read's getReferenceIndex) of the reads in this state manager + * @param currentAlignmentStart the alignment start of the left-most position on the + * genome of the reads in this read state manager + * @return true if read has contig index and start equal to the current ones + */ + private boolean readStartsAtCurrentPosition(final GATKSAMRecord read, final int currentContigIndex, final int currentAlignmentStart) { + return read.getAlignmentStart() == currentAlignmentStart && read.getReferenceIndex() == currentContigIndex; + } + + /** + * Pull all of the reads off the iterator that overlap the left-most position among all + * reads in this ReadStateManager + */ public void collectPendingReads() { if (!iterator.hasNext()) return; - // the next record in the stream, peeked as to not remove it from the stream + // determine the left-most boundary that determines which reads to keep in this new pileup + final int firstContigIndex; + final int firstAlignmentStart; if ( isEmpty() ) { - final int firstContigIndex = iterator.peek().getReferenceIndex(); - final int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - submitRead(iterator.next()); - } + // there are no reads here, so our next state is the next read in the stream + firstContigIndex = iterator.peek().getReferenceIndex(); + firstAlignmentStart = iterator.peek().getAlignmentStart(); } else { - // Fast fail in the case that the read is past the current position.
- if (readIsPastCurrentPosition(iterator.peek())) - return; - - while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - submitRead(iterator.next()); - } + // there's a read in the system, so it's our targeted first read + final AlignmentStateMachine firstState = getFirst(); + firstContigIndex = firstState.getReferenceIndex(); + // note this isn't the alignment start of the read, but rather the alignment start position + firstAlignmentStart = firstState.getGenomePosition(); } - samplePartitioner.doneSubmittingReads(); + while ( iterator.hasNext() && readStartsAtCurrentPosition(iterator.peek(), firstContigIndex, firstAlignmentStart) ) { + submitRead(iterator.next()); + } for (final String sample : samples) { final Collection newReads = samplePartitioner.getReadsForSample(sample); @@ -271,11 +280,11 @@ class ReadStateManager { if (reads.isEmpty()) return; - Collection newReadStates = new LinkedList(); + final LinkedList newReadStates = new LinkedList(); - for (GATKSAMRecord read : reads) { - AlignmentStateMachine state = new AlignmentStateMachine(read); - if ( state.stepForwardOnGenome() != null ) + for (final GATKSAMRecord read : reads) { + final AlignmentStateMachine state = new AlignmentStateMachine(read); + if ( state.stepForwardOnGenome() != null ) // todo -- should be an assertion not a skip // explicitly filter out reads that are all insertions / soft clips newReadStates.add(state); } @@ -283,6 +292,7 @@ class ReadStateManager { readStates.addStatesAtNextAlignmentStart(newReadStates); } + // TODO -- refactor into separate class with pointer to ReadStateManager for updates to the total counts protected class PerSampleReadStateManager implements Iterable { private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; @@ -295,12 +305,16 @@ class ReadStateManager { : null; } - public void addStatesAtNextAlignmentStart(Collection states) { + /** + * Assumes it can just keep the states linked lists 
without making a copy + * @param states + */ + public void addStatesAtNextAlignmentStart(LinkedList states) { if ( states.isEmpty() ) { return; } - readStatesByAlignmentStart.add(new LinkedList(states)); + readStatesByAlignmentStart.add(states); thisSampleReadStates += states.size(); totalReadStates += states.size(); diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java index 1db0605c7..76b324d85 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java @@ -71,7 +71,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { makeReads(); for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + perSampleReadStateManager.addStatesAtNextAlignmentStart(new LinkedList(stackRecordStates)); } // read state manager should have the right number of reads From 19288b007d77c597f75bf9ce639df9ebf6601709 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 13:39:19 -0500 Subject: [PATCH 03/34] LIBS bugfix: kept reads now only (correctly) includes reads that at least passed the reservoir -- Added unit tests to ensure this behavior is correct --- .../utils/locusiterator/ReadStateManager.java | 12 ++- .../LocusIteratorByStateUnitTest.java | 93 +++++++++++++------ 2 files changed, 72 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 0a8d3a108..955dbcef7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -195,7 +195,15 @@ final class ReadStateManager implements Iterable newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + +// // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample +// // and add to the list of reads. Note this may reorder the list of reads someone (it groups them +// // by sample, but it cannot change their absolute position on the genome as they all must +// // start at the current location + if ( keepSubmittedReads ) + submittedReads.addAll(newReads); + + final PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); addReadsToSample(statesBySample, newReads); } @@ -208,8 +216,6 @@ final class ReadStateManager implements Iterable tests = new LinkedList(); - for ( final boolean doSampling : Arrays.asList(true, false) ) { - for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int downsampleTo : Arrays.asList(-1, 1, 2, 5, 10, 30)) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10, 60) ) { for ( final int nLoci : Arrays.asList(1, 10, 25) ) { for ( final int nSamples : Arrays.asList(1, 2, 10) ) { for ( final boolean keepReads : Arrays.asList(true, false) ) { for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { -// for ( final int nReadsPerLocus : Arrays.asList(1) ) { -// for ( final int nLoci : Arrays.asList(1) ) { -// for ( final int nSamples : Arrays.asList(1) ) { -// for ( final boolean keepReads : Arrays.asList(true) ) { -// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { - tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, doSampling}); +// for ( final int downsampleTo : Arrays.asList(1)) { +// for ( final int nReadsPerLocus : Arrays.asList(10) ) { +// for ( final int nLoci : Arrays.asList(25) ) { +// for ( 
final int nSamples : Arrays.asList(1) ) { +// for ( final boolean keepReads : Arrays.asList(true) ) { +// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, + keepReads, grabReadsAfterEachCycle, + downsampleTo}); } } } @@ -432,14 +436,15 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { return tests.toArray(new Object[][]{}); } - @Test(enabled = true && ! DEBUG, dataProvider = "LIBSKeepSubmittedReads") - public void testLIBSKeepSubmittedReads(final int nReadsPerLocus, - final int nLoci, - final int nSamples, - final boolean keepReads, - final boolean grabReadsAfterEachCycle, - final boolean downsample) { - logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); + //@Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") + @Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") + public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, + final int nLoci, + final int nSamples, + final boolean keepReads, + final boolean grabReadsAfterEachCycle, + final int downsampleTo) { + //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); final int readLength = 10; final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); @@ -453,10 +458,9 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { header.addReadGroup(rg); } - final int maxCoveragePerSampleAtLocus = nReadsPerLocus * readLength / 2; - final int maxDownsampledCoverage = Math.max(maxCoveragePerSampleAtLocus / 2, 1); + final boolean downsample = downsampleTo != -1; final DownsamplingMethod downsampler = downsample - ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, maxDownsampledCoverage, null, false) + ? 
new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), @@ -472,6 +476,8 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { final AlignmentContext alignmentContext = li.next(); final ReadBackedPileup p = alignmentContext.getBasePileup(); + AssertWellOrderedPileup(p); + if ( downsample ) { // just not a safe test //Assert.assertTrue(p.getNumberOfElements() <= maxDownsampledCoverage * nSamples, "Too many reads at locus after downsampling"); @@ -480,22 +486,29 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { Assert.assertTrue(p.getNumberOfElements() >= minPileupSize); } + // the number of reads starting here + int nReadsStartingHere = 0; + for ( final GATKSAMRecord read : p.getReads() ) + if ( read.getAlignmentStart() == alignmentContext.getPosition() ) + nReadsStartingHere++; + + // we can have no more than maxDownsampledCoverage per sample + final int maxCoveragePerLocus = downsample ? downsampleTo : nReadsPerLocus; + Assert.assertTrue(nReadsStartingHere <= maxCoveragePerLocus * nSamples); + seenSoFar.addAll(p.getReads()); if ( keepReads && grabReadsAfterEachCycle ) { final List locusReads = li.transferReadsFromAllPreviousPileups(); - // the number of reads starting here - int nReadsStartingHere = 0; - for ( final GATKSAMRecord read : p.getReads() ) - if ( read.getAlignmentStart() == alignmentContext.getPosition() ) - nReadsStartingHere++; - if ( downsample ) + if ( downsample ) { // with downsampling we might have some reads here that were downsampled away - // in the pileup + // in the pileup. 
We want to ensure that no more than the max coverage per sample is added Assert.assertTrue(locusReads.size() >= nReadsStartingHere); - else + Assert.assertTrue(locusReads.size() <= maxCoveragePerLocus * nSamples); + } else { Assert.assertEquals(locusReads.size(), nReadsStartingHere); + } keptReads.addAll(locusReads); // check that all reads we've seen so far are in our keptReads @@ -543,6 +556,26 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { for ( final GATKSAMRecord read : seenSoFar ) { Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); } + + if ( ! downsample ) { + // check that every read in the list of keep reads occurred at least once in one of the pileups + for ( final GATKSAMRecord keptRead : keptReads ) { + Assert.assertTrue(seenSoFar.contains(keptRead), "There's a read " + keptRead + " in our keptReads list that never appeared in any pileup"); + } + } + } + } + + private void AssertWellOrderedPileup(final ReadBackedPileup pileup) { + if ( ! 
pileup.isEmpty() ) { + int leftMostPos = -1; + + for ( final PileupElement pe : pileup ) { + Assert.assertTrue(pileup.getLocation().getContig().equals(pe.getRead().getReferenceName()), "ReadBackedPileup contains an element " + pe + " that's on a different contig than the pileup itself"); + Assert.assertTrue(pe.getRead().getAlignmentStart() >= leftMostPos, + "ReadBackedPileup contains an element " + pe + " whose read's alignment start " + pe.getRead().getAlignmentStart() + + " occurs before the leftmost position we've seen previously " + leftMostPos); + } } } } From a4334a67e088d9cd221dadc011edd1478dc7b28f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 19:22:36 -0500 Subject: [PATCH 04/34] SamplePartitioner optimizations and bugfixes -- Use a linked hash map instead of a hash map since we want to iterate through the map fairly often -- Ensure that we call doneSubmittingReads before getting reads for samples. This function call fell out before and since it wasn't enforced I only noticed the problem while writing comments -- Don't make unnecessary calls to contains for map. 
Just use get() and check that the result is null -- Use a LinkedList in PassThroughDownsampler, since this is faster for add() than the existing ArrayList, and we weren't using random access to the resulting list --- .../downsampling/PassThroughDownsampler.java | 14 +- .../utils/locusiterator/ReadStateManager.java | 10 +- .../locusiterator/SamplePartitioner.java | 124 +++++++++++++++--- 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index 600834012..b06d5f5b4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMRecord; -import java.util.ArrayList; import java.util.Collection; +import java.util.LinkedList; import java.util.List; /** @@ -41,7 +41,7 @@ import java.util.List; */ public class PassThroughDownsampler implements ReadsDownsampler { - private ArrayList selectedReads; + private LinkedList selectedReads; public PassThroughDownsampler() { clear(); @@ -59,9 +59,13 @@ public class PassThroughDownsampler implements ReadsDownsam } public boolean hasFinalizedItems() { - return selectedReads.size() > 0; + return ! selectedReads.isEmpty(); } + /** + * Note that this list is a linked list and so doesn't support fast random access + * @return + */ public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; @@ -74,7 +78,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public T peekFinalized() { - return selectedReads.isEmpty() ? 
null : selectedReads.getFirst(); } public T peekPending() { @@ -90,7 +94,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public void clear() { - selectedReads = new ArrayList(); + selectedReads = new LinkedList(); } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 955dbcef7..b5dbe2ddb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -193,13 +193,15 @@ final class ReadStateManager implements Iterable newReads = samplePartitioner.getReadsForSample(sample); -// // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample -// // and add to the list of reads. Note this may reorder the list of reads someone (it groups them -// // by sample, but it cannot change their absolute position on the genome as they all must -// // start at the current location + // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample + // and add to the list of reads. 
Note this may reorder the list of reads somewhat (it groups them + // by sample, but it cannot change their absolute position on the genome as they all must + // start at the current location if ( keepSubmittedReads ) submittedReads.addAll(newReads); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 1653c6a92..7dada292b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.locusiterator; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.PassThroughDownsampler; @@ -33,49 +35,137 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import java.util.*; /** - * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * Divides reads by sample and (if requested) does a preliminary downsampling pass + * with a ReservoirDownsampler. * * Note: stores reads by sample ID string, not by sample object */ class SamplePartitioner { - private Map> readsBySample; + /** + * Map from sample name (as a string) to a downsampler of reads for that sample + */ + final private Map> readsBySample; + + /** + * Are we in a state where we're done submitting reads and have semi-finalized the + * underlying per sample downsampler? 
+ */ + boolean doneSubmittingReads = false; + + /** + * Create a new SamplePartitioner capable of splitting reads up into buckets of reads for + * each sample in samples, and perform a preliminary downsampling of these reads + * (separately for each sample) if downsampling is requested in LIBSDownsamplingInfo + * + * Note that samples must be comprehensive, in that all reads every submitted to this + * partitioner must come from one of the samples provided here. If not, submitRead + * will throw an exception. Duplicates in the list of samples will be ignored + * + * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? + * @param samples the complete list of samples we're going to partition reads into + */ + @Ensures({ + "readsBySample != null", + "! readsBySample.isEmpty()", + "readsBySample.size() == new HashSet(samples).size()" + }) public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { - readsBySample = new HashMap>(samples.size()); - for ( String sample : samples ) { + if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); + if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples); + + readsBySample = new LinkedHashMap>(samples.size()); + for ( final String sample : samples ) { readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo)); } } + /** + * Create a new, ready to use downsampler based on the parameters in LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the parameters to use in creating the downsampler + * @return a downsampler appropriate for LIBSDownsamplingInfo. If no downsampling is requested, + * uses the PassThroughDownsampler, which does nothing at all. 
+ */ + @Requires("LIBSDownsamplingInfo != null") + @Ensures("result != null") private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { return LIBSDownsamplingInfo.isPerformDownsampling() ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage()) : new PassThroughDownsampler(); } - public void submitRead(T read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submit(read); + /** + * Offer this read to the partitioner, putting it into the bucket of reads for the sample + * of read (obtained via the read's read group). + * + * If the read group is missing, uses the special "null" read group + * + * @throws IllegalStateException if the sample of read wasn't present in the original + * set of samples provided to this SamplePartitioner at construction + * + * @param read the read to add to the sample's list of reads + */ + @Requires("read != null") + @Ensures("doneSubmittingReads == false") + public void submitRead(final T read) { + final String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) + throw new IllegalStateException("Offered read with sample name " + sampleName + " to SamplePartitioner " + + "but this sample wasn't provided as one of possible samples at construction"); + + downsampler.submit(read); + doneSubmittingReads = false; } + /** + * Tell this partitioner that all reads in this cycle have been submitted, so that we + * can finalize whatever downsampling is required by each sample. + * + * Note that we *must* call this function before getReadsForSample, or else that + * function will exception out. 
+ */ + @Ensures("doneSubmittingReads == true") public void doneSubmittingReads() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().signalEndOfInput(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.signalEndOfInput(); } + doneSubmittingReads = true; } - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); + /** + * Get the final collection of reads for this sample for this cycle + * + * The cycle is defined as all of the reads that occur between + * the first call to submitRead until doneSubmittingReads is called. At that + * point additional downsampling may occur (depending on construction arguments) + * and that set of reads is returned here. + * + * Note that this function can only be called once per cycle, as underlying + * collection of reads is cleared. + * + * @param sampleName the sample we want reads for, must be present in the original samples + * @return a non-null collection of reads for sample in this cycle + */ + @Ensures("result != null") + public Collection getReadsForSample(final String sampleName) { + if ( ! doneSubmittingReads ) throw new IllegalStateException("getReadsForSample called before doneSubmittingReads was called"); - return readsBySample.get(sampleName).consumeFinalizedItems(); + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) throw new NoSuchElementException("Sample name not found"); + + return downsampler.consumeFinalizedItems(); } + /** + * Resets this SamplePartitioner, indicating that we're starting a new + * cycle of adding reads to each underlying downsampler. 
+ */ + @Ensures("doneSubmittingReads == false") public void reset() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().clear(); - perSampleReads.getValue().reset(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.clear(); + downsampler.reset(); } + doneSubmittingReads = false; } } From 5c2799554aca87f3a5a0d95c609baff574f5e261 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 12:23:51 -0500 Subject: [PATCH 05/34] Refactor updateReadStates into PerSampleReadStateManager, add tracking of downsampling rate --- .../utils/locusiterator/LIBSPerformance.java | 4 +- .../utils/locusiterator/ReadStateManager.java | 70 ++++++++++++++----- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java index 0985ed196..2d074f420 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java @@ -63,6 +63,8 @@ public class LIBSPerformance extends CommandLineProgram { @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false) public String location = null; + @Argument(fullName = "dt", shortName = "dt", doc = "Enable downsampling", required = false) + public boolean downsample = false; @Override public int execute() throws IOException { @@ -86,7 +88,7 @@ public class LIBSPerformance extends CommandLineProgram { for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) samples.add(rg.getSample()); - final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(false, -1); + final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); final LocusIteratorByState libs = new LocusIteratorByState( diff --git 
a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index b5dbe2ddb..3276291ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -50,6 +51,8 @@ import java.util.*; * Time: 2:02 PM */ final class ReadStateManager implements Iterable> { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; @@ -138,18 +141,8 @@ final class ReadStateManager implements Iterable it = readStateManager.iterator(); - while (it.hasNext()) { - final AlignmentStateMachine state = it.next(); - final CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
- it.remove(); // we've stepped off the end of the object - } - } + for (final PerSampleReadStateManager perSampleReadStateManager : readStatesBySample.values() ) { + perSampleReadStateManager.updateReadStates(); } } @@ -301,13 +294,17 @@ final class ReadStateManager implements Iterable { + protected final class PerSampleReadStateManager implements Iterable { private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; + private final int downsamplingTarget; + private int nSitesNeedingDownsampling = 0; + private int nSites = 0; + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) : null; @@ -326,7 +323,8 @@ final class ReadStateManager implements Iterable downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + public boolean isEmpty() { return readStatesByAlignmentStart.isEmpty(); } @@ -351,11 +371,25 @@ final class ReadStateManager implements Iterable it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. 
Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates; + private Iterator currentPositionReadStatesIterator; public boolean hasNext() { return alignmentStartIterator.hasNext() || From 5a5422e4f8220ecde133490eeef6b58fa3084397 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 13:02:17 -0500 Subject: [PATCH 06/34] Refactor PerSampleReadStates into a separate class -- No longer update the total counts in each per-sample state manager, but instead return delta counts that are updated by the overall ReadStateManager -- One step on the way to improving the underlying representation of the data in PerSampleReadStateManager -- Make LocusIteratorByState final --- .../locusiterator/LocusIteratorByState.java | 6 +- .../PerSampleReadStateManager.java | 203 ++++++++++++++++++ .../utils/locusiterator/ReadStateManager.java | 138 +----------- .../LocusIteratorByStateUnitTest.java | 5 +- ...=> PerSampleReadStateManagerUnitTest.java} | 11 +- 5 files changed, 214 insertions(+), 149 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java rename public/java/test/org/broadinstitute/sting/utils/locusiterator/{ReadStateManagerUnitTest.java => PerSampleReadStateManagerUnitTest.java} (92%) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java 
b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 9499bfa35..e7b75f1f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -65,7 +65,7 @@ import java.util.*; * occurs, if requested. This allows users of LIBS to see both a ReadBackedPileup view of the data as well as * a stream of unique, sorted reads */ -public class LocusIteratorByState extends LocusIterator { +public final class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ @@ -233,9 +233,9 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - for (final Map.Entry sampleStatePair : readStates ) { + for (final Map.Entry sampleStatePair : readStates ) { final String sample = sampleStatePair.getKey(); - final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue(); + final PerSampleReadStateManager readState = sampleStatePair.getValue(); final Iterator iterator = readState.iterator(); final List pile = new ArrayList(readState.size()); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java new file mode 100644 index 000000000..c2a47bbdb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, 
sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * ReadStateManager for a single sample + * + * User: depristo + * Date: 1/13/13 + * Time: 12:28 PM + */ +final class PerSampleReadStateManager implements Iterable { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; + + private List> readStatesByAlignmentStart = new LinkedList>(); + private final Downsampler> levelingDownsampler; + private int thisSampleReadStates = 0; + + private final int downsamplingTarget; + private int nSitesNeedingDownsampling = 0; + private int nSites = 0; + + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? 
LIBSDownsamplingInfo.getToCoverage() : -1; + this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() + ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) + : null; + } + + /** + * Assumes it can just keep the states linked lists without making a copy + * @param states the new states to add to this manager + * @return The change in the number of states, after including states and potentially downsampling + */ + @Requires("states != null") + @Ensures("result >= 0") + public int addStatesAtNextAlignmentStart(LinkedList states) { + if ( states.isEmpty() ) { + return 0; + } + + readStatesByAlignmentStart.add(states); + int nStatesAdded = states.size(); + + if ( isDownsampling() ) { + captureDownsamplingStats(); + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } + + thisSampleReadStates += nStatesAdded; + return nStatesAdded; + } + + private boolean isDownsampling() { + return levelingDownsampler != null; + } + + private AlignmentStateMachine getFirst() { + if (readStatesByAlignmentStart.isEmpty()) + return null; + else + return readStatesByAlignmentStart.get(0).getFirst(); + } + + @Requires("isDownsampling()") + private void captureDownsamplingStats() { + if ( CAPTURE_DOWNSAMPLING_STATS ) { + nSites++; + final int loc = getFirst().getGenomePosition(); + String message = "Pass through"; + final boolean downsampling = thisSampleReadStates > downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, 
thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + + /** + * Is there at least one alignment for this sample in this manager? + * @return true if there's at least one alignment, false otherwise + */ + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + public AlignmentStateMachine peek() { + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + } + + /** + * Get the number of read states currently in this manager + * @return the number of read states + */ + @Ensures("result >= 0") + public int size() { + return thisSampleReadStates; + } + + /** + * Advances all read states forward by one element, removing states that are + * no longer aligned to the current position. + * @return the number of states we've removed after advancing + */ + public int updateReadStates() { + int nRemoved = 0; + final Iterator it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
+ it.remove(); // we've stepped off the end of the object + nRemoved++; + } + } + + return nRemoved; + } + + // todo -- reimplement + public Iterator iterator() { + return new Iterator() { + private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates; + private Iterator currentPositionReadStatesIterator; + + @Override + public boolean hasNext() { + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + } + + @Override + public AlignmentStateMachine next() { + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); + } + + @Override + public void remove() { + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } + } + }; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 3276291ef..4011875a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,10 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import 
java.util.*; @@ -50,9 +47,7 @@ import java.util.*; * Date: 1/5/13 * Time: 2:02 PM */ -final class ReadStateManager implements Iterable> { - private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; +final class ReadStateManager implements Iterable> { private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; @@ -97,7 +92,7 @@ final class ReadStateManager implements Iterable> iterator() { + public Iterator> iterator() { return readStatesBySample.entrySet().iterator(); } @@ -142,7 +137,7 @@ final class ReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; - - private final int downsamplingTarget; - private int nSitesNeedingDownsampling = 0; - private int nSites = 0; - - public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { - this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; - this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() - ? 
new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) - : null; - } - - /** - * Assumes it can just keep the states linked lists without making a copy - * @param states - */ - public void addStatesAtNextAlignmentStart(LinkedList states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(states); - thisSampleReadStates += states.size(); - totalReadStates += states.size(); - - if ( isDownsampling() ) { - captureDownsamplingStats(); - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } - } - - private boolean isDownsampling() { - return levelingDownsampler != null; - } - - @Requires("isDownsampling()") - private void captureDownsamplingStats() { - if ( CAPTURE_DOWNSAMPLING_STATS ) { - nSites++; - final int loc = getFirst().getGenomePosition(); - String message = "Pass through"; - final boolean downsampling = thisSampleReadStates > downsamplingTarget; - if ( downsampling ) { - nSitesNeedingDownsampling++; - message = "Downsampling"; - } - - if ( downsampling || nSites % 10000 == 0 ) - logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); - } - } - - public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); - } - - public AlignmentStateMachine peek() { - return isEmpty() ? 
null : readStatesByAlignmentStart.get(0).peek(); - } - - public int size() { - return thisSampleReadStates; - } - - public void updateReadStates() { - final Iterator it = iterator(); - while (it.hasNext()) { - final AlignmentStateMachine state = it.next(); - final CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } - } - - public Iterator iterator() { - return new Iterator() { - private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates; - private Iterator currentPositionReadStatesIterator; - - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; - } + totalReadStates += readStates.addStatesAtNextAlignmentStart(newReadStates); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 727023b83..7ae2d97a1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -418,8 +418,8 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { for ( final boolean keepReads : Arrays.asList(true, false) ) { for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { // for ( final int downsampleTo : Arrays.asList(1)) { -// for ( final int nReadsPerLocus : Arrays.asList(10) ) { -// for ( final int nLoci : Arrays.asList(25) ) { +// for ( final int nReadsPerLocus : Arrays.asList(1) ) { +// for ( final int nLoci : Arrays.asList(1) ) { // for ( final int nSamples : Arrays.asList(1) ) { // for ( final boolean keepReads : Arrays.asList(true) ) { // for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { @@ -436,7 +436,6 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { return tests.toArray(new Object[][]{}); } - //@Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_ComplexPileupTests") public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, final int nLoci, diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java similarity index 92% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java index 76b324d85..b9f2fb29a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java @@ -38,11 +38,7 @@ import java.util.*; /** * testing of the new (non-legacy) version of LocusIteratorByState */ -public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - +public class PerSampleReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { private class PerSampleReadStateManagerTest extends TestDataProvider { private List readCountsPerAlignmentStart; private List reads; @@ -63,10 +59,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { } public void run() { - final List samples = LocusIteratorByState.sampleListForSAMWithoutReadGroups(); - final Iterator iterator = new LinkedList().iterator(); - ReadStateManager readStateManager = new ReadStateManager(iterator, samples, LIBSDownsamplingInfo.NO_DOWNSAMPLING, false); - ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = readStateManager.new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); + PerSampleReadStateManager perSampleReadStateManager = new 
PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); makeReads(); From c7f0ca8ac53e320d2762da917158be51a9b2d8ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 14:36:25 -0500 Subject: [PATCH 07/34] Optimization for LIBS: PerSampleReadStateManager now uses a simple LinkedList of AlignmentStateMachine -- Instead of storing a list of list of alignment starts, which is expensive to manipulate, we instead store a linear list of alignment starts. Not grouped as previously. This enables us to simplify iteration and update operations, making them much faster -- Critically, the downsampler still requires this list of list. We convert back and forth between these two representations as required, which is very rarely for normal data sets (WGS NA12878 on chr20 is 0.2%, 4x WGS is even less). --- .../PerSampleReadStateManager.java | 170 ++++++++++++------ .../utils/locusiterator/ReadStateManager.java | 2 +- 2 files changed, 115 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java index c2a47bbdb..3f3bc706f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; @@ -43,18 +44,42 @@ import java.util.List; * Date: 1/13/13 * Time: 12:28 PM */ +@Invariant({ + "readStartsAreWellOrdered()", + "! 
isDownsampling() || downsamplingTarget > 0", + "nSites >= 0", + "nSitesNeedingDownsampling >= 0", + "nSitesNeedingDownsampling <= nSites" +}) final class PerSampleReadStateManager implements Iterable { private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; + private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; + + /** + * A list (potentially empty) of alignment state machines. + * + * The state machines must be ordered by the alignment start of their underlying reads, with the + * lowest alignment starts on the left, and the largest on the right + */ + private LinkedList readStatesByAlignmentStart = new LinkedList(); - private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; - private final int downsamplingTarget; + + /** + * The number of sites where downsampling has been invoked + */ private int nSitesNeedingDownsampling = 0; + + /** + * The number of sites we've visited + */ private int nSites = 0; + /** + * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the downsampling params we want to use + */ public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() @@ -62,55 +87,118 @@ final class PerSampleReadStateManager implements Iterable : null; } + /** + * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, + * where each list contains machines with a unique genome site. The outer list is ordered + * by alignment start. 
+ * + * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then + * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. + * + * @return a non-null list of lists + */ + @Ensures("result != null") + private List> groupByAlignmentStart() { + final LinkedList> grouped = new LinkedList>(); + + AlignmentStateMachine last = null; + for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { + if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { + // we've advanced to a place where the state machine has a different state, + // so start a new list + grouped.add(new LinkedList()); + last = stateMachine; + } + grouped.getLast().add(stateMachine); + } + + return grouped; + } + + /** + * Flattens the grouped list of list of alignment state machines into a single list in order + * @return a non-null list contains the state machines + */ + @Ensures("result != null") + private LinkedList flattenByAlignmentStart(final List> grouped) { + final LinkedList flat = new LinkedList(); + for ( final List l : grouped ) + flat.addAll(l); + return flat; + } + + /** + * Test that the reads are ordered by their alignment starts + * @return true if well ordered, false otherwise + */ + private boolean readStartsAreWellOrdered() { + int lastStart = -1; + for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { + if ( lastStart > machine.getRead().getAlignmentStart() ) + return false; + lastStart = machine.getRead().getAlignmentStart(); + } + return true; + } + /** * Assumes it can just keep the states linked lists without making a copy * @param states the new states to add to this manager - * @return The change in the number of states, after including states and potentially downsampling + * @return The change in the number of states, after including states and potentially downsampling. 
Note + * that this return result might be negative, if downsampling is enabled, as we might drop + * more sites than have been added by the downsampler */ @Requires("states != null") - @Ensures("result >= 0") - public int addStatesAtNextAlignmentStart(LinkedList states) { + public int addStatesAtNextAlignmentStart(final LinkedList states) { if ( states.isEmpty() ) { return 0; } - readStatesByAlignmentStart.add(states); + readStatesByAlignmentStart.addAll(states); int nStatesAdded = states.size(); - if ( isDownsampling() ) { + if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { + // only go into the downsampling branch if we are downsampling and the coverage > the target captureDownsamplingStats(); - levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.submit(groupByAlignmentStart()); levelingDownsampler.signalEndOfInput(); nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); levelingDownsampler.reset(); } - thisSampleReadStates += nStatesAdded; return nStatesAdded; } + /** + * Is downsampling enabled for this manager? + * @return true if we are downsampling, false otherwise + */ private boolean isDownsampling() { return levelingDownsampler != null; } - private AlignmentStateMachine getFirst() { - if (readStatesByAlignmentStart.isEmpty()) - return null; - else - return readStatesByAlignmentStart.get(0).getFirst(); + /** + * Get the leftmost alignment state machine, or null if the read states is empty + * @return a potentially null AlignmentStateMachine + */ + public AlignmentStateMachine getFirst() { + return isEmpty() ? 
null : readStatesByAlignmentStart.getFirst(); } + /** + * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true + */ @Requires("isDownsampling()") private void captureDownsamplingStats() { if ( CAPTURE_DOWNSAMPLING_STATS ) { nSites++; final int loc = getFirst().getGenomePosition(); String message = "Pass through"; - final boolean downsampling = thisSampleReadStates > downsamplingTarget; + final boolean downsampling = size() > downsamplingTarget; if ( downsampling ) { nSitesNeedingDownsampling++; message = "Downsampling"; @@ -118,7 +206,7 @@ final class PerSampleReadStateManager implements Iterable if ( downsampling || nSites % 10000 == 0 ) logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); } } @@ -130,17 +218,13 @@ final class PerSampleReadStateManager implements Iterable return readStatesByAlignmentStart.isEmpty(); } - public AlignmentStateMachine peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); - } - /** * Get the number of read states currently in this manager * @return the number of read states */ @Ensures("result >= 0") public int size() { - return thisSampleReadStates; + return readStatesByAlignmentStart.size(); } /** @@ -166,38 +250,12 @@ final class PerSampleReadStateManager implements Iterable return nRemoved; } - // todo -- reimplement + /** + * Iterate over the AlignmentStateMachine in this manager in alignment start order. 
+ * @return a valid iterator + */ + @Ensures("result != null") public Iterator iterator() { - return new Iterator() { - private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates; - private Iterator currentPositionReadStatesIterator; - - @Override - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - @Override - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - @Override - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; + return readStatesByAlignmentStart.iterator(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 4011875a6..09ec3b264 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -122,7 +122,7 @@ final class ReadStateManager implements Iterable Date: Sun, 13 Jan 2013 20:43:10 -0500 Subject: [PATCH 08/34] ReservoirDownsampler optimizations -- Add an option to not allocate always ArrayLists of targetSampleSize, but rather the previous size + MARGIN. 
This helps for LIBS as most of the time we don't need nearly so much space as we allow -- consumeFinalizedItems returns an empty list if the reservior is empty, which it often true for our BAM files with low coverage -- Allow empty sample lists for SamplePartitioner as these are used by the RefTraversals and other non-read based traversals Make the reservoir downsampler use a linked list, rather than a fixed sized array list, in the expectFewOverflows case --- .../downsampling/ReservoirDownsampler.java | 76 +++++++++++++++---- .../locusiterator/SamplePartitioner.java | 9 ++- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index 0d7a0dd14..4331fd723 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.util.*; /** * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with @@ -42,10 +40,25 @@ import java.util.List; * @author David Roazen */ public class ReservoirDownsampler implements ReadsDownsampler { + private final int targetSampleSize; - private ArrayList reservoir; + /** + * if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. 
+ */ + private final boolean expectFewOverflows; - private int targetSampleSize; + /** + * At times this can be a linked list or an array list, depending on how we're accessing the + * data and whether or not we're expecting few overflows + */ + private List reservoir; + + private boolean isLinkedList; private int totalReadsSeen; @@ -56,17 +69,35 @@ public class ReservoirDownsampler implements ReadsDownsampl * * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained * after downsampling will be min(totalReads, targetSampleSize) + * @param expectFewOverflows if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. */ - public ReservoirDownsampler ( int targetSampleSize ) { + public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); } this.targetSampleSize = targetSampleSize; + this.expectFewOverflows = expectFewOverflows; clear(); reset(); } + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. 
Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ + public ReservoirDownsampler ( int targetSampleSize ) { + this(targetSampleSize, false); + } + + public void submit ( T newRead ) { totalReadsSeen++; @@ -74,7 +105,12 @@ public class ReservoirDownsampler implements ReadsDownsampl reservoir.add(newRead); } else { - int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); + if ( isLinkedList ) { + reservoir = new ArrayList(reservoir); + isLinkedList = false; + } + + final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } @@ -93,10 +129,15 @@ public class ReservoirDownsampler implements ReadsDownsampl } public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - List downsampledItems = reservoir; - clear(); - return downsampledItems; + if ( reservoir.isEmpty() ) { + // if there's nothing here, don't both allocating a new list completely + return Collections.emptyList(); + } else { + // pass by reference rather than make a copy, for speed + List downsampledItems = reservoir; + clear(); + return downsampledItems; + } } public boolean hasPendingItems() { @@ -119,9 +160,18 @@ public class ReservoirDownsampler implements ReadsDownsampl // NO-OP } + /** + * Clear the data structures used to hold information + */ public void clear() { - reservoir = new ArrayList(targetSampleSize); - totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below + // if we aren't expecting many overflows, allocate a linked list not an arraylist + reservoir = expectFewOverflows ? 
new LinkedList() : new ArrayList(targetSampleSize); + + // it's a linked list if we allocate one + isLinkedList = expectFewOverflows; + + // an internal stat used by the downsampling process, so not cleared by reset() below + totalReadsSeen = 0; } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 7dada292b..9bb474e4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -62,16 +62,17 @@ class SamplePartitioner { * will throw an exception. Duplicates in the list of samples will be ignored * * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? - * @param samples the complete list of samples we're going to partition reads into + * @param samples the complete list of samples we're going to partition reads into. Can be + * empty, but in that case this code cannot function properly if you + * attempt to add data to it. */ @Ensures({ "readsBySample != null", - "! 
readsBySample.isEmpty()", "readsBySample.size() == new HashSet(samples).size()" }) public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); - if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples); + if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list"); readsBySample = new LinkedHashMap>(samples.size()); for ( final String sample : samples ) { @@ -89,7 +90,7 @@ class SamplePartitioner { @Ensures("result != null") private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { return LIBSDownsamplingInfo.isPerformDownsampling() - ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage()) + ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage(), true) : new PassThroughDownsampler(); } From b8b2b9b2de6270e1aead4f17ecf01b27d7f123f7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 20:44:28 -0500 Subject: [PATCH 09/34] ManagingReferenceOrderedView optimization: don't allow a fresh RefMetaDataTracker in the frequent case where there's no reference meta data --- .../providers/ManagingReferenceOrderedView.java | 14 +++++++++----- .../sting/gatk/refdata/RefMetaDataTracker.java | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index 7d3cac33d..09b72f5eb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -77,13 +77,17 @@ public class 
ManagingReferenceOrderedView implements ReferenceOrderedView { * @return A tracker containing information about this locus. */ public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { - List bindings = states.isEmpty() ? Collections.emptyList() : new ArrayList(states.size()); + if ( states.isEmpty() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + List bindings = new ArrayList(states.size()); - for ( ReferenceOrderedDataState state: states ) - // todo -- warning, I removed the reference to the name from states - bindings.add( state.iterator.seekForward(loc) ); + for ( ReferenceOrderedDataState state: states ) + // todo -- warning, I removed the reference to the name from states + bindings.add( state.iterator.seekForward(loc) ); - return new RefMetaDataTracker(bindings); + return new RefMetaDataTracker(bindings); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 9cb38b840..5a1b015fe 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -61,6 +61,7 @@ public class RefMetaDataTracker { final Map bindings; final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); + public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); // ------------------------------------------------------------------------------------------ // From 39bc9e999d8215486708a77a8cff31b9084f7dca Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 14 Jan 2013 08:34:42 -0500 Subject: [PATCH 10/34] Add a test to LocusIteratorByState to ensure that we aren't holding reads anywhere -- Run an iterator with 100Ks of reads, each carrying MBs of byte[] data, through LIBS, all starting at the same position. 
Will crash with an out-of-memory error if we're holding reads anywhere in the system. -- Is there a better way to test this behavior? --- .../LocusIteratorByStateUnitTest.java | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 7ae2d97a1..37494903c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.locusiterator; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -577,4 +578,94 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { } } } + + // --------------------------------------------------------------------------- + // make sure that downsampling isn't holding onto a bazillion reads + // + @DataProvider(name = "LIBS_NotHoldingTooManyReads") + public Object[][] makeLIBS_NotHoldingTooManyReads() { + final List tests = new LinkedList(); + + for ( final int downsampleTo : Arrays.asList(1, 10)) { + for ( final int nReadsPerLocus : Arrays.asList(100, 1000, 10000, 100000) ) { + for ( final int payloadInBytes : Arrays.asList(0, 1024, 1024*1024) ) { + tests.add(new Object[]{nReadsPerLocus, downsampleTo, payloadInBytes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_NotHoldingTooManyReads") +// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000) + public void testLIBS_NotHoldingTooManyReads(final int nReadsPerLocus, final int downsampleTo, final int payloadInBytes) { + logger.warn(String.format("testLIBS_NotHoldingTooManyReads %d %d %d", nReadsPerLocus, downsampleTo, payloadInBytes)); + final int readLength = 10; + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final int nSamples = 1; + final List samples = new ArrayList(nSamples); + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + final boolean downsample = downsampleTo != -1; + final DownsamplingMethod downsampler = downsample + ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) + : new DownsamplingMethod(DownsampleType.NONE, null, null, false); + + // final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final WeakReadTrackingIterator iterator = new WeakReadTrackingIterator(nReadsPerLocus, readLength, payloadInBytes, header); + + li = new LocusIteratorByState(iterator, + createTestReadProperties(downsampler, false), + genomeLocParser, + samples); + + while ( li.hasNext() ) { + final AlignmentContext next = li.next(); + Assert.assertTrue(next.getBasePileup().getNumberOfElements() <= downsampleTo, "Too many elements in pileup " + next); + // TODO -- assert that there are <= X reads in memory after GC for some X + } + } + + private static class WeakReadTrackingIterator implements Iterator { + final int nReads, readLength, payloadInBytes; + int readI = 0; + final SAMFileHeader header; + + private WeakReadTrackingIterator(int nReads, int readLength, final 
int payloadInBytes, final SAMFileHeader header) { + this.nReads = nReads; + this.readLength = readLength; + this.header = header; + this.payloadInBytes = payloadInBytes; + } + + @Override public boolean hasNext() { return readI < nReads; } + @Override public void remove() { throw new UnsupportedOperationException("no remove"); } + + @Override + public GATKSAMRecord next() { + readI++; + return makeRead(); + } + + private GATKSAMRecord makeRead() { + final SAMReadGroupRecord rg = header.getReadGroups().get(0); + final String readName = String.format("%s.%d.%s", "read", readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, 1, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + if ( payloadInBytes > 0 ) + // add a payload byte array to push memory use per read even higher + read.setAttribute("PL", new byte[payloadInBytes]); + return read; + } + } } From 94800771e3c48e39fcef5280e5c31919031e8066 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Jan 2013 10:19:18 -0500 Subject: [PATCH 11/34] 1. Initial implementation of bam writing for the HaplotypeCaller with -bam argument; currently only assembled haplotypes are emitted. 2. Framework is set up in the VariantAnnotator for the HaplotypeCaller to be able to call in to annotate dbSNP plus comp RODs. Until the HC uses meta data though, this won't work. 
--- .../annotator/VariantAnnotatorEngine.java | 27 ++++++--- .../haplotypecaller/HaplotypeCaller.java | 57 ++++++++++++++++++- .../HaplotypeCallerIntegrationTest.java | 5 ++ .../broadinstitute/sting/utils/Haplotype.java | 12 ++++ 4 files changed, 92 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 99dadea54..f03a25c04 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -214,10 +215,10 @@ public class VariantAnnotatorEngine { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences - vc = annotateDBs(tracker, ref, vc, infoAnnotations); + vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations); // annotate expressions where available - annotateExpressions(tracker, ref, infoAnnotations); + annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { @@ -254,10 +255,22 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); } - private VariantContext 
annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { + public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) { + final Map newInfoAnnotations = new HashMap(0); + vc = annotateDBs(tracker, loc, vc, newInfoAnnotations); + + if ( !newInfoAnnotations.isEmpty() ) { + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(newInfoAnnotations); + vc = builder.make(); + } + + return vc; + } + + private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); // add the ID if appropriate if ( rsID != null ) { @@ -273,7 +286,7 @@ public class VariantAnnotatorEngine { } } else { boolean overlapsComp = false; - for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { + for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) { if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) { overlapsComp = true; break; @@ -287,9 +300,9 @@ public class VariantAnnotatorEngine { return vc; } - private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { + private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, ref.getLocus()); + Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) continue; diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 439a9b3b8..00db62bff 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,6 +47,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; +import com.sun.corba.se.impl.logging.UtilSystemException; +import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -57,6 +59,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -67,6 +70,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -142,6 +146,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", 
shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here + * does not include uninformative reads so that not every input read is emitted to the bam. + */ + @Hidden + @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + protected StingSAMFileWriter bamWriter = null; + private SAMFileHeader bamHeader = null; + private long uniqueNameCounter = 1; + private final String readGroupId = "ArtificialHaplotype"; + /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ @@ -242,6 +257,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the genotyping engine private GenotypingEngine genotypingEngine = null; + private VariantAnnotatorEngine annotationEngine = null; + // fasta reference reader to supplement the edges of the reference sequence private CachingIndexedFastaSequenceFile referenceReader; @@ -286,7 +303,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header - final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet(); @@ -320,6 +337,21 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, 
graphWriter, minKmer ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); + + if ( bamWriter != null ) { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + final List readGroups = new ArrayList(1); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + bamWriter.writeHeader(bamHeader); + bamWriter.setPresorted(true); + } } //--------------------------------------------------------------------------------------------------------------- @@ -408,7 +440,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; @@ -461,9 +493,30 @@ public class HaplotypeCaller extends ActiveRegionWalker implem activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { + annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } + if ( bamWriter != null ) { + Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); + final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); + for ( 
Haplotype haplotype : haplotypes ) { + // TODO -- clean up this code + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + } + if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..e39975ea0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -75,6 +75,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } + @Test(enabled = false) + public void testHaplotypeCallerSingleSampleWithDbsnp() { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); + } + @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", diff --git 
a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index efe9460cb..2706f2f99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -191,6 +191,10 @@ public class Haplotype { public static class HaplotypeBaseComparator implements Comparator, Serializable { @Override public int compare( final Haplotype hap1, final Haplotype hap2 ) { + return compareHaplotypeBases(hap1, hap2); + } + + public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) { final byte[] arr1 = hap1.getBases(); final byte[] arr2 = hap2.getBases(); // compares byte arrays using lexical ordering @@ -203,6 +207,14 @@ public class Haplotype { } } + public static class HaplotypePositionComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final int comp = hap1.getAlignmentStartHapwrtRef() - hap2.getAlignmentStartHapwrtRef(); + return comp == 0 ? 
HaplotypeBaseComparator.compareHaplotypeBases(hap1, hap2) : comp; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, From 3c37ea014b91a57e55e56b0ac93033f7e3597ac8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 15 Jan 2013 10:24:45 -0500 Subject: [PATCH 12/34] Retire original TraverseActiveRegion, leaving only the new optimized version -- Required some updates to MD5s, which was unexpected, and will be sorted out later with more detailed unit tests --- .../HaplotypeCallerIntegrationTest.java | 12 +- .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../arguments/GATKArgumentCollection.java | 5 - .../sting/gatk/executive/MicroScheduler.java | 7 +- .../traversals/TraverseActiveRegions.java | 214 ++++++- .../TraverseActiveRegionsOptimized.java | 253 --------- .../TraverseActiveRegionsOriginal.java | 262 --------- ...TraverseActiveRegionsOriginalUnitTest.java | 523 ------------------ ...ava => TraverseActiveRegionsUnitTest.java} | 6 +- 9 files changed, 214 insertions(+), 1070 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java rename public/java/test/org/broadinstitute/sting/gatk/traversals/{TraverseActiveRegionsOptimizedUnitTest.java => TraverseActiveRegionsUnitTest.java} (99%) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..780934c03 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -67,18 +67,18 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); + HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); + HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a"); } @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "c679ae7f04bdfda896b5c046d35e043c"); + "a938cdd7262968597fc8eb6c1c0a69f1"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a034b7eb572e1b6f659d6e5d57b3e76"); + "d590c8d6d5e58d685401b65a23846893"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -129,7 +129,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I 
%s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a5926aeae..f9d6955c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -842,7 +842,7 @@ public class GenomeAnalysisEngine { if (argCollection.keepProgramRecords) removeProgramRecords = false; - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART; + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; return new SAMDataSource( samReaderIDs, diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b6f0d5f90..ab09064dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -448,10 +448,5 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - @Hidden - @Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false) - public boolean newART = false; - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java 
b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c127899f6..371cce778 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -245,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - if ( engine.getArguments().newART ) { - // todo -- create optimized traversal - return new TraverseActiveRegionsOptimized(); - } else { - return new TraverseActiveRegionsOriginal(); - } + return new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 45dbb6dc8..03aaf95f2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; @@ -43,8 +44,7 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import 
java.util.LinkedList; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -53,7 +53,7 @@ import java.util.List; * Time: 4:45 PM * To change this template use File | Settings | File Templates. */ -public abstract class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { protected final static boolean DEBUG = false; // set by the tranversal @@ -66,14 +66,6 @@ public abstract class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); - abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker); - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public abstract T endTraversal(final Walker walker, T sum); - protected int getActiveRegionExtension() { return activeRegionExtension; } @@ -160,4 +152,204 @@ public abstract class TraverseActiveRegions extends TraversalEngine myReads = new LinkedList(); + private Shard lastShard = null; + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + final HashSet maybeDuplicatedReads = new HashSet(); + // TODO -- there's got to be a better way to know this + if ( lastShard != dataProvider.getShard() ) { + maybeDuplicatedReads.addAll(myReads); + logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); + if ( DEBUG ) logger.warn("Clearing myReads"); + } + lastShard = dataProvider.getShard(); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); 
+ + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); + for( final GATKSAMRecord read : reads ) { + notifyOfCurrentPosition(read); + // most of the time maybeDuplicatedReads is empty + // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the + // TODO -- potential list of duplicates we can clear the hashset + if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); + } else { + if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); + myReads.add((GATKSAMRecord)read); + } + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. 
Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." 
); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, false); + + return sum; + } + + private GenomeLoc startOfLiveRegion = null; + + protected void notifyOfCurrentPosition(final GATKSAMRecord read) { + notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); + } + + protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { + if ( startOfLiveRegion == null ) + startOfLiveRegion = currentLocation; + else + startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); + } + + protected GenomeLoc getStartOfLiveRegion() { + return startOfLiveRegion; + } + + protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { + return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) + || ! region.onSameContig(getStartOfLiveRegion()); + } + + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); + } + } + + private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { + final ActiveRegion activeRegion = workQueue.remove(); + if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); + sum = processActiveRegion( activeRegion, sum, walker ); + } else { + break; + } + } + + return sum; + } + + @Override + public String toString() { + return "TraverseActiveRegions"; + } + + private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { + return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); + } + + protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { + final Iterator liveReads = myReads.iterator(); + while ( liveReads.hasNext() ) { + boolean killed = false; + final GATKSAMRecord read = liveReads.next(); + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + activeRegion.add(read); + + if ( ! walker.wantsNonPrimaryReads() ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + killed = true; + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + + if ( ! 
killed && readIsDead(read, readLoc, activeRegion) ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + } + } + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal(final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java deleted file mode 100644 index 809c7ea6a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOptimized extends TraverseActiveRegions { - private LinkedList myReads = new LinkedList(); - private Shard lastShard = null; - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); - for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is 
empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { - if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); - } else { - if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); - } - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! 
profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, false); - - return sum; - } - - private GenomeLoc startOfLiveRegion = null; - - protected void notifyOfCurrentPosition(final GATKSAMRecord read) { - notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); - } - - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); - } - - protected GenomeLoc getStartOfLiveRegion() { - return startOfLiveRegion; - } - - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! 
region.onSameContig(getStartOfLiveRegion()); - } - - private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); - } - } - - private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { - final ActiveRegion activeRegion = workQueue.remove(); - if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - public String toString() { - return "TraverseActiveRegionsOptimized"; - } - - private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { - return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); - } - - @Override - protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { - final Iterator liveReads = myReads.iterator(); - while ( liveReads.hasNext() ) { - boolean killed = false; - final GATKSAMRecord read = liveReads.next(); - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - activeRegion.add(read); - - if ( ! 
walker.wantsNonPrimaryReads() ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - killed = true; - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - - if ( ! killed && readIsDead(read, readLoc, activeRegion) ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - } - } - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); - } - - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - @Override - public T endTraversal(final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java deleted file mode 100644 index 0786bc800..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the 
Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOriginal extends TraverseActiveRegions { - private final LinkedHashSet myReads = new LinkedHashSet(); - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - - int minStart = Integer.MAX_VALUE; - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment 
start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! 
profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); - - return sum; - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - * - * @param profile - * @param activeRegions - * @param activeRegionExtension - * @param maxRegionSize - * @return - */ - private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, - final List activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); - return new ActivityProfile( engine.getGenomeLocParser(), 
profile.hasPresetRegions() ); - } - - // -------------------------------------------------------------------------------- - // - // code to handle processing active regions - // - // -------------------------------------------------------------------------------- - - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); - } - } - - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; - for( final ActiveRegion otherRegionToTest : workQueue ) { 
- if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); - bestRegion = otherRegionToTest; - } - } - bestRegion.add( read ); - - // The read is also added to all other regions in which it overlaps but marked as non-primary - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } - } - } - } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map( activeRegion, null ); - return walker.reduce( x, sum ); - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. 
- * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker) walker, sum, Integer.MAX_VALUE, null); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java deleted file mode 100644 index 35a0931df..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.traversals; - -import com.google.java.contract.PreconditionError; -import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -/** - 
* Created with IntelliJ IDEA. - * User: depristo - * Date: 1/10/13 - * Time: 8:03 PM - * To change this template use File | Settings | File Templates. - */ -public class TraverseActiveRegionsOriginalUnitTest extends BaseTest { - - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - private final TraverseActiveRegions t = new TraverseActiveRegionsOriginal(); - - private IndexedFastaSequenceFile reference; - private SAMSequenceDictionary dictionary; - private GenomeLocParser genomeLocParser; - - private List intervals; - - private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; - private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; - - @BeforeClass - private void init() throws FileNotFoundException { - reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - dictionary = reference.getSequenceDictionary(); - genomeLocParser = new 
GenomeLocParser(dictionary); - - // TODO: reads with indels - // TODO: reads which span many regions - // TODO: reads which are partially between intervals (in/outside extension) - // TODO: duplicate reads - // TODO: read at the end of a contig - // TODO: reads which are completely outside intervals but within extension - // TODO: test the extension itself - // TODO: unmapped reads - - intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); - intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); - - List reads = new ArrayList(); - reads.add(buildSAMRecord("simple", "1", 100, 200)); - reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); - reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); - reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); - reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); - reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); - reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); - reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); - reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); - - createBAM(reads); - } - - private void createBAM(List reads) { - File outFile = new File(testBAM); - 
outFile.deleteOnExit(); - File indexFile = new File(testBAI); - indexFile.deleteOnExit(); - - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); - for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { - out.addAlignment(read); - } - out.close(); - } - - @Test - public void testAllBasesSeen() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - List activeIntervals = getIsActiveIntervals(walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - } - - private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { - List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); - } - - return activeIntervals; - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeLow () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); - getActiveRegions(walker, intervals).values(); - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeHigh () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); - getActiveRegions(walker, intervals).values(); - } - - @Test - public void testActiveRegionCoverage() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - verifyActiveRegionCoverage(intervals, activeRegions); - } - - private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { - List intervalStarts = new ArrayList(); - List intervalStops = new ArrayList(); - - for (GenomeLoc interval : intervals) { - intervalStarts.add(interval.getStartLocation()); - 
intervalStops.add(interval.getStopLocation()); - } - - Map baseRegionMap = new HashMap(); - - for (ActiveRegion activeRegion : activeRegions) { - for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { - // Contract: Regions do not overlap - Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); - baseRegionMap.put(activeLoc, activeRegion); - } - - GenomeLoc start = activeRegion.getLocation().getStartLocation(); - if (intervalStarts.contains(start)) - intervalStarts.remove(start); - - GenomeLoc stop = activeRegion.getLocation().getStopLocation(); - if (intervalStops.contains(stop)) - intervalStops.remove(stop); - } - - for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { - // Contract: Each location in the interval(s) is in exactly one region - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); - baseRegionMap.remove(baseLoc); - } - - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); - - // Contract: All explicit interval boundaries must also be region boundaries - Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); - Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); - } - - @Test - public void testActiveRegionExtensionOnContig() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - for (ActiveRegion activeRegion : activeRegions) { - GenomeLoc loc = activeRegion.getExtendedLoc(); - - // Contract: active region extensions must stay on the contig - 
Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); - int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); - Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); - } - } - - @Test - public void testPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", 
"boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testNonPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testExtendedReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // 
shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testUnmappedReads() { - // TODO - } - - private void verifyReadMapping(ActiveRegion region, String... 
reads) { - Collection wantReads = new ArrayList(Arrays.asList(reads)); - for (SAMRecord read : region.getReads()) { - String regionReadName = read.getReadName(); - Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region); - wantReads.remove(regionReadName); - } - - Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region); - } - - private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) - t.traverse(walker, dataProvider, 0); - - t.endTraversal(walker, 0); - - return walker.mappedActiveRegions; - } - - private Collection toSingleBaseLocs(GenomeLoc interval) { - List bases = new ArrayList(); - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - - return bases; - } - - private Collection toSingleBaseLocs(List intervals) { - Set bases = new TreeSet(); // for sorting and uniqueness - for (GenomeLoc interval : intervals) - bases.addAll(toSingleBaseLocs(interval)); - - return bases; - } - - private void verifyEqualIntervals(List aIntervals, List bIntervals) { - Collection aBases = toSingleBaseLocs(aIntervals); - Collection bBases = toSingleBaseLocs(bIntervals); - - Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); - - Iterator aIter = aBases.iterator(); - Iterator bIter = bBases.iterator(); - while (aIter.hasNext() && bIter.hasNext()) { - GenomeLoc aLoc = aIter.next(); - GenomeLoc bLoc = bIter.next(); - Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. 
" + bLoc); - } - } - - // copied from LocusViewTemplate - protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { - SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); - header.setSequenceDictionary(dictionary); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(dictionary.getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - - Cigar cigar = new Cigar(); - int len = alignmentEnd - alignmentStart + 1; - cigar.add(new CigarElement(len, CigarOperator.M)); - record.setCigar(cigar); - record.setReadString(new String(new char[len]).replace("\0", "A")); - record.setBaseQualities(new byte[len]); - - return record; - } - - private List createDataProviders(final Walker walker, List intervals, String bamFile) { - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); - - Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); - samFiles.add(readerID); - - SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } - } - - return providers; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 038cd2853..c4dadbcce 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -76,7 +76,7 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { +public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; @@ -131,7 +131,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); - traversals.add(new Object[]{new TraverseActiveRegionsOptimized()}); + traversals.add(new Object[]{new TraverseActiveRegions()}); return traversals.toArray(new Object[][]{}); } @@ -537,7 +537,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, t instanceof TraverseActiveRegionsOptimized); + false, (byte)30, false, true); List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { From d3baa4b8cac086fb433b2e408e1165713b9ff0b7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Jan 2013 11:36:20 -0500 Subject: [PATCH 13/34] Have Haplotype extend the Allele class. 
This way, we don't need to create a new Allele for every read/Haplotype pair to be placed in the PerReadAlleleLikelihoodMap (very inefficient). Also, now we can easily get the Haplotype associated with the best allele for a given read. --- .../haplotypecaller/HaplotypeCaller.java | 87 ++++++++++++------- .../LikelihoodCalculationEngine.java | 3 +- .../broadinstitute/sting/utils/Haplotype.java | 64 +++++++------- .../variant/variantcontext/Allele.java | 4 +- 4 files changed, 93 insertions(+), 65 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 00db62bff..04da91f65 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.sun.corba.se.impl.logging.UtilSystemException; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -155,7 +154,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected StingSAMFileWriter bamWriter = null; private SAMFileHeader bamHeader = null; private long uniqueNameCounter = 1; - private final String readGroupId = "ArtificialHaplotype"; + private final static String readGroupId = "ArtificialHaplotype"; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
@@ -338,20 +337,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); - if ( bamWriter != null ) { - // prepare the bam header - bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); - final List readGroups = new ArrayList(1); - final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - bamWriter.writeHeader(bamHeader); - bamWriter.setPresorted(true); - } + if ( bamWriter != null ) + setupBamWriter(); } //--------------------------------------------------------------------------------------------------------------- @@ -461,8 +448,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! 
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region - referenceHaplotype.setIsReference(true); + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); @@ -498,22 +484,19 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { + // sort the haplotypes in coordinate order and then write them to the bam Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); - for ( Haplotype haplotype : haplotypes ) { - // TODO -- clean up this code - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(haplotype.getCigar()); - record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 
60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), readGroupId); - record.setFlags(16); - bamWriter.addAlignment(record); + for ( Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + + // now, output the interesting reads for each sample + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), (Haplotype)bestAllele); + } } } @@ -608,6 +591,46 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } return returnMap; + } + + private void setupBamWriter() { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + + bamWriter.writeHeader(bamHeader); + bamWriter.setPresorted(true); + } + + private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + 
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + + private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype) { + + + + } /* diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8b844817d..e05ad85a9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -138,6 +138,7 @@ public class LikelihoodCalculationEngine { readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated + // TODO -- why is Q18 hard-coded here??? readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); } @@ -151,7 +152,7 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()), + perReadAlleleLikelihoodMap.add(read, haplotype, pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 2706f2f99..4830bf053 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -37,59 +37,71 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; -public class Haplotype { - protected final byte[] bases; +public class Haplotype extends Allele { protected final double[] quals; private GenomeLoc genomeLocation = null; private HashMap eventMap = null; - private boolean isRef = false; private Cigar cigar; private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; private Event artificialEvent = null; + /** + * Main constructor + * + * @param bases bases + * @param quals quals + * @param isRef is reference allele? 
+ */ + public Haplotype( final byte[] bases, final double[] quals, final boolean isRef ) { + super(bases.clone(), isRef); + this.quals = quals.clone(); + } + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * * @param bases bases * @param qual qual */ - public Haplotype( final byte[] bases, final int qual ) { - this.bases = bases.clone(); + public Haplotype( final byte[] bases, final int qual, final boolean isRef ) { + super(bases.clone(), isRef); quals = new double[bases.length]; Arrays.fill(quals, (double)qual); } + public Haplotype( final byte[] bases, final int qual ) { + this(bases, qual, false); + } + + public Haplotype( final byte[] bases, final boolean isRef ) { + this(bases, 0, isRef); + } + public Haplotype( final byte[] bases, final double[] quals ) { - this.bases = bases.clone(); - this.quals = quals.clone(); + this(bases, quals, false); } public Haplotype( final byte[] bases ) { - this(bases, 0); + this(bases, 0, false); } protected Haplotype( final byte[] bases, final Event artificialEvent ) { - this(bases, 0); + this(bases, 0, false); this.artificialEvent = artificialEvent; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases); + this(bases, 0, false); this.genomeLocation = loc; } @Override public boolean equals( Object h ) { - return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); + return h instanceof Haplotype && super.equals(h); } - @Override - public int hashCode() { - return Arrays.hashCode(bases); - } - public HashMap getEventMap() { return eventMap; } @@ -98,17 +110,9 @@ public class Haplotype { this.eventMap = eventMap; } - public boolean isReference() { - return isRef; - } - - public void setIsReference( boolean isRef ) { - this.isRef = isRef; - } - public double getQualitySum() { double s = 0; - for (int k=0; k < bases.length; k++) { + for (int k=0; k < quals.length; k++) { s += quals[k]; } return s; @@ -116,14 +120,14 @@ public class 
Haplotype { @Override public String toString() { - return new String(bases); + return getDisplayString(); } public double[] getQuals() { return quals.clone(); } public byte[] getBases() { - return bases.clone(); + return super.getBases().clone(); } public long getStartPosition() { @@ -178,13 +182,13 @@ public class Haplotype { public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); - if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype + if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= getBases().length ) { // desired change falls inside deletion so don't bother creating a new haplotype return null; } byte[] newHaplotypeBases = new byte[]{}; - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), haplotypeInsertLocation + refAllele.length(), getBases().length)); // bases after the variant return new Haplotype(newHaplotypeBases, new Event(refAllele, 
altAllele, genomicInsertLocation)); } diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java index 33bca1a8a..0a0b4d0b7 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java @@ -111,7 +111,7 @@ public class Allele implements Comparable { /** A generic static NO_CALL allele for use */ // no public way to create an allele - private Allele(byte[] bases, boolean isRef) { + protected Allele(byte[] bases, boolean isRef) { // null alleles are no longer allowed if ( wouldBeNullAllele(bases) ) { throw new IllegalArgumentException("Null alleles are not supported"); @@ -140,7 +140,7 @@ public class Allele implements Comparable { throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } - private Allele(String bases, boolean isRef) { + protected Allele(String bases, boolean isRef) { this(bases.getBytes(), isRef); } From 0d282a7750df16b154f87cc83e391188c70e1dab Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 00:12:02 -0500 Subject: [PATCH 15/34] Bam writing from HaplotypeCaller seems to be working on all my test cases. Note that it's a hidden debugging option for now. Please let me know if you notice any bad behavior with it. 
--- .../haplotypecaller/HaplotypeCaller.java | 60 +++++++++++++++++-- .../LikelihoodCalculationEngine.java | 4 -- .../broadinstitute/sting/utils/Haplotype.java | 8 --- 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 04da91f65..4da2e1179 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -484,18 +484,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { - // sort the haplotypes in coordinate order and then write them to the bam - Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); + // write the haplotypes to the bam final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); for ( Haplotype haplotype : haplotypes ) writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); - // now, output the interesting reads for each sample + // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), (Haplotype)bestAllele); + writeReadAgainstHaplotype(entry.getKey(), (Haplotype) bestAllele, paddedRefLoc.getStart()); } } } @@ -607,8 +606,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem readGroups.add(rg); bamHeader.setReadGroups(readGroups); + bamWriter.setPresorted(false); bamWriter.writeHeader(bamHeader); - 
bamWriter.setPresorted(true); } private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { @@ -626,11 +625,60 @@ public class HaplotypeCaller extends ActiveRegionWalker implem bamWriter.addAlignment(record); } - private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype) { + private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) { + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2); + final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1(); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar()); + read.setCigar(cigar); + bamWriter.addAlignment(read); + } + private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) { + + int currentReadPos = 0; + int currentHapPos = 0; + final List readCigarElements = new ArrayList(); + + for ( final CigarElement cigarElement : haplotypeCigar.getCigarElements() ) { + + if ( cigarElement.getOperator() == CigarOperator.D ) { + if ( currentReadPos > 0 ) + readCigarElements.add(cigarElement); + } else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) { + + final int elementLength = cigarElement.getLength(); + final int nextReadPos = currentReadPos + elementLength; + final int nextHapPos = currentHapPos + elementLength; + + // do we want this element? + if ( currentReadPos > 0 ) { + // do we want the entire element? 
+ if ( nextReadPos < read.getReadLength() ) { + readCigarElements.add(cigarElement); + currentReadPos = nextReadPos; + } + // otherwise, we can finish up and return the cigar + else { + readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator())); + return new Cigar(readCigarElements); + } + } + // do we want part of the element to start? + else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) { + currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength()); + readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator())); + } + + currentHapPos = nextHapPos; + } + } + + return new Cigar(readCigarElements); } /* diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index e05ad85a9..aafdbf126 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -145,10 +145,6 @@ public class LikelihoodCalculationEngine { for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); - // TODO -- need to test against a reference/position with non-standard bases - //if ( !Allele.acceptableAlleleBases(haplotype.getBases(), false) ) - // continue; - final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 4830bf053..8c40b9972 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -211,14 +211,6 @@ public class Haplotype extends Allele { } } - public static class HaplotypePositionComparator implements Comparator, Serializable { - @Override - public int compare( final Haplotype hap1, final Haplotype hap2 ) { - final int comp = hap1.getAlignmentStartHapwrtRef() - hap2.getAlignmentStartHapwrtRef(); - return comp == 0 ? HaplotypeBaseComparator.compareHaplotypeBases(hap1, hap2) : comp; - } - } - public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, From 392b5cbcdfd5200f04d0b26f9e73d16399e17769 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 10:22:43 -0500 Subject: [PATCH 16/34] The CachingIndexedFastaSequenceFile now automatically converts IUPAC bases to Ns and errors out on other non-standard bases. This way walkers won't see anything except the standard bases plus Ns in the reference. Added option to turn off this feature (to maintain backwards compatibility). As part of this commit I cleaned up the BaseUtils code by adding a Base enum and removing all of the static indexes for each of the bases. This uncovered a bug in the way the DepthOfCoverage walker counts deletions (it was counting Ns instead!) that isn't covered by tests. Fortunately that walker is being deprecated soon... 
--- .../gatk/walkers/annotator/GCContent.java | 4 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../coverage/DepthOfCoverageStats.java | 2 +- .../validation/ValidationAmplicons.java | 2 +- .../ConcordanceMetricsUnitTest.java | 48 ++++----- .../gatk/walkers/coverage/CoverageUtils.java | 6 +- .../walkers/coverage/GCContentByInterval.java | 2 +- .../CachingIndexedFastaSequenceFile.java | 44 +++++--- .../sting/utils/pileup/PileupElement.java | 3 +- .../sting/utils/sam/AlignmentUtils.java | 9 +- .../variant/utils/BaseUtils.java | 102 ++++++++++++------ .../variant/utils/BaseUtilsUnitTest.java | 15 +++ .../GenotypeLikelihoodsUnitTest.java | 6 +- 13 files changed, 153 insertions(+), 92 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 3bb3d7d5a..2b3290595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -95,9 +95,9 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota for ( byte base : ref.getBases() ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex ) + if ( baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal() ) gc++; - else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex ) + else if ( baseIndex == BaseUtils.Base.A.ordinal() || baseIndex == BaseUtils.Base.T.ordinal() ) at++; else ; // ignore diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 1e4c55e0d..b10daab58 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -938,7 +938,7 @@ public class DepthOfCoverage extends LocusWalker { if ( lowerCaseSNPs ) { sequence.append(Character.toLowerCase((char) ref.getBase())); } else { - sequence.append((char) BaseUtils.N); + sequence.append((char) BaseUtils.Base.N.base); } rawSequence.append(Character.toUpperCase((char) ref.getBase())); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 28f128dd3..6db44efd5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -111,8 +111,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData1() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); @@ -160,9 +160,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData2() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = 
GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); @@ -213,10 +213,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData3() { - Allele reference_ACT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C,BaseUtils.T},true); - Allele alt_AC = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C}); - Allele alt_A = Allele.create(BaseUtils.A); - Allele alt_ATT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.T,BaseUtils.T}); + Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); + Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); + Allele alt_A = Allele.create(BaseUtils.Base.A.base); + Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); @@ -267,9 +267,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData4() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); @@ -316,9 +316,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData5() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = 
Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); @@ -368,8 +368,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private List> getData6() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); // site 1 - @@ -396,8 +396,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - reference_A = Allele.create(BaseUtils.A,true); - Allele alt_T = Allele.create(BaseUtils.T); + reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); // site 2 - // sample 1: no-call/hom-ref @@ -421,7 +421,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - Allele alt_G = Allele.create(BaseUtils.G); + Allele alt_G = Allele.create(BaseUtils.Base.G.base); // site 3 - // sample 1: alleles do not match @@ -605,10 +605,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { public List> getData7() { - Allele ref1 = Allele.create(BaseUtils.T,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.G); - Allele alt3 = Allele.create(BaseUtils.A); + Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.G.base); + Allele alt3 = Allele.create(BaseUtils.Base.A.base); GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); VariantContextBuilder site1Eval = new 
VariantContextBuilder(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index 573291d06..fe2eee2a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -217,9 +217,9 @@ public class CoverageUtils { private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); - } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); + counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); + } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { + counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); } else { try { counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java index 9cd1be2d9..668d3fd5f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java @@ -86,7 +86,7 @@ public class GCContentByInterval extends LocusWalker { if (tracker == null) return null; int baseIndex = ref.getBaseIndex(); - return (baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex) ? 1L : 0L; + return (baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal()) ? 
1L : 0L; } public Long reduce(Long toAdd, Long runningCount) { diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 3d43d5d4d..88eaa8910 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -33,6 +33,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; @@ -41,9 +42,10 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a thread-local cache + * Thread-safe! Uses a thread-local cache. * - * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. + * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. 
*/ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -64,10 +66,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private final long cacheMissBackup; /** - * If true, we will preserve the case of the original base in the genome, not + * If true, we will preserve the case of the original base in the genome */ private final boolean preserveCase; + /** + * If true, we will preserve the IUPAC bases in the genome + */ + private final boolean preserveIUPAC; + // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; @@ -97,13 +104,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param index the index of the fasta file, used for efficient random access * @param cacheSize the size in bp of the cache we will use for this reader * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; } /** @@ -122,19 +131,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 
1000, 1); this.preserveCase = preserveCase; + preserveIUPAC = false; } -// /** -// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. -// * -// * @param fasta The file to open. -// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. -// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. -// */ -// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { -// this(fasta, index, DEFAULT_CACHE_SIZE); -// } - /** * Same as general constructor but allows one to override the default cacheSize * @@ -145,7 +144,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size in bp of the cache we will use for this reader */ public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { - this(fasta, index, cacheSize, false); + this(fasta, index, cacheSize, false, false); } /** @@ -240,6 +239,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return ! isPreservingCase(); } + /** + * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? + * + * @return true if the IUPAC bases coming from this reader are not modified + */ + public boolean isPreservingIUPAC() { + return preserveIUPAC; + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -261,8 +269,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); + if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); } else { - // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true + // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); if (stop > contigInfo.getSequenceLength()) @@ -276,6 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index c0e18f227..5a5358208 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -52,7 +51,7 @@ public class PileupElement implements Comparable { private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); - public static final byte DELETION_BASE = BaseUtils.D; + public static final byte DELETION_BASE = BaseUtils.Base.D.base; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; public static final byte 
C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 0907a0239..b7a813ec2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -402,13 +401,13 @@ public class AlignmentUtils { switch (ce.getOperator()) { case I: if (alignPos > 0) { - if (alignment[alignPos - 1] == BaseUtils.A) { + if (alignment[alignPos - 1] == BaseUtils.Base.A.base) { alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.C) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.C.base) { alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.T) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.T.base) { alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.G) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.G.base) { alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 819041a3e..7a37e8de5 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ 
b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -26,6 +26,7 @@ package org.broadinstitute.variant.utils; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.Random; @@ -34,42 +35,66 @@ import java.util.Random; * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte) 'A'; - public final static byte C = (byte) 'C'; - public final static byte G = (byte) 'G'; - public final static byte T = (byte) 'T'; - public final static byte N = (byte) 'N'; - public final static byte D = (byte) 'D'; + public enum Base { + A ((byte)'A'), + C ((byte)'C'), + G ((byte)'G'), + T ((byte)'T'), + N ((byte)'N'), + D ((byte)'D'); - // - // todo -- we need a generalized base abstraction using the Base enum. - // + public byte base; + + private Base(final byte base) { + this.base = base; + } + } + + // todo -- add this to the generalized base abstraction using the Base enum. public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); - baseIndexMap['A'] = 0; - baseIndexMap['a'] = 0; - baseIndexMap['*'] = 0; // the wildcard character counts as an A - baseIndexMap['C'] = 1; - baseIndexMap['c'] = 1; - baseIndexMap['G'] = 2; - baseIndexMap['g'] = 2; - baseIndexMap['T'] = 3; - baseIndexMap['t'] = 3; + baseIndexMap['A'] = Base.A.ordinal(); + baseIndexMap['a'] = Base.A.ordinal(); + baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A + baseIndexMap['C'] = Base.C.ordinal(); + baseIndexMap['c'] = Base.C.ordinal(); + baseIndexMap['G'] = Base.G.ordinal(); + baseIndexMap['g'] = Base.G.ordinal(); + baseIndexMap['T'] = Base.T.ordinal(); + baseIndexMap['t'] = Base.T.ordinal(); } - // todo -- fix me (enums?) 
- public static final byte DELETION_INDEX = 4; - public static final byte NO_CALL_INDEX = 5; // (this is 'N') - - public static final int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); - public static final int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); - public static final int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); - public static final int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); + static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); + static { + baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad + baseIndexWithIupacMap['N'] = Base.N.ordinal(); + baseIndexWithIupacMap['n'] = Base.N.ordinal(); + baseIndexWithIupacMap['R'] = Base.N.ordinal(); + baseIndexWithIupacMap['r'] = Base.N.ordinal(); + baseIndexWithIupacMap['Y'] = Base.N.ordinal(); + baseIndexWithIupacMap['y'] = Base.N.ordinal(); + baseIndexWithIupacMap['M'] = Base.N.ordinal(); + baseIndexWithIupacMap['m'] = Base.N.ordinal(); + baseIndexWithIupacMap['K'] = Base.N.ordinal(); + baseIndexWithIupacMap['k'] = Base.N.ordinal(); + baseIndexWithIupacMap['W'] = Base.N.ordinal(); + baseIndexWithIupacMap['w'] = Base.N.ordinal(); + baseIndexWithIupacMap['S'] = Base.N.ordinal(); + baseIndexWithIupacMap['s'] = Base.N.ordinal(); + baseIndexWithIupacMap['B'] = Base.N.ordinal(); + baseIndexWithIupacMap['b'] = Base.N.ordinal(); + baseIndexWithIupacMap['D'] = Base.N.ordinal(); + baseIndexWithIupacMap['d'] = Base.N.ordinal(); + baseIndexWithIupacMap['H'] = Base.N.ordinal(); + baseIndexWithIupacMap['h'] = Base.N.ordinal(); + baseIndexWithIupacMap['V'] = Base.N.ordinal(); + baseIndexWithIupacMap['v'] = Base.N.ordinal(); + } // Use a fixed random seed to allow for deterministic results when using random bases private static final Random randomNumberGen = new Random(47382911L); @@ -96,10 +121,10 @@ public class BaseUtils { } public static boolean isTransition(byte base1, byte base2) { - int b1 = simpleBaseToBaseIndex(base1); - int b2 = 
simpleBaseToBaseIndex(base2); - return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + final int b1 = simpleBaseToBaseIndex(base1); + final int b2 = simpleBaseToBaseIndex(base2); + return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || + b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); } public static boolean isTransversion(byte base1, byte base2) { @@ -141,6 +166,19 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { + final int length = bases.length; + for ( int i = 0; i < length; i++ ) { + final int baseIndex = baseIndexWithIupacMap[bases[i]]; + if ( baseIndex == Base.N.ordinal() ) { + bases[i] = 'N'; + } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + } + } + return bases; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -231,10 +269,10 @@ public class BaseUtils { switch (base) { case 'd': case 'D': - return DELETION_INDEX; + return Base.D.ordinal(); case 'n': case 'N': - return NO_CALL_INDEX; + return Base.N.ordinal(); default: return simpleBaseToBaseIndex(base); diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 372d13a7a..4f918f718 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -50,6 +50,21 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); } + @Test + public void testConvertIUPACtoN() { + + 
checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false), new byte[]{'A', 'N', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false), new byte[]{'N', 'N', 'N'}); + } + + private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { + for ( int i = 0; i < b1.length; i++ ) + Assert.assertEquals(b1[i], b2[i]); + } + @Test public void testTransitionTransversion() { logger.warn("Executing testTransitionTransversion"); diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java index 49720d1f6..03d6f457f 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -154,9 +154,9 @@ public class GenotypeLikelihoodsUnitTest { public void testGetQualFromLikelihoodsMultiAllelic() { GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create(BaseUtils.A,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.T); + Allele ref = Allele.create(BaseUtils.Base.A.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.T.base); List allAlleles = Arrays.asList(ref,alt1,alt2); List gtAlleles = Arrays.asList(alt1,alt2); GenotypeBuilder gtBuilder = new GenotypeBuilder(); From 445735a4a53f22a5ddd214b95f8e0f4eed8bd593 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 11:10:13 
-0500 Subject: [PATCH 17/34] There was no reason to be sharing the Haplotype infrastructure between the HaplotypeCaller and the HaplotypeScore annotation since they were really looking for different things. Separated them out, adding efficiencies for the HaplotypeScore version. --- .../walkers/annotator/HaplotypeScore.java | 57 +++++++++++++++---- .../SimpleDeBruijnAssembler.java | 2 +- .../broadinstitute/sting/utils/Haplotype.java | 49 ++-------------- 3 files changed, 52 insertions(+), 56 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index fe4075117..af6304297 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -217,14 +216,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); List hlist = new ArrayList(); - hlist.add(new Haplotype(haplotype1.getBases(), 60)); + hlist.add(new Haplotype(haplotype1.getBases(), (byte)60)); for (int k = 1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); if (haplotype2 == null) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - hlist.add(new 
Haplotype(haplotype2.getBases(), 20)); + hlist.add(new Haplotype(haplotype2.getBases(), (byte)20)); } return hlist; } else @@ -236,8 +235,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); - final double[] baseQualities = new double[contextSize]; - Arrays.fill(baseQualities, 0.0); + final byte[] baseQualities = new byte[contextSize]; + Arrays.fill(baseQualities, (byte)0); byte[] readBases = read.getReadBases(); readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string @@ -267,7 +266,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot readQuals[baseOffset] = (byte) 0; } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; - baseQualities[i] = (double) readQuals[baseOffset]; + baseQualities[i] = readQuals[baseOffset]; } return new Haplotype(haplotypeBases, baseQualities); @@ -286,10 +285,10 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final int length = a.length; final byte[] consensusChars = new byte[length]; - final double[] consensusQuals = new double[length]; + final byte[] consensusQuals = new byte[length]; - final double[] qualsA = haplotypeA.getQuals(); - final double[] qualsB = haplotypeB.getQuals(); + final byte[] qualsA = haplotypeA.getQuals(); + final byte[] qualsB = haplotypeB.getQuals(); for (int i = 0; i < length; i++) { chA = a[i]; @@ -300,7 +299,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if ((chA == wc) && (chB == wc)) { consensusChars[i] = wc; - consensusQuals[i] = 0.0; + consensusQuals[i] = 0; } else if ((chA == wc)) { consensusChars[i] = chB; consensusQuals[i] = qualsB[i]; @@ -309,7 +308,7 @@ public class HaplotypeScore extends 
InfoFieldAnnotation implements StandardAnnot consensusQuals[i] = qualsA[i]; } else { consensusChars[i] = chA; - consensusQuals[i] = qualsA[i] + qualsB[i]; + consensusQuals[i] = (byte)((int)qualsA[i] + (int)qualsB[i]); } } @@ -433,7 +432,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } @@ -441,4 +439,39 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } + + private static class Haplotype { + private final byte[] bases; + private final byte[] quals; + private int qualitySum = -1; + + public Haplotype( final byte[] bases, final byte[] quals ) { + this.bases = bases; + this.quals = quals; + } + + public Haplotype( final byte[] bases, final byte qual ) { + this.bases = bases; + quals = new byte[bases.length]; + Arrays.fill(quals, qual); + } + + public double getQualitySum() { + if ( qualitySum == -1 ) { + qualitySum = 0; + for ( final byte qual : quals ) { + qualitySum += (int)qual; + } + } + return qualitySum; + } + + public byte[] getQuals() { + return quals.clone(); + } + + public byte[] getBases() { + return bases.clone(); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index e1a94eee7..e16994fa4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -338,7 +338,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( 
final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { - final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); + final Haplotype h = new Haplotype( path.getBases( graph ) ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 8c40b9972..66aed1173 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -37,8 +37,8 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; -public class Haplotype extends Allele { - protected final double[] quals; +public class Haplotype extends Allele { + private GenomeLoc genomeLocation = null; private HashMap eventMap = null; private Cigar cigar; @@ -51,49 +51,23 @@ public class Haplotype extends Allele { * Main constructor * * @param bases bases - * @param quals quals * @param isRef is reference allele? 
*/ - public Haplotype( final byte[] bases, final double[] quals, final boolean isRef ) { - super(bases.clone(), isRef); - this.quals = quals.clone(); - } - - /** - * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual - * - * @param bases bases - * @param qual qual - */ - public Haplotype( final byte[] bases, final int qual, final boolean isRef ) { - super(bases.clone(), isRef); - quals = new double[bases.length]; - Arrays.fill(quals, (double)qual); - } - - public Haplotype( final byte[] bases, final int qual ) { - this(bases, qual, false); - } - public Haplotype( final byte[] bases, final boolean isRef ) { - this(bases, 0, isRef); - } - - public Haplotype( final byte[] bases, final double[] quals ) { - this(bases, quals, false); + super(bases.clone(), isRef); } public Haplotype( final byte[] bases ) { - this(bases, 0, false); + this(bases, false); } protected Haplotype( final byte[] bases, final Event artificialEvent ) { - this(bases, 0, false); + this(bases, false); this.artificialEvent = artificialEvent; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases, 0, false); + this(bases, false); this.genomeLocation = loc; } @@ -110,22 +84,11 @@ public class Haplotype extends Allele { this.eventMap = eventMap; } - public double getQualitySum() { - double s = 0; - for (int k=0; k < quals.length; k++) { - s += quals[k]; - } - return s; - } - @Override public String toString() { return getDisplayString(); } - public double[] getQuals() { - return quals.clone(); - } public byte[] getBases() { return super.getBases().clone(); } From 4ffb43079f020e1a1ed3dc2fffc02a1bf660e01d Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 16 Jan 2013 12:43:15 -0500 Subject: [PATCH 18/34] Re-committing the following changes from Dec 18: Refactored interval specific arguments out of GATKArgumentCollection into InvtervalArgumentCollection such that it can be used in other CommandLinePrograms. 
Updated SelectHeaders to print out full interval arguments. Added RemoteFile.createUrl(Date expiration) to enable creation of presigned URLs for download over http: or file:. --- .../walkers/variantutils/SelectHeaders.java | 51 +++++++++++--- .../IntervalArgumentCollection.java | 70 +++++++++++++++++++ .../sting/gatk/GenomeAnalysisEngine.java | 38 +--------- .../arguments/GATKArgumentCollection.java | 45 +----------- .../sting/utils/interval/IntervalUtils.java | 42 +++++++++++ .../broadinstitute/variant/vcf/VCFHeader.java | 4 ++ .../utils/interval/IntervalUtilsUnitTest.java | 4 +- .../sting/queue/util/RemoteFile.scala | 3 + 8 files changed, 166 insertions(+), 91 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java index 81a17b6ae..38fa060cc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker implements TreeRe headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); // Optionally add in the intervals. 
- if (includeIntervals && getToolkit().getArguments().intervals != null) { - for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { - String source = intervalBinding.getSource(); - if (source == null) - continue; - File file = new File(source); - if (file.exists()) { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); - } else { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + if (includeIntervals) { + IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments; + if (intervalArguments.intervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } } } + + if (intervalArguments.excludeIntervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.excludeIntervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source)); + } + } + } + + if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging))); + } + + if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule))); + } + + if (intervalArguments.intervalPadding != 0) { + 
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding))); + } } TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java new file mode 100644 index 000000000..3f76ae652 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; + +import java.util.List; + +public class IntervalArgumentCollection { + /** + * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). + * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). + * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> intervals = null; + + /** + * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). + * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). + */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> excludeIntervals = null; + + /** + * How should the intervals specified by multiple -L or -XL arguments be combined? 
Using this argument one can, for example, traverse over all of the positions + * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * Should abutting (but not overlapping) intervals be treated as separate intervals? + */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; + + /** + * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. + */ + @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) + public int intervalPadding = 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f9d6955c0..9b801be7d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -361,7 +360,6 @@ 
public class GenomeAnalysisEngine { * Returns a list of active, initialized read transformers * * @param walker the walker we need to apply read transformers too - * @return a non-null list of read transformers */ public void initializeReadTransformers(final Walker walker) { final List activeTransformers = new ArrayList(); @@ -672,41 +670,7 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ protected void initializeIntervals() { - // return if no interval arguments at all - if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) - return; - - // Note that the use of '-L all' is no longer supported. - - // if include argument isn't given, create new set of all possible intervals - - final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( - this.referenceDataSource, - argCollection.intervals, - argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, - argCollection.excludeIntervals); - - final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); - - // if no exclude arguments, can return parseIntervalArguments directly - if ( excludeSortedSet == null ) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - final long toPruneSize = includeSortedSet.coveredSize(); - final long toExcludeSize = excludeSortedSet.coveredSize(); - final long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - 
intervalSize) / (0.01 * toPruneSize))); - } - - logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index ab09064dd..62ca38ad2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -26,11 +26,7 @@ package org.broadinstitute.sting.gatk.arguments; import net.sf.samtools.SAMFileReader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.IntervalBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; @@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; import java.util.ArrayList; @@ -100,41 +94,8 @@ public class GATKArgumentCollection { @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); - /** - * Using this option one can instruct the GATK engine to 
traverse over only part of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). - * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). - * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. - */ - @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> intervals = null; - - /** - * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). - * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). - */ - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> excludeIntervals = null; - - /** - * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions - * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). 
- */ - @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) - public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; - - /** - * Should abutting (but not overlapping) intervals be treated as separate intervals? - */ - @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - /** - * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. - */ - @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) - public int intervalPadding = 0; + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c647a7b80..7374dda14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.IntervalArgumentCollection; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; @@ -534,6 
+535,47 @@ public class IntervalUtils { } } + public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) { + GenomeLocSortedSet intervals = null; + + // return if no interval arguments at all + if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) + return intervals; + + // Note that the use of '-L all' is no longer supported. + + // if include argument isn't given, create new set of all possible intervals + + final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + referenceDataSource, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + // if no exclude arguments, can return parseIntervalArguments directly + if ( excludeSortedSet == null ) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + final long toPruneSize = includeSortedSet.coveredSize(); + final long toExcludeSize = excludeSortedSet.coveredSize(); + final long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + return intervals; + } + public static Pair parseIntervalBindingsPair( final 
ReferenceDataSource referenceDataSource, final List> intervals, diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java index 583a01417..9bdb86a48 100644 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java @@ -73,6 +73,10 @@ public class VCFHeader { public static final String REFERENCE_KEY = "reference"; public static final String CONTIG_KEY = "contig"; public static final String INTERVALS_KEY = "intervals"; + public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals"; + public static final String INTERVAL_MERGING_KEY = "interval_merging"; + public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule"; + public static final String INTERVAL_PADDING_KEY = "interval_padding"; // were the input samples sorted originally (or are we sorting them)? private boolean samplesWereAlreadySorted = true; diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 35f9d4137..2be2745de 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } @Test(expectedExceptions=UserException.class, 
dataProvider="invalidIntervalTestData") @@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala index 28be82136..23a99b586 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala @@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util import java.io.File import org.broadinstitute.sting.utils.io.FileExtension +import java.util.Date +import java.net.URL /** * An extension of java.io.File that can be pulled from or pushed to a remote location. @@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension { def pullToLocal() def pushToRemote() def deleteRemote() + def createUrl(expiration: Date): URL def remoteDescription: String } From d18dbcbac103c0ce8f0480e04efcdd00a50f3394 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 14:55:33 -0500 Subject: [PATCH 19/34] Added tests for changing IUPAC bases to Ns, for failing on bad ref bases, and for the HaplotypeCaller not failing when running over a region with an IUPAC base. Out of curiosity, why does Picard's IndexedFastaSequenceFile allow one to query for start position 0? 
When doing so, that base is a line feed (-1 offset to the first base in the contig) which is an illegal base (and which caused me no end of trouble)... --- .../HaplotypeCallerIntegrationTest.java | 9 +++++ .../CachingIndexedFastaSequenceFile.java | 14 +++---- .../variant/utils/BaseUtils.java | 6 ++- ...chingIndexedFastaSequenceFileUnitTest.java | 39 +++++++++++++++++-- 4 files changed, 55 insertions(+), 13 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index d95da6b7f..6183fc411 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -50,6 +50,7 @@ import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Collections; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; @@ -156,6 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("HCTestStructuralIndels: ", spec); } + @Test + public void HCTestDoesNotFailOnBadRefBase() { + // don't care about the output - just want to make sure it doesn't fail + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); + executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing reduced reads diff --git 
a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 88eaa8910..a749625cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -125,13 +125,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException { super(fasta); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; - preserveIUPAC = false; + this.preserveIUPAC = preserveIUPAC; } /** @@ -168,7 +168,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE, preserveCase); + this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false); } /** @@ -181,7 +181,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must 
be >= 0 */ public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { - this(fasta, cacheSize, false); + this(fasta, cacheSize, false, false); } /** @@ -261,7 +261,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * all of the bases in the ReferenceSequence returned by this method will be upper cased. */ @Override - public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { + public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) { final ReferenceSequence result; final Cache myCache = cache.get(); @@ -269,7 +269,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); - if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1); } else { // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); @@ -285,7 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); - if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); + if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 7a37e8de5..a6ac2ca53 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -166,9 +166,11 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } - public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) { final int length = bases.length; - for ( int i = 0; i < length; i++ ) { + final int start = ignoreConversionOfFirstByte ? 1 : 0; + + for ( int i = start; i < length; i++ ) { final int baseIndex = baseIndexWithIupacMap[bases[i]]; if ( baseIndex == Base.N.ordinal() ) { bases[i] = 'N'; diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index c67e52f2e..0c1b5b069 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -32,8 +32,10 @@ package org.broadinstitute.sting.utils.fasta; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Priority; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.DataProvider; import 
org.testng.annotations.Test; @@ -49,7 +51,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; /** - * Basic unit test for GenomeLoc + * Basic unit test for CachingIndexedFastaSequenceFile */ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); @@ -80,7 +82,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", @@ -122,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -167,7 +169,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "ParallelFastaTest", enabled = true && ! 
DEBUG, timeOut = 60000) public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); for ( int iterations = 0; iterations < 1; iterations++ ) { @@ -230,4 +232,33 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { else return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); } + + @Test(enabled = true) + public void testIupacChanges() throws FileNotFoundException, InterruptedException { + final String testFasta = privateTestDir + "iupacFASTA.fasta"; + final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(new File(testFasta), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true); + final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta)); + + int preservingNs = 0; + int changingNs = 0; + for ( SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences() ) { + final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000); + preservingNs += StringUtils.countMatches(sPreserving, "N"); + + final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000); + changingNs += StringUtils.countMatches(sChanging, "N"); + } + + Assert.assertEquals(changingNs, preservingNs + 4); + } + + @Test(enabled = true, expectedExceptions = {UserException.class}) + public void testFailOnBadBase() throws FileNotFoundException, InterruptedException { + final String testFasta = 
privateTestDir + "problematicFASTA.fasta"; + final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta)); + + for ( SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences() ) { + fetchBaseString(fasta, contig.getSequenceName(), -1, -1); + } + } } From ec1cfe67329c43afe916be4816af3b5af23f27c3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 15:05:49 -0500 Subject: [PATCH 20/34] Oops, forgot to add 1 of my files --- .../variant/utils/BaseUtilsUnitTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 4f918f718..37627204f 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -53,11 +53,11 @@ public class BaseUtilsUnitTest extends BaseTest { @Test public void testConvertIUPACtoN() { - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false), new byte[]{'A', 'A', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false), new byte[]{'N', 'A', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false), new byte[]{'A', 'N', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false), new byte[]{'A', 'A', 'N'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false), new byte[]{'N', 'N', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false, false), new byte[]{'A', 'N', 'A'}); + 
checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false, false), new byte[]{'N', 'N', 'N'}); } private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { From 4d0e7b50ec967897a5400befb329177bb0256c69 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 15 Jan 2013 16:45:45 -0500 Subject: [PATCH 21/34] ArtificialBAMBuilder utility class for creating streams of GATKSAMRecords with a variety of properties -- Allows us to make a stream of reads or an index BAM file with read having the following properties (coming from n samples, of fixed read length and aligned to the genome with M operator, having N reads per alignment start, skipping N bases between each alignment start, starting at a given alignment start) -- This stream can be handed back to the caller immediately, or written to an indexed BAM file -- Update LocusIteratorByStateUnitTest to use this functionality (which was refactored from LIBS unit tests and ArtificialSAMUtils) --- .../sting/utils/sam/ArtificialBAMBuilder.java | 176 ++++++++++++++++++ .../sting/utils/sam/ArtificialSAMUtils.java | 29 --- .../LocusIteratorByStateUnitTest.java | 22 +-- .../sam/ArtificialBAMBuilderUnitTest.java | 122 ++++++++++++ 4 files changed, 305 insertions(+), 44 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java new file mode 100644 index 000000000..651d759e0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, 
free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.NGSPlatform; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Easy to use creator of artificial BAM files for testing + * + * Allows us to make a stream of reads or an index BAM file with read having the following properties + * + * - coming from n samples + * - of fixed read length and aligned to the genome with M operator + * - having N reads per alignment start + * - skipping N bases between each alignment start + * - starting at a given alignment start + * + * User: depristo + * Date: 1/15/13 + * Time: 9:22 AM + */ +public class ArtificialBAMBuilder { + public final static int BAM_SHARD_SIZE = 16384; + + final int nReadsPerLocus; + final int nLoci; + + int skipNLoci = 0; + int alignmentStart = 1; + int readLength = 10; + private final ArrayList samples = new ArrayList(); + + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + { + factory.setCreateIndex(true); + } + + SAMFileHeader header; + + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + createAndSetHeader(1); + } + + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { + this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + samples.clear(); + + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + return this; + } + + public List getSamples() { + return samples; + } + + /** + * Create a read stream based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. 
+ * + * Useful for testing things like LocusIteratorBystate + * + * @return a ordered list of reads + */ + public List makeReads() { + final String baseName = "read"; + List reads = new ArrayList(nReadsPerLocus*nLoci); + for ( int locusI = 0; locusI < nLoci; locusI++) { + final int locus = locusI * (skipNLoci + 1); + for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { + for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { + final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + reads.add(read); + } + } + } + + return reads; + } + + /** + * Make an indexed BAM file contains the reads in the builder, marking it for deleteOnExit() + * @return the BAM file + */ + public File makeTemporarilyBAMFile() { + try { + final File file = File.createTempFile("tempBAM", ".bam"); + file.deleteOnExit(); + return makeBAMFile(file); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + /** + * Write the reads from this builder to output, creating an index as well + * @param output the output BAM file we want to use + * @return + */ + public File makeBAMFile(final File output) { + final SAMFileWriter writer = factory.makeBAMWriter(header, true, output, 0); + for ( final GATKSAMRecord read : makeReads() ) + writer.addAlignment(read); + writer.close(); + return output; + } + + public int getnReadsPerLocus() { return nReadsPerLocus; } + public int getnLoci() { return nLoci; } + public int getSkipNLoci() { return skipNLoci; } + public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; } + public int getAlignmentStart() { return alignmentStart; } + public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; } + public int getReadLength() { 
return readLength; } + public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; } + public SAMFileHeader getHeader() { return header; } + public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + + public int getNSamples() { return samples.size(); } + + public int expectedNumberOfReads() { + return nLoci * nReadsPerLocus * header.getReadGroups().size(); + } + + @Override + public String toString() { + return "ArtificialBAMBuilder{" + + "samples=" + samples + + ", readLength=" + readLength + + ", alignmentStart=" + alignmentStart + + ", skipNLoci=" + skipNLoci + + ", nLoci=" + nLoci + + ", nReadsPerLocus=" + nReadsPerLocus + + '}'; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 4af6555d9..0f5d6a2f7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -327,35 +327,6 @@ public class ArtificialSAMUtils { return stack; } - /** - * Create a read stream based on the parameters. The cigar string for each - * read will be *M, where * is the length of the read. 
- * - * Useful for testing things like LocusIteratorBystate - * - * @return a collection of stackSize reads all sharing the above properties - */ - public static List createReadStream( final int nReadsPerLocus, - final int nLoci, - final SAMFileHeader header, - final int alignmentStart, - final int length ) { - final String baseName = "read"; - List reads = new ArrayList(nReadsPerLocus*nLoci); - for ( int locus = 0; locus < nLoci; locus++ ) { - for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { - for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { - final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); - final GATKSAMRecord read = createArtificialRead(header, readName, 0, alignmentStart + locus, length); - read.setReadGroup(new GATKSAMReadGroupRecord(rg)); - reads.add(read); - } - } - } - - return reads; - } - /** * create an iterator containing the specified read piles * diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 37494903c..2f984165e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -28,17 +28,16 @@ package org.broadinstitute.sting.utils.locusiterator; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.NGSPlatform; import 
org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -447,26 +446,19 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); final int readLength = 10; - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); - final List samples = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); - final String sample = "sample" + i; - samples.add(sample); - rg.setSample(sample); - rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); - header.addReadGroup(rg); - } - final boolean downsample = downsampleTo != -1; final DownsamplingMethod downsampler = downsample ? 
new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); + + final List reads = bamBuilder.makeReads(); li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), createTestReadProperties(downsampler, keepReads), genomeLocParser, - samples); + bamBuilder.getSamples()); final Set seenSoFar = new HashSet(); final Set keptReads = new HashSet(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java new file mode 100644 index 000000000..cf3c97b34 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.apache.commons.collections.IteratorUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 1/15/13 + * Time: 3:49 PM + * To change this template use File | Settings | File Templates. + */ +public class ArtificialBAMBuilderUnitTest extends BaseTest { + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + for ( final int readLength : Arrays.asList(10, 20) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nSamples : Arrays.asList(1, 2) ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int nLoci : Arrays.asList(10, 100, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + bamBuilder.createAndSetHeader(nSamples); + tests.add(new Object[]{bamBuilder, readLength, skips, start, 
nSamples, nReadsPerLocus, nLoci}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CombinatorialARTTilingProvider") + public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { + Assert.assertEquals(bamBuilder.getReadLength(), readLength); + Assert.assertEquals(bamBuilder.getSkipNLoci(), skips); + Assert.assertEquals(bamBuilder.getAlignmentStart(), start); + Assert.assertEquals(bamBuilder.getNSamples(), nSamples); + Assert.assertEquals(bamBuilder.getnReadsPerLocus(), nReadsPerLocus); + Assert.assertEquals(bamBuilder.getnLoci(), nLoci); + + final List reads = bamBuilder.makeReads(); + Assert.assertEquals(reads.size(), bamBuilder.expectedNumberOfReads()); + for ( final GATKSAMRecord read : reads ) { + assertGoodRead(read, bamBuilder); + } + + final File bam = bamBuilder.makeTemporarilyBAMFile(); + final SAMFileReader reader = new SAMFileReader(bam); + Assert.assertTrue(reader.hasIndex()); + final Iterator bamIt = reader.iterator(); + int nReadsFromBam = 0; + int lastStart = -1; + while ( bamIt.hasNext() ) { + final SAMRecord read = bamIt.next(); + assertGoodRead(read, bamBuilder); + nReadsFromBam++; + Assert.assertTrue(read.getAlignmentStart() >= lastStart); + lastStart = read.getAlignmentStart(); + } + Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads()); + } + + private void assertGoodRead(final SAMRecord read, final ArtificialBAMBuilder bamBuilder) { + Assert.assertEquals(read.getReadLength(), bamBuilder.getReadLength()); + Assert.assertEquals(read.getReadBases().length, bamBuilder.getReadLength()); + Assert.assertEquals(read.getBaseQualities().length, bamBuilder.getReadLength()); + Assert.assertTrue(read.getAlignmentStart() >= bamBuilder.getAlignmentStart()); + Assert.assertNotNull(read.getReadGroup()); + } +} + + From ddcb33fcf81cd208bc8ad6e23aeb1eb49624ea07 Mon Sep 17 00:00:00 2001 From: Mark DePristo 
Date: Wed, 16 Jan 2013 12:09:36 -0500 Subject: [PATCH 22/34] Cache result of getLocation() in Shard so we don't perform this expensive calculation over and over --- .../sting/gatk/datasources/reads/Shard.java | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index 2c03363ba..5b4c2afda 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -95,7 +95,10 @@ public abstract class Shard implements HasGenomeLocation { */ private final Map fileSpans; - + /** + * Lazy-calculated span of all of the genome locs in this shard + */ + private GenomeLoc spanningLocation = null; /** * Statistics about which reads in this shards were used and which were filtered away. @@ -148,27 +151,34 @@ public abstract class Shard implements HasGenomeLocation { /** * Returns the span of the genomeLocs comprising this shard - * @param - * @return + * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last + * position in getGenomeLocs() */ public GenomeLoc getLocation() { - if ( getGenomeLocs() == null ) - return GenomeLoc.WHOLE_GENOME; + if ( spanningLocation == null ) { + if ( getGenomeLocs() == null ) + spanningLocation = GenomeLoc.WHOLE_GENOME; + else if ( getGenomeLocs().size() == 1 ) { + spanningLocation = getGenomeLocs().get(0); + } else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; + for ( GenomeLoc loc : getGenomeLocs() ) { + if ( GenomeLoc.isUnmapped(loc) ) + // special case the unmapped region marker, just abort out + return loc; + contig = loc.getContig(); + if ( loc.getStart() < start ) start = 
loc.getStart(); + if ( loc.getStop() > stop ) stop = loc.getStop(); + } - for ( GenomeLoc loc : getGenomeLocs() ) { - if ( GenomeLoc.isUnmapped(loc) ) - // special case the unmapped region marker, just abort out - return loc; - contig = loc.getContig(); - if ( loc.getStart() < start ) start = loc.getStart(); - if ( loc.getStop() > stop ) stop = loc.getStop(); + spanningLocation = parser.createGenomeLoc(contig, start, stop); + } } - return parser.createGenomeLoc(contig, start, stop); + return spanningLocation; } From 2a42b47e4a19c17ae3dad64a980e856229875295 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 15:29:26 -0500 Subject: [PATCH 23/34] Massive expansion of ActiveRegionTraversal unit tests, resulting in several bugfixes to ART -- UnitTests now include combinational tiling of reads within and spanning shard boundaries -- ART now properly handles shard transitions, and does so efficiently without requiring hash sets or other collections of reads -- Updating HC and CountReadsInActiveRegions integration tests --- .../HaplotypeCallerIntegrationTest.java | 12 +- .../traversals/TraverseActiveRegions.java | 226 +++++++++++++----- .../sting/utils/sam/ArtificialBAMBuilder.java | 39 ++- .../traversals/DummyActiveRegionWalker.java | 104 ++++++++ .../TraverseActiveRegionsUnitTest.java | 207 +++++++++++----- .../LocusIteratorByStateUnitTest.java | 28 ++- .../sam/ArtificialBAMBuilderUnitTest.java | 6 +- 7 files changed, 482 insertions(+), 140 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6183fc411..e86834a4a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc"); + HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a"); + HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a938cdd7262968597fc8eb6c1c0a69f1"); + "c679ae7f04bdfda896b5c046d35e043c"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "d590c8d6d5e58d685401b65a23846893"); + "1a034b7eb572e1b6f659d6e5d57b3e76"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void 
HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 03aaf95f2..a7e4d7649 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.traversals; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; @@ -47,24 +48,36 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; /** - * Created with IntelliJ IDEA. + * Implement active region traversal + * * User: depristo * Date: 1/9/13 * Time: 4:45 PM - * To change this template use File | Settings | File Templates. + * + * Live region: + * + * The ART tracks a thing called the live region. The live region is a position on a specific contig + * of the alignment start of the last read we processed during this traversal. Because the + * read stream is sorted, future reads must occur in the live region. Therefore the dead region + * (everything to the left of the live boundary) cannot have any more read data. 
The live / dead + * regions are used to decide when we can safely call map on active regions, as only active regions + * contained completely within the dead region (including extensions) have a complete set of read data + * in the collected read list. All of the data related to the live region is captured by the local + * variable spanOfLastReadSeen + * */ public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); protected final static boolean DEBUG = false; // set by the tranversal private int activeRegionExtension = -1; private int maxRegionSize = -1; - /** - * our log, which we want to capture anything from this class - */ - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - protected final LinkedList workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); + + private LinkedList myReads = new LinkedList(); + private GenomeLoc spanOfLastReadSeen = null; protected int getActiveRegionExtension() { return activeRegionExtension; @@ -79,6 +92,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine myReads = new LinkedList(); - private Shard lastShard = null; + /** + * Did read appear in the last shard? + * + * When we transition across shard boundaries we see duplicate reads because + * each shard contains the reads that *overlap* the shard. So if we just finished + * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 + * that overlapped 1-1000. This function tests read to determine if we would have + * seen it before by asking if read.getAlignmentStart() is less than the + * stop position of the last seen read at the start of the traversal. 
The reason + * we need to use the location of the last read at the start of the traversal + * is that we update the lastRead during the traversal, and we only want to filter + * out reads whose start is before the last read of the previous shard, not the + * current shard. + * + * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal + * @param read the read we want to test if it's already been seen in the last shard + * @return true if read would have appeared in the last shard, false otherwise + */ + protected boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { + if ( locOfLastReadAtTraversalStart == null ) + // we're in the first shard, so obviously the answer is no + return false; + else { + // otherwise check to see if the alignment occurred in the previous shard + return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() + // we're on the same contig + && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); + } + + } + + // ------------------------------------------------------------------------------------- + // + // Actual traverse function + // + // ------------------------------------------------------------------------------------- + + /** + * Is the current shard on a new contig w.r.t. the previous shard? 
+ * @param currentShard the current shard we are processing + * @return true if the last shard was on a different contig than the current shard + */ + private boolean onNewContig(final Shard currentShard) { + return spanOfLastSeenRead() != null + && spanOfLastSeenRead().getContigIndex() != currentShard.getLocation().getContigIndex(); + } @Override public T traverse( final ActiveRegionWalker walker, final LocusShardDataProvider dataProvider, T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); + logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); final LocusView locusView = new AllLocusView(dataProvider); @@ -181,6 +234,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! 
maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); } else { if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); + rememberLastReadLocation(read); + myReads.add(read); } } @@ -257,28 +313,87 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); } - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); + // ------------------------------------------------------------------------------------- + // + // Functions to manage and interact with the live / dead zone + // + // ------------------------------------------------------------------------------------- + + /** + * Update the live region to reflect that the last read we've seen in the traversal is read + * + * Requires that sequential calls always be provided reads in coordinate sorted order + * + * @param read the last read we've seen during the traversal + */ + protected void rememberLastReadLocation(final GATKSAMRecord read) { + final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isBefore(spanOfLastReadSeen) ) + throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); + spanOfLastReadSeen = currentLocation; + } } - protected GenomeLoc getStartOfLiveRegion() { - 
return startOfLiveRegion; + /** + * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. + * @return the left-most position of the live region on the genome + */ + protected GenomeLoc spanOfLastSeenRead() { + return spanOfLastReadSeen; } - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! region.onSameContig(getStartOfLiveRegion()); + /** + * Is the active region completely within the traversal's dead zone? + * + * @param region the region we want to test + * @return true if the extended location of region is completely within the current dead zone, false otherwise + */ + protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { + return region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart() + || ! region.getExtendedLoc().onSameContig(spanOfLastSeenRead()); } + /** + * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? + * + * read: start |--------> stop ------ stop + extension + * region: start |-----------------| end + * + * Since the regions are coming in order, read could potentially be contained in a future interval if + * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end + * of this region, then we can discard it, since any future region could only include reads + * up to end + 1 - extension. + * + * Note that this function doesn't care about the dead zone. We're assuming that by + * actually calling this function with an active region that region is already in the dead zone, + * so checking that the read is in the dead zone doesn't make sense. 
+ * + * @param read the read we're testing + * @param activeRegion the current active region + * @return true if the read is dead, false other + */ + @Requires({"read != null", "activeRegion != null"}) + private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { + return read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop(); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to process active regions that are ready for map / reduce calls + // + // ------------------------------------------------------------------------------------- + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); @@ -292,11 +407,10 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker) { final Iterator liveReads = myReads.iterator(); while ( liveReads.hasNext() ) { @@ -325,7 +430,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index 651d759e0..f5018db8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -25,7 +25,9 @@ package org.broadinstitute.sting.utils.sam; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.NGSPlatform; import java.io.File; @@ -51,6 +53,9 @@ import java.util.List; public 
class ArtificialBAMBuilder { public final static int BAM_SHARD_SIZE = 16384; + private final IndexedFastaSequenceFile reference; + private final GenomeLocParser parser; + final int nReadsPerLocus; final int nLoci; @@ -66,14 +71,39 @@ public class ArtificialBAMBuilder { SAMFileHeader header; - public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) { this.nReadsPerLocus = nReadsPerLocus; this.nLoci = nLoci; + + this.reference = reference; + this.parser = new GenomeLocParser(reference); createAndSetHeader(1); } + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000).getSequenceDictionary(), nReadsPerLocus, nLoci); + } + + public ArtificialBAMBuilder(final SAMSequenceDictionary dict, int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + this.reference = null; + this.parser = new GenomeLocParser(dict); + createAndSetHeader(1); + } + + public IndexedFastaSequenceFile getReference() { + return reference; + } + + public GenomeLocParser getGenomeLocParser() { + return parser; + } + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { - this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + this.header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(parser.getContigs()); samples.clear(); for ( int i = 0; i < nSamples; i++ ) { @@ -156,6 +186,11 @@ public class ArtificialBAMBuilder { public SAMFileHeader getHeader() { return header; } public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + public int getAlignmentEnd() { + return alignmentStart + nLoci * (skipNLoci + 1) + readLength; + } + + public int getNSamples() { return samples.size(); } public int expectedNumberOfReads() { diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java new file mode 100644 index 000000000..bc1e1d7b0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; + +import java.util.*; + +/** + * ActiveRegionWalker for unit testing + * + * User: depristo + * Date: 1/15/13 + * Time: 1:28 PM + */ +class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + private GenomeLocSortedSet activeRegions = null; + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new LinkedHashMap(); + + public DummyActiveRegionWalker() { + this(1.0); + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(EnumSet wantStates) { + this(1.0); + this.states = wantStates; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) { + this(1.0); + this.activeRegions = activeRegions; + } + + public void setStates(EnumSet states) { + this.states = states; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? 
prob : 0.0; + return new ActivityProfileResult(ref.getLocus(), p); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index c4dadbcce..15d4eec2d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -30,33 +30,26 @@ import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.*; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -80,54 +73,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new 
ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); @@ -297,7 +242,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + @Test(enabled = true, dataProvider = "TraversalEngineProvider") public void testPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); @@ -340,7 +285,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { verifyReadMapping(region, "simple20"); } - @Test(enabled = true, dataProvider = "TraversalEngineProvider") + @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") public void testNonPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker( EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); @@ -456,7 +401,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) + return getActiveRegions(t, walker, intervals, testBAM); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final String bam) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); t.endTraversal(walker, 0); @@ -516,14 +465,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { record.setCigar(cigar); record.setReadString(new String(new char[len]).replace("\0", "A")); record.setBaseQualities(new byte[len]); + record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); return record; } - private List createDataProviders(TraverseActiveRegions t, final Walker walker, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); + traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -539,13 +489,144 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ArrayList(), false, (byte)30, false, true); + final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); + List providers = new ArrayList(); for 
(Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } } return providers; } + + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + final List> allReadStates = Arrays.asList( + EnumSet.of(ActiveRegionReadState.PRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) + ); + + final int maxTests = Integer.MAX_VALUE; + int nTests = 0; + for ( final int readLength : Arrays.asList(10, 100) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { + for ( final int nLoci : Arrays.asList(1, 1000) ) { + for ( EnumSet readStates : allReadStates ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + + for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), 
bamBuilder.getAlignmentEnd())) { + nTests++; + if ( nTests < maxTests ) // && nTests == 1238 ) + tests.add(new Object[]{nTests, activeRegions, readStates, bamBuilder}); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Collection enumerateActiveRegions(final int start, final int stop) { + // should basically cut up entire region into equal sized chunks, of + // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive + // Need to make sure we include some edge cases: + final List activeRegions = new LinkedList(); + + for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); + } + } + + // active region is the whole interval + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); + + // active region extends up to the end of the data, but doesn't include start + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); + + return activeRegions; + } + + private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { + final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); + + boolean includeRegion = startWithActive; + for ( int left = start; left < stop; left += stepSize) { + final int right = left + stepSize; + final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right); + if ( includeRegion ) + active.add(region); + includeRegion = ! 
includeRegion; + } + + return active; + } + + + @Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider") + public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { + logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + walker.setStates(readStates); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary + for ( final ActiveRegion region : activeRegionsMap.values() ) { + int nReadsExpectedInRegion = 0; + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + final Set readNamesInRegion = readNamesInRegion(region); + + boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) + ? region.getExtendedLoc().overlapsP(readLoc) + : region.getLocation().overlapsP(readLoc); + + if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { + if ( alreadySeenReads.contains(read.getReadName()) ) + shouldBeInRegion = false; + else if ( shouldBeInRegion ) + alreadySeenReads.add(read.getReadName()); + } + + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"); + + nReadsExpectedInRegion += shouldBeInRegion ? 
1 : 0; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } + } + + private Set readNamesInRegion(final ActiveRegion region) { + final Set readNames = new LinkedHashSet(region.getReads().size()); + for ( final SAMRecord read : region.getReads() ) + readNames.add(read.getReadName()); + return readNames; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 2f984165e..e5e28e1f6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -54,6 +54,32 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { private static final boolean DEBUG = false; protected LocusIteratorByState li; + @Test(enabled = true) + public void testUnmappedAndAllIReadsPassThrough() { + final int readLength = 10; + GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); + GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength); + GATKSAMRecord unmapped = ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength); + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength); + + unmapped.setReadUnmappedFlag(true); + unmapped.setCigarString("*"); + allI.setCigarString(readLength + "I"); + + List reads = Arrays.asList(mapped1, unmapped, allI, mapped2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,createTestReadProperties(DownsamplingMethod.NONE, true)); + + Assert.assertTrue(li.hasNext()); + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + 
Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads"); + + final List rawReads = li.transferReadsFromAllPreviousPileups(); + Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads"); + } + @Test(enabled = true && ! DEBUG) public void testXandEQOperators() { final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -451,7 +477,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci); bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); final List reads = bamBuilder.makeReads(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java index cf3c97b34..2a638eb69 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -47,8 +47,8 @@ import java.util.List; * To change this template use File | Settings | File Templates. 
*/ public class ArtificialBAMBuilderUnitTest extends BaseTest { - @DataProvider(name = "CombinatorialARTTilingProvider") - public Object[][] makeCombinatorialARTTilingProvider() { + @DataProvider(name = "ArtificialBAMBuilderUnitTestProvider") + public Object[][] makeArtificialBAMBuilderUnitTestProvider() { final List tests = new LinkedList(); final List starts = Arrays.asList( @@ -79,7 +79,7 @@ public class ArtificialBAMBuilderUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "CombinatorialARTTilingProvider") + @Test(dataProvider = "ArtificialBAMBuilderUnitTestProvider") public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { Assert.assertEquals(bamBuilder.getReadLength(), readLength); Assert.assertEquals(bamBuilder.getSkipNLoci(), skips); From 4cf34ee9da6dfa9539b485daeed9f276fb192975 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 15:35:04 -0500 Subject: [PATCH 24/34] Bug fix to FisherStrand: do not let it output INFINITY. This all needs to be unit tested, but that's coming on the horizon. 
--- .../sting/gatk/walkers/annotator/FisherStrand.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 167e5df63..fd81103cd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -116,8 +116,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat else if (table1 == null) return annotationForOneTable(pValueForContingencyTable(table2)); else { // take the one with the best (i.e., least significant pvalue) - double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE); - double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE); + double pvalue1 = pValueForContingencyTable(table1); + double pvalue2 = pValueForContingencyTable(table2); return annotationForOneTable(Math.max(pvalue1, pvalue2)); } } @@ -129,7 +129,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * @return a hash map from FS -> phred-scaled pValue */ private Map annotationForOneTable(final double pValue) { - final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)); + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs return Collections.singletonMap(FS, value); // Map map = new HashMap(); // map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); From 79bc8180228480f037a122b609b24ff666a7040f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 16:15:58 -0500 Subject: [PATCH 25/34] Bug fix for VariantsToVCF: old dbSNP files can have '-' as reference base and those records always need to be padded. 
--- .../sting/gatk/refdata/VariantContextAdaptors.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index c7edebd81..a77341a5d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -194,17 +194,18 @@ public class VariantContextAdaptors { return null; // we weren't given enough reference context to create the VariantContext final byte refBaseForIndel = ref.getBases()[index]; + final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); boolean addPaddingBase; if ( isSNP(dbsnp) || isMNP(dbsnp) ) addPaddingBase = false; else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); + addPaddingBase = refBaseIsDash || VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); else return null; // can't handle anything else Allele refAllele; - if ( dbsnp.getNCBIRefBase().equals("-") ) + if ( refBaseIsDash ) refAllele = Allele.create(refBaseForIndel, true); else if ( ! 
Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) return null; From 3c476a92a27db20f80ea94598cdac18e3d31c09c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 15:56:03 -0500 Subject: [PATCH 26/34] Add dummy functionality (currently throws an error) to allow HC to include unmapped reads during assembly and calling --- .../haplotypecaller/HaplotypeCaller.java | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4da2e1179..ce6aa32f4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -184,6 +184,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. 
+ */ + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -354,11 +364,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); + if ( includeUnmappedReads ) { + throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional"); +// return EnumSet.of( +// ActiveRegionReadState.PRIMARY, +// ActiveRegionReadState.NONPRIMARY, +// ActiveRegionReadState.EXTENDED, +// ActiveRegionReadState.UNMAPPED +// ); + } else + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); } @Override From 738c24a3b1efea489a5638eb146107d8533b8878 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 16:25:11 -0500 Subject: [PATCH 27/34] Add tests to ensure that all insertion reads appear in the active region traversal --- .../sting/utils/sam/ArtificialBAMBuilder.java | 18 +++++- .../TraverseActiveRegionsUnitTest.java | 59 ++++++++++++++++++- 2 files changed, 73 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index f5018db8c..ab539c9dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -32,8 +32,7 @@ import org.broadinstitute.sting.utils.NGSPlatform; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Easy to use creator of artificial BAM files for testing @@ -64,6 +63,8 @@ public class ArtificialBAMBuilder { int readLength = 10; private final ArrayList samples = new ArrayList(); + private LinkedList additionalReads = new LinkedList(); + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); { factory.setCreateIndex(true); @@ -118,6 +119,14 @@ public class ArtificialBAMBuilder { return this; } + public void addReads(final GATKSAMRecord readToAdd) { + additionalReads.add(readToAdd); + } + + public void addReads(final Collection readsToAdd) { + additionalReads.addAll(readsToAdd); + } + public List getSamples() { return samples; } @@ -145,6 +154,11 @@ public class ArtificialBAMBuilder { } } + if ( ! additionalReads.isEmpty() ) { + reads.addAll(additionalReads); + Collections.sort(reads, new SAMRecordCoordinateComparator()); + } + return reads; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 15d4eec2d..319af5ec5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -501,6 +501,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return providers; } + // --------------------------------------------------------------------------------------------------------- + // + // Combinatorial tests to ensure reads are going into the right regions + // + // --------------------------------------------------------------------------------------------------------- + 
@DataProvider(name = "CombinatorialARTTilingProvider") public Object[][] makeCombinatorialARTTilingProvider() { final List tests = new LinkedList(); @@ -582,7 +588,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } - @Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider") + @Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider") public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); final List intervals = Arrays.asList( @@ -597,10 +603,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary for ( final ActiveRegion region : activeRegionsMap.values() ) { + final Set readNamesInRegion = readNamesInRegion(region); int nReadsExpectedInRegion = 0; for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); - final Set readNamesInRegion = readNamesInRegion(region); boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) ? 
region.getExtendedLoc().overlapsP(readLoc) @@ -629,4 +635,53 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { readNames.add(read.getReadName()); return readNames; } + + // --------------------------------------------------------------------------------------------------------- + // + // Make sure all insertion reads are properly included in the active regions + // + // --------------------------------------------------------------------------------------------------------- + + @Test + public void ensureAllInsertionReadsAreInActiveRegions() { + + final int readLength = 10; + final int start = 20; + final int nReadsPerLocus = 10; + final int nLoci = 3; + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setAlignmentStart(start); + + // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); + allI.setCigarString(readLength + "I"); + allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); + + bamBuilder.addReads(allI); + + final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); + activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final ActiveRegion region = activeRegionsMap.values().iterator().next(); + int nReadsExpectedInRegion = 0; 
+ + final Set readNamesInRegion = readNamesInRegion(region); + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), + "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); + nReadsExpectedInRegion++; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } } From dbb69a1e1088db4c7fcfd9e512b1b9e095a62ba3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 22:33:16 -0500 Subject: [PATCH 28/34] Need to use ints for quals in HaplotypeScore instead of bytes because of overflow (they are summed when haplotypes are combined) --- .../walkers/annotator/HaplotypeScore.java | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index af6304297..3acba48ae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -216,14 +216,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); List hlist = new ArrayList(); - hlist.add(new Haplotype(haplotype1.getBases(), (byte)60)); + hlist.add(new Haplotype(haplotype1.getBases(), 60)); for (int k = 1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); if (haplotype2 == null) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - hlist.add(new Haplotype(haplotype2.getBases(), (byte)20)); + hlist.add(new Haplotype(haplotype2.getBases(), 20)); } return hlist; } else @@ -285,10 +285,10 @@ public class HaplotypeScore 
extends InfoFieldAnnotation implements StandardAnnot final int length = a.length; final byte[] consensusChars = new byte[length]; - final byte[] consensusQuals = new byte[length]; + final int[] consensusQuals = new int[length]; - final byte[] qualsA = haplotypeA.getQuals(); - final byte[] qualsB = haplotypeB.getQuals(); + final int[] qualsA = haplotypeA.getQuals(); + final int[] qualsB = haplotypeB.getQuals(); for (int i = 0; i < length; i++) { chA = a[i]; @@ -308,7 +308,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot consensusQuals[i] = qualsA[i]; } else { consensusChars[i] = chA; - consensusQuals[i] = (byte)((int)qualsA[i] + (int)qualsB[i]); + consensusQuals[i] = qualsA[i] + qualsB[i]; } } @@ -442,31 +442,38 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private static class Haplotype { private final byte[] bases; - private final byte[] quals; + private final int[] quals; private int qualitySum = -1; - public Haplotype( final byte[] bases, final byte[] quals ) { + public Haplotype( final byte[] bases, final int[] quals ) { this.bases = bases; this.quals = quals; } - public Haplotype( final byte[] bases, final byte qual ) { + public Haplotype( final byte[] bases, final int qual ) { this.bases = bases; - quals = new byte[bases.length]; + quals = new int[bases.length]; Arrays.fill(quals, qual); } + public Haplotype( final byte[] bases, final byte[] quals ) { + this.bases = bases; + this.quals = new int[quals.length]; + for ( int i = 0 ; i < quals.length; i++ ) + this.quals[i] = (int)quals[i]; + } + public double getQualitySum() { if ( qualitySum == -1 ) { qualitySum = 0; - for ( final byte qual : quals ) { - qualitySum += (int)qual; + for ( final int qual : quals ) { + qualitySum += qual; } } return qualitySum; } - public byte[] getQuals() { + public int[] getQuals() { return quals.clone(); } From a623cca89a7310fe52118529dfd9c9f28698d8e5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 
Jan 2013 22:47:58 -0500 Subject: [PATCH 29/34] Bug fix for HaplotypeCaller, as reported on the forum: when reduced reads didn't completely overlap a deletion call, we were incorrectly trying to find the reference position of a base on the read that didn't exist. Added integration test to cover this case. --- .../sting/gatk/walkers/annotator/DepthOfCoverage.java | 2 +- .../gatk/walkers/annotator/DepthPerAlleleBySample.java | 2 +- .../sting/gatk/walkers/annotator/FisherStrand.java | 2 +- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 8 ++++++++ .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 5 +++++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index aeec36c18..4adb2ca71 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -99,7 +99,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + depth += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index a194fe323..5acea12f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -144,7 +144,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa continue; // read is non-informative if (!vc.getAlleles().contains(a)) continue; // sanity check - shouldn't be needed - alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); + alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index fd81103cd..fbd27dfe3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -277,7 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int column = isFW ? 0 : 1; final GATKSAMRecord read = el.getKey(); - table[row][column] += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e86834a4a..03d4216dd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -178,4 +178,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); executeTest("HC calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamWithReadsNotFullySpanningDeletion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + Arrays.asList("0446c11fe2ba68a14f938ebc6e71ded7")); + executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index b43b590df..1488f7269 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -394,6 +394,11 @@ public class ReadUtils { return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); } + public static int 
getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { + final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); + } + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); int readCoord = result.getFirst(); From 953592421b267254b0dc4811522c982bbdd5360d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 17 Jan 2013 09:19:21 -0500 Subject: [PATCH 30/34] I think we got out of sync with the HC tests as we were clobbering each other's changes. Only differences here are to some RankSumTest values. --- .../HaplotypeCallerIntegrationTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 03d4216dd..3ceb0df94 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); + HCTest(CEUTRIO_BAM, "", "0e59153c6359d7cb7be44e25ab552790"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); + HCTest(NA12878_BAM, "", 
"d4b377aed2c8be2ebd81ee5e43b73a93"); } @Test(enabled = false) @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6c0c441b71848c2eea38ab5e2afe1120"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "14ed8e5be2d2a0bf478d742b4baa5a46"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "76fe5e57ed96541bdfee74782331b061"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "25981f7706f61d930556fb128cd1e5c5"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4701887e1927814259560d85098b6440")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @@ -175,7 +175,7 @@ public 
class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); + Arrays.asList("18d047bf8116b56e0c6212e0875eceea")); executeTest("HC calling on a ReducedRead BAM", spec); } From 6db3e473af175328166cf5aea9e0bd94ff9a9e31 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 17 Jan 2013 10:30:04 -0500 Subject: [PATCH 31/34] Better error message for bad qual --- .../gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index fc7573f21..8c8de2bad 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -425,7 +425,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { byte qual = p.getQual(); if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")"); if ( capBaseQualsAtMappingQual ) qual = (byte)Math.min((int)qual, p.getMappingQual()); if ( (int)qual < minBaseQual ) From 6a903f2c235fd48dd02aa9f1982da9e980046c2e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 18 Jan 2013 01:21:08 -0500 Subject: [PATCH 32/34] I finally gave up on trying to get the Haplotype/Allele merging to work in the HaplotypeCaller. I've resigned myself instead to create a mapping from Allele to Haplotype. It's cheap so not a big deal, but really shouldn't be necessary. Ryan and I are talking about refactoring for GATK2.5. --- .../haplotypecaller/HaplotypeCaller.java | 7 ++++++- .../LikelihoodCalculationEngine.java | 9 +++++++-- .../HaplotypeCallerIntegrationTest.java | 18 +++++++++--------- .../broadinstitute/sting/utils/Haplotype.java | 9 +++++++-- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ce6aa32f4..26f2560b7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -508,12 +508,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for ( Haplotype haplotype : haplotypes ) writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( 
final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), (Haplotype) bestAllele, paddedRefLoc.getStart()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedRefLoc.getStart()); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index aafdbf126..57e071189 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -124,9 +124,14 @@ public class LikelihoodCalculationEngine { } private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads) { + // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) + final int numHaplotypes = haplotypes.size(); + final Map alleleVersions = new HashMap(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + alleleVersions.put(haplotype, Allele.create(haplotype.getBases())); + } final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - final int numHaplotypes = haplotypes.size(); for( final GATKSAMRecord read : reads ) { final byte[] overallGCP = new byte[read.getReadLength()]; 
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? @@ -148,7 +153,7 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, haplotype, + perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 3ceb0df94..6c7afd8bb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "0e59153c6359d7cb7be44e25ab552790"); + HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "d4b377aed2c8be2ebd81ee5e43b73a93"); + HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } @Test(enabled = false) @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a034b7eb572e1b6f659d6e5d57b3e76"); + "d590c8d6d5e58d685401b65a23846893"); } private void HCTestComplexVariants(String bam, String 
args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "14ed8e5be2d2a0bf478d742b4baa5a46"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6c0c441b71848c2eea38ab5e2afe1120"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "76fe5e57ed96541bdfee74782331b061"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "25981f7706f61d930556fb128cd1e5c5"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4701887e1927814259560d85098b6440")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest 
extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("18d047bf8116b56e0c6212e0875eceea")); + Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("0446c11fe2ba68a14f938ebc6e71ded7")); + Arrays.asList("6c22e5d57c4f5b631e3345e721aaca1b")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 66aed1173..baab1f5fa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -73,9 +73,14 @@ public class Haplotype extends Allele { @Override public boolean equals( Object h ) { - return h instanceof Haplotype && super.equals(h); + return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); } - + + @Override + public int hashCode() { + return Arrays.hashCode(getBases()); + } + public HashMap getEventMap() { return eventMap; } From 39c73a6cf5adf5d0643c93af0c60b354a31e73ad Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 18 Jan 2013 03:35:48 -0500 Subject: [PATCH 33/34] 1. Ryan and I noticed that the FisherStrand annotation was completely busted for indels with reduced reads; fixed. 
2. While making the previous fix and unifying FS for SNPs and indels, I noticed that FS was slightly broken in the general case for indels too; fixed. 3. I also fixed a minor bug in the Allele Biased Downsampling code for reduced reads. --- .../AlleleBiasedDownsamplingUtils.java | 11 ++-- .../gatk/walkers/annotator/FisherStrand.java | 53 +++++++++---------- .../UnifiedGenotyperIntegrationTest.java | 12 ++--- .../HaplotypeCallerIntegrationTest.java | 4 +- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index a7bb58d0c..ba1da7c87 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -84,12 +84,13 @@ public class AlleleBiasedDownsamplingUtils { // start by stratifying the reads by the alleles they represent at this position for( final PileupElement pe : pileup ) { // we do not want to remove a reduced read - if ( pe.getRead().isReducedRead() ) + if ( pe.getRead().isReducedRead() ) { reducedReadPileups.add(pe); - - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); + } else { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } } // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index fbd27dfe3..ff3d7940f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -265,24 +265,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); - final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); - - if ( !matchesRef && !matchesAlt ) - continue; - - boolean isFW = el.getKey().getReadNegativeStrandFlag(); - - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - + final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); - table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; + updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount); } } return table; } + /** Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: * fw rc @@ -299,31 +291,36 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( p.getRead().isReducedRead() ) - continue; - if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) continue; - final Allele base = Allele.create(p.getBase(), false); - final boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - - final boolean matchesRef = ref.equals(base, true); - final boolean matchesAlt = alt.equals(base, true); - if ( matchesRef || matchesAlt ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - - table[row][column] += p.getRepresentativeCount(); - } + updateTable(table, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); } } return table; } + + private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { + // ignore reduced reads because they are always on the forward strand! + // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test + if ( read.isReducedRead() ) + return; + + final boolean matchesRef = allele.equals(ref, true); + final boolean matchesAlt = allele.equals(alt, true); + + if ( matchesRef || matchesAlt ) { + + final boolean isFW = !read.getReadNegativeStrandFlag(); + + int row = matchesRef ? 0 : 1; + int column = isFW ? 
0 : 1; + + table[row][column] += representativeCount; + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a84019988..5b5a75d4e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -363,7 +363,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("39c7a813fd6ee82d3604f2a868b35b2a")); + Arrays.asList("8231ae37b52b927db9fc1e5c221b0ba0")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -391,13 +391,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("3d3c5691973a223209a1341272d881be")); + Arrays.asList("a47810de2f6ef8087f4644064a0814bc")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("23b7a37a64065cee53a80495c8717eea")); + Arrays.asList("53b8d2b0fa63c5d1019855e8e0db28f0")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -497,18 +497,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = 
new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("092e42a712afb660ec79ff11c55933e2")); + Arrays.asList("02175dc9731aed92837ce0db78489fc0")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); + testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "1c9aaf65ffaa12bb766855265a1c3f8e"); + testReducedCalling("INDEL", "a85c110fcac9574a54c7daccb1e2d5ae"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6c7afd8bb..27fe31fa7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "d590c8d6d5e58d685401b65a23846893"); + "1a034b7eb572e1b6f659d6e5d57b3e76"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - 
Arrays.asList("6c22e5d57c4f5b631e3345e721aaca1b")); + Arrays.asList("4e8121dd9dc90478f237bd6ae4d19920")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } }