main class (argument processing and traversing the reference) and implementation of all the Receiver functionality for building read piles over indels

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@112 348d0f76-0448-11de-a6fe-93d51630548a
2009-03-20 05:18:04 +00:00 · 2009-03-20 05:18:04 +00:00 · 3565b50ff5
parent 4c3b92b860
commit 3565b50ff5
2 changed files with 766 additions and 0 deletions
--- a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java
@ -0,0 +1,274 @@
+package org.broadinstitute.sting.indels;
+
+import java.io.File;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+
+
+import javax.swing.JFileChooser;
+import javax.swing.filechooser.FileNameExtensionFilter;
+import edu.mit.broad.picard.cmdline.CommandLineProgram;
+import edu.mit.broad.picard.cmdline.Option;
+import edu.mit.broad.picard.cmdline.Usage;
+import edu.mit.broad.picard.reference.ReferenceSequenceFile;
+import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
+import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
+import edu.mit.broad.picard.reference.ReferenceSequence;
+
+import net.sf.samtools.*;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+public class IndelInspector extends CommandLineProgram {
+
+    // Usage and parameters
+    @Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n";
+    @Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE;
+    @Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION;
+    @Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
+    @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
+//    @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
+
+    /** Required main method implementation. */
+    public static void main(final String[] argv) {
+        System.exit(new IndelInspector().instanceMain(argv));
+    }
+    
+	protected int doWork() {
+
+        GenomeLoc location = null;
+        if ( GENOME_LOCATION != null ) {
+            location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
+        }
+
+		if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) {
+			System.out.println("Unknown value specified for ERR_MODE");
+			return 1;
+		}
+		
+		final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1kG/"));
+
+        setContigOrdering(samReader);
+
+        ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
+                    new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
+            );
+
+
+        ReferenceSequence contig_seq = null;
+
+		IndelRecordPileCollector col = null;
+		try {
+			col = new IndelRecordPileCollector();
+		} catch(Exception e) { System.err.println(e.getMessage()); }
+		if ( col == null ) return 1; 
+		
+        String cur_contig = null;
+        int counter = 0;
+
+        for ( SAMRecord r : samReader ) {
+
+        	if ( r.getReferenceName() != cur_contig) {
+        		cur_contig = r.getReferenceName();
+        		System.out.println("Contig "+cur_contig);
+                // if contig is specified and we are past that contig, we are done:
+                if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
+                if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
+                    contig_seq = reference.get(r.getReferenceIndex());
+                    System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
+                }
+            }
+
+            // if contig is specified and wqe did not reach it yet, skip the records until we reach that contig:
+        	if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
+
+            // if stop position is specified and we are past that, stop reading:
+        	if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
+        	
+        	if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now
+        	
+        	int err = -1;
+/*
+            System.out.println("MM:     "+numMismatches(r));
+            System.out.println("direct: "+numMismatchesDirect(r,contig_seq));
+            System.out.print("  ");
+            for ( int i = r.getAlignmentStart() - 1 ; i < r.getAlignmentEnd() ; i++ ) System.out.print((char)contig_seq.getBases()[i]);
+            System.out.println();
+            System.out.println((r.getReadNegativeStrandFlag()?"<-":"->")+r.getReadString());
+            System.out.println("cigar: "+r.getCigarString());
+            System.out.println();
+            if (counter++ == 20 ) break;
+            continue;
+*/
+
+        	if ( ERR_MODE.equals("MM")) err = numMismatches(r);
+        	else if ( ERR_MODE.equals("ERR")) err = numErrors(r);
+        	else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r);
+        	if ( err > MAX_ERRS.intValue() ) continue;
+        	counter++;
+        	if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
+        	col.receive(r);
+
+        }
+        System.out.println("done.");
+        col.printLengthHistograms();
+        samReader.close();
+        return 0;
+	}
+	
+	/** This method is a HACK: it is designed to work around the current bug in NM tags created  at CRD 
+	 * 
+	 * @param r SAM record that must specify an alignment
+	 * @return number of errors (number of mismatches plus total length of all insertions/deletions
+	 * @throws RuntimeException
+	 */
+	private static int numErrors(SAMRecord r) throws RuntimeException {
+		
+		// NM currently stores the total number of mismatches in all blocks + 1
+		int errs = numMismatches(r);
+		
+		// now we have to add the total length of all indels:
+		Cigar c = r.getCigar();
+		for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+			CigarElement ce = c.getCigarElement(i);
+			switch( ce.getOperator()) {
+			case M : break; // we already have correct number of mismatches
+			case I : 
+			case D :
+					errs += ce.getLength();
+					break;
+			default: throw new RuntimeException("Unrecognized cigar element");
+			}
+		}
+		return errs;
+	}
+
+	/** This method is a HACK: it is designed to work around the current bug in NM tags created  at CRD 
+	 * 
+	 * @param r SAM record that must specify an alignment
+	 * @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or
+	 * deletion will be counted as a single error regardless of the length)
+	 * @throws RuntimeException
+	 */
+	private static int numMismatchesGaps(SAMRecord r) throws RuntimeException {
+		
+		// NM currently stores the total number of mismatches in all blocks + 1
+		int errs = numMismatches(r);
+		
+		// now we have to add the total length of all indels:
+		Cigar c = r.getCigar();
+		for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+			CigarElement ce = c.getCigarElement(i);
+			switch( ce.getOperator()) {
+			case M : break; // we already have correct number of mismatches
+			case I : 
+			case D :
+					errs++;
+					break;
+			default: throw new RuntimeException("Unrecognized cigar element");
+			}
+		}
+		return errs;
+	}
+
+    private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) {
+        int i_ref = r.getAlignmentStart()-1; // position on the ref
+        int i_read = 0;                    // position on the read
+        int mm_count = 0; // number of mismatches
+        Cigar c = r.getCigar();
+        for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
+            CigarElement ce = c.getCigarElement(k);
+            switch( ce.getOperator() ) {
+                case M:
+                    for ( int l = 0 ; l < ce.getLength() ; l++ ) {
+                        if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
+                             Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++;
+                        i_ref++;
+                        i_read++;
+                    }
+                    break;
+                case I: i_read += ce.getLength(); break;
+                case D: i_ref += ce.getLength(); break;
+                default: throw new RuntimeException("Unrecognized cigar element");
+            }
+
+        }
+        return mm_count;
+    }
+	
+	/** This method is a HACK: it is designed to work around the current bug in NM tags created  at CRD */
+	private static int numMismatches(SAMRecord r) throws RuntimeException {
+		
+		// NM currently stores the total number of mismatches in all blocks + 1
+		return ((Integer)r.getAttribute("NM")).intValue() - 1;
+		
+	}
+
+    /** Trivial utility method that goes some distance trying to ensure that the input file is there;
+     * the only purpose is reducing clutter in main(). Receives a default
+     * input file argument, does a few checks (e.g. that it is non-null and exists), if they fail tries
+     * to fire up a file chooser dialog using start_folder as initial directory, etc.
+     * @param default_arg some "default" input file; if it is non-null and exists, nothing else will be done,
+     *        and the same default_arg objetc will be returned; otherwise the method will try to ask for a "better" input.
+     * @param start_folder should file open dialog be fired up, it will initially display this directory.
+     * @return File object that is not null and does exist (there is no check that it is a valid SAM/BAM file though).
+     */
+    private File getInputFile(File default_arg, String start_folder) {
+        File f = default_arg;
+        if ( f==null || ! f.exists() ) {
+            JFileChooser fc = new JFileChooser(start_folder);
+            FileNameExtensionFilter ff = new FileNameExtensionFilter("SAM and BAM files","sam","bam");
+            fc.setFileFilter(ff);
+            fc.setFileSelectionMode(JFileChooser.FILES_ONLY);
+
+            int ret = fc.showOpenDialog(null);
+            f = fc.getSelectedFile();
+            if ( ret != JFileChooser.APPROVE_OPTION ) {
+                System.out.println("No input file specified. Exiting...");
+                System.exit(1);
+            }
+        }
+
+        if ( f == null || ! f.exists() ) {
+            System.out.println("SAM or BAM input file must be specified. Exiting...");
+            System.exit(1);
+        }
+
+        return f;
+    }
+
+    /** Auxiliary method to remove some clutter from main(); gets called only once and tries to get
+     * contig ordering from the header provided by opened SAM reader; if no header info is available
+     * falls back to default ordering; whichever ordering is used, it is set for GenomeLoc class.
+     * @param r sam reader to get header from
+     */
+    private void setContigOrdering(SAMFileReader r) {
+        SAMFileHeader h = r.getFileHeader();
+        if ( h == null ) {
+            System.out.println("No header found in SAM file, falling back to default contig ordering");
+            setDefaultContigOrdering();
+            return;
+        }
+        List<SAMSequenceRecord> seqs = h.getSequences();
+        if ( seqs == null ) {
+            System.out.println("No reference sequence records found in SAM file header, " +
+                               "falling back to default contig ordering");
+            setDefaultContigOrdering();
+            return;
+        }
+        int i = 0;
+        Map<String,Integer> rco = new HashMap<String,Integer>();
+        for ( SAMSequenceRecord sr : seqs) {
+            rco.put(sr.getSequenceName(),i++);
+        }
+        GenomeLoc.setContigOrdering(rco);
+    }
+
+    private void setDefaultContigOrdering() {
+        Map<String,Integer> rco = new HashMap<String,Integer>();
+        rco.put("chrM",0);
+        for ( int i = 1 ; i <= 22 ; i++ ) rco.put("chr"+i,i);
+        rco.put("chrX",23);
+        rco.put("chrY",24);
+    }
+}
--- a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java
@ -0,0 +1,492 @@
+package org.broadinstitute.sting.indels;
+
+import java.util.*;
+
+
+import org.broadinstitute.sting.indels.Indel.IndelType;
+import org.broadinstitute.sting.utils.*;
+import net.sf.samtools.*;
+
+/** Ultimately, this class is a splitter for a stream of alignment records. It detects putative indels, or
+ * trains of sufficiently close indels, and sends the alignments two-way: those that do not overlap with any
+ * detected indels or trains of indels, and those that do. The latters are emitted in finished piles of
+ * all alignments that overlap with genomic interval of interest. This collector should be bound to
+ * and driven by an alignment traversal engine that sends alignment records one by one.
+ *
+ * NOTE 1: alignments must be sent to the collector strictly in the order
+ * of non-decreasing reference start position.
+ *
+ * NOTE 2: a train of indels is defined as a sequence of (putative) indels such that each pair of adjacent indels
+ * is overlapped by at least one alignment (that alignment does not have to have both indels in it, but only
+ * to span over both positions). A "genomic region of interest" is defined as the smallest interval
+ * containing all indels in the train, and <i>all</i> the alignments that overlap with that region will be collected
+ * into one pile. For instance, if reads of different length are present in the dataset, it is possible that two
+ * adjacent indels are overlapped by a single longer read (which stitches them into the train), but there are
+ * shorter reads that fall completely into the in-between region (so that they technically do not overlap with any
+ * indels in the train). According to the above definition, these shorter reads will be still emitted into the pile,
+ * since they overlap with the "region of interest".
+ *
+ * NOTE 3: due to performance/memory issues, the collector may refuse to assemble a pile over pathologically long
+ * train of indels. In this case, it will keep detecting the indel train in order to be able to understand what is
+ * going on and to recover later, but the reads will be sent to the "non-overlapping" output channel.
+ *
+ * In order to abstract and decouple the operation of emitting records, the collector expects to be bound to an
+ * implementation of RecordEmitter interface. It is the emitter's implementation that decides what to do with
+ * alignments of the two types (not related to indels vs. piles of alignments overlapping with indels). While
+ * this collector has some delay between receiving an alignment and being able to decide which way it should go,
+ * no records are ever discarded.
+ *
+ * Implementation note:
+ *
+ * In order to achive its goal, the collector has a state ('wait' or 'active') and always
+ * keeps a variable size "backlog" pile of alignments that were sent to it most recently. In 'wait' state collector
+ * has not detected any putative indels just yet. The backlog pile contains only alignments of "undecided fate": those
+ * that still might overlap with an indel should it be detected in the future. All alignments that end before the
+ * current position on the genome have their fate determined (as not overlapping with any indels) and emitted.
+ * When an indel is encountered, the collector flips into the 'active' state and from that moment on keeps all
+ * the alignments in the pile and collects information on the indels (their positions on the reference and numbers
+ * of observations).
+ *
+ * Since only alignments are sorted, but not indels (an indel in a later read may occur closer
+ * to its start and thus before a previously seen indel), and also because it is relatively difficult (TO_DO?)to break a
+ * pile in the middle immediately when it becomes clear that two adjacent indels might have been overlapped by a
+ * single read, but no such read ever surfaced, the collector is conservative at this stage and keeps
+ * accumulating the pile (and indel train) until it moves sufficiently far away from the last indel seen (full
+ * maximum read length is recommended). Then it switches back into wait state and performs post-processing
+ * of the indel train and the collected pile: only at this stage the preliminary pile is closely examined and if
+ * there are pairs of adjacent indels not spanned by any read, the pile is broken into smaller piles
+ * that conform to the contract outlined above. These piles are directed into the RecordEmitter, and the
+ * reads that fall in between the piles, if any (i.e. those that do not overlap with final indel trains
+ * determined at the post-processing stage) are relabeled as "not interesting" and redirected
+ * to the appropriate output channel.
+ *
+ * @author asivache
+ *
+ */
+public class IndelRecordPileCollector implements RecordReceiver {
+
+	private final int WAIT_STATE = 0;
+	private final int ACTIVE_STATE = 1;
+
+	private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
+                                     // we will determine their span and report them,
+                                     // but we won't be counting any indels there or building piles
+
+	private List<SAMRecord> mRecordPile; // here we keep the records before we decide how we want to emit them
+	private TreeSet<CountedObject<Indel> > mAllIndels; ///< individual indels encountered, with observation counts
+	private int mLastContig ;    ///< keeps the index of the contig last alignment was on
+	private int mLastStartOnRef; ///< keeps the start position of the last alignment
+	private int mState; ///< WAIT_STATE or ACTIVE_STATE
+	private int mIndelSeparation; ///< Indels that are farther away from one another than this value
+	                               ///< will be emitted separately; trains of indels with less then
+		                           ///< mIndelSeparation bases between each adjacent pair will be emitted 
+		                           ///< as one pile.
+	
+	// we will build histograms (distributions) of encountered indel lengths on the fly
+	private List<Integer> mIndelLengthHistI;
+	private List<Integer> mIndelLengthHistD;
+
+    private RecordReceiver nonindelReceiver; // we will send there records that do not overlap with regions of interest
+    private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
+
+	public String memStatsString() {
+		String s = "mRecordPile: ";
+		return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef;
+                //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
+	}
+	
+	public IndelRecordPileCollector() throws java.io.IOException {
+		mRecordPile = new LinkedList<SAMRecord>();
+		mAllIndels = new TreeSet<CountedObject<Indel> >(
+                new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
+		mLastContig = -1;
+		mLastStartOnRef = -1;
+		mIndelSeparation = 51;
+		mIndelLengthHistI = new ArrayList<Integer>();
+		mIndelLengthHistD = new ArrayList<Integer>();
+		for ( int i = 0 ; i < 5 ; i++ ) {
+			mIndelLengthHistI.add(0);
+			mIndelLengthHistD.add(0);
+		}
+        nonindelReceiver = new DiscardingReceiver();
+        indelPileReceiver = new DiscardingPileReceiver();
+        setWaitState();
+	}
+
+    /** Fully reinitializes wait state: clears record pile and indel list, resets flags and states.
+     * Does not emit records, just clears/resets the variables.
+     */
+    private void setWaitState() {
+        mRecordPile.clear();
+        mAllIndels.clear();
+//        mIndelRegionStart = 1000000000;
+//        mIndelRegionStop = -1;
+        avoiding_region = false;
+        mState = WAIT_STATE; // got to do this if we were in avoid_region state
+    }
+	
+	/** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile
+     * all the consequtive records with alignment end positions less than or equal to the specified
+     * position <code>pos</code>, until the first record is encountered that does not meet this condition. Note that
+     * there might be more alignments that end at or before <code>pos</code> later on in the pile, but
+     * they will <i> nit</i> be emitted/removed by this method.
+	 * @param pos all leading records with alignments ending before or at this position will be purged from the pile,
+     * up to the first record that does not end at or before pos.
+	 */
+	protected void purgeRecordsEndingAtOrBefore(final long pos) {
+        Iterator<SAMRecord> i = mRecordPile.iterator();
+        while ( i.hasNext() ) {
+            SAMRecord r = i.next();
+            if ( r.getAlignmentEnd() <= pos ) {
+                nonindelReceiver.receive(r);
+                i.remove();
+            } else break;
+        }
+	}
+	
+	/** A utility method: purges from the currently held SAM record pile all the records with alignment
+	 * start positions greater than or equal to the specified position <code>pos</code>
+	 * @param pos all records with alignments starting at or after this position will be purged from the pile
+	 */
+	protected void purgeRecordsStartingAtOrAfter(final int pos) {
+        Iterator<SAMRecord> i = mRecordPile.iterator();
+        while ( i.hasNext() ) {
+            SAMRecord r = i.next();
+            if ( r.getAlignmentStart() >= pos ) {
+                nonindelReceiver.receive(r);
+                i.remove();
+            } else break;
+        }
+	}
+
+    /** This is the main interface method of the collector: it receives alignments, inspects them, detects indels,
+     *  updates and purges the read pile it keeps and emits alignments as needed.
+     * Depending on the state, the following behaviors are possible
+     *
+     * <ul>
+     *  <li> If the collector is in wait state (no indels seen recently): all
+     *       alignments that end prior to the start of currently inspected alignment can not overlap
+     *       with any future indels, including those that may be present in the current alignment; these records
+     *       get purged from the pile and emitted immediately. Current alignment gets added to the pile.
+     *       If current alignment has indels, collector switches into 'active' state.
+     *  <li> in active state: if the current alignment starts sufficiently far away from the last indel seen,
+     *       examine the currently held pile closely, split into a few separate piles/indel trains if needed, emit and
+     *       completely purge the pile, add alignment to the pile, switch to wait state if alignment has no indels or
+     *       stay in active state if it does. Otherwise (alignment too close to last indel),
+     *       just add alignment to the pile, since it is yet impossible to tell whether new indels are coming soon and
+     *       indel train will need to be extended; if alignment does have indels of its own, add them
+     *       to the current indel train
+     * </ul>
+     *
+     * This method checks that records arrive in reference-sorted order and throws RuntimeException if out-of-order
+     * record arrives.
+     *
+     * @param r
+     * @throws RuntimeException
+     */
+    @Override
+	public void receive(final SAMRecord r) throws RuntimeException {
+		
+		if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do
+		
+		int currContig = r.getReferenceIndex();
+		int currPos = r.getAlignmentStart();
+		
+		if ( currContig < mLastContig ) throw new RuntimeException("SAM file is not ordered by contigs");
+		if ( currContig == mLastContig && currPos < mLastStartOnRef ) throw new RuntimeException("SAM file is not ordered by start positions");
+		
+		if ( currContig > mLastContig ) {
+			// we jumped onto a new contig; emit everything we might have been building and purge the piles:
+			emit();
+		} else { // still on the same contig:
+
+            switch (mState) {
+			            // everything ending up to currPos is guaranteed to have no overlaps with indels yet to come
+			    case WAIT_STATE: purgeRecordsEndingAtOrBefore(currPos); break;
+
+                       // next indel can start only after currPos (whether it is in the current read or in the
+                       // reads yet to come). If it is far enough from the last indel we have seen, we can emit
+                case ACTIVE_STATE: if ( currPos - mAllIndels.last().getObject().getStop() > mIndelSeparation ) emit(); break;
+                default: throw new RuntimeException("Unknown state");
+            }
+        }
+
+        // does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active'
+        extractIndelsAndUpdateState(r.getCigar(),currPos);
+
+		if ( ! avoiding_region && mAllIndels.size() > 20 ) avoiding_region = true;
+
+		if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region
+
+		mLastContig = currContig;
+		mLastStartOnRef = currPos;
+		
+	}
+
+    /** Emits all reads from the currently held pile, cleans the pile and fully reinitializes wait state
+     * (clears indel list etc).
+     *
+     * If the current state is 'wait', simply sends all the records from the pile to nonindelReceiver before
+     * the cleanup. If the state is 'active', then performs final inspection of the pile built over a train of indels,
+     * splits the train (and the pile) into multiple trains/piles as needed (i.e. if there are pairs of adjacent
+     * indels that are not overlapped by any read), and emits the final piles of records into indelReceiver.
+      */
+	private void emit() {
+
+		if ( mState == WAIT_STATE || avoiding_region ) {
+            if ( avoiding_region )  {
+                long start = mAllIndels.first().getObject().getStart();
+                long stop = mAllIndels.last().getObject().getStop();
+                System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
+                                   " was ignored: "+mAllIndels.size() +" unique indels with average distance of "+
+                                   ((double)(stop - start))/((double)mAllIndels.size()-1) +
+                                   " bases between indels");
+            }
+
+			// no indels or avoiding indels in bad region: send all records to nonindelReceiver and clear the pile
+            for ( SAMRecord r : mRecordPile ) nonindelReceiver.receive(r);
+            setWaitState();
+			return;
+		}
+		
+		// last minute cleanup:
+		// at this stage we have all the indels collected conservatively (in a sense
+		// that they can be farther away than it is needed) - this means that there actually
+		// can be more than one pile in what we have stored. Also, we can still have gapless reads
+		// at the ends of the piles that do not really overlap with indel sites.
+		
+		if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
+
+		HistogramAsNeeded(mAllIndels);
+
+
+        // indels are in a sorted map, and reads were added to the pile in the order they were received (also sorted).
+        // we will traverse the two collections in parallel and detect exactly where we can break the indel train into
+        // subtrains
+        Iterator<CountedObject<Indel> > i_iter = mAllIndels.iterator();
+
+        // will keep list of indels and list of records, respectively, in one final train
+        List< CountedObject<Indel> > finalTrain = new ArrayList<CountedObject<Indel>>();
+        List< SAMRecord > finalPile = new ArrayList<SAMRecord>();
+
+        long curr_stop = -1; // the rightmost stop position among all the alignments seen so far
+
+        CountedObject<Indel> indel = i_iter.next(); // we checked that list of indels contains at least one element!
+
+        SAMRecord record ;
+
+        while ( indel != null ) {
+
+            // first, if we just started new indel train, then emit into nonindelReceiver all alignments
+            // that end prior to the first indel in the train:
+            if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1);
+
+            finalTrain.add(indel);
+
+            Iterator<SAMRecord> r_iter = mRecordPile.iterator();
+
+            if ( r_iter.hasNext() ) record = r_iter.next();
+            else record = null;
+
+            // record now contains first alignment that ends in or after the indel, or null if there are no more records
+
+            // now collect all the alignments that overlap with the current indel (start before or inside) and
+            // record the rightmost alignment stop position:
+            while ( record != null && record.getAlignmentStart() <= indel.getObject().getStop() ) {
+                finalPile.add(record);
+                r_iter.remove(); // remove from the original pile the record we just moved to the current final pile
+                curr_stop = Math.max(curr_stop, record.getAlignmentEnd());
+                if ( r_iter.hasNext() ) record = r_iter.next();
+                else record = null;
+            }
+
+            // record is now the first alignment that starts after the indel, or null if there are no more records
+
+            // we are done with current indel, get next one if any:
+            if ( i_iter.hasNext() ) {
+                indel = i_iter.next();
+                if ( curr_stop < indel.getObject().getStart() ) {
+                    // all alignments that overlapped with the previous indel ended before the current indel started,
+                    // this means that the current train and pile of reads overlapping with it are fully built;
+                    // emit into indel receiver if the train is interesting enough, or into the nonindel receiver:
+
+                    System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" +
+                            finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " +
+                            finalTrain.size() + " indels");
+                    System.out.println(formatRange(finalTrain));
+
+                    if ( shouldAcceptForOutput(finalTrain ) ) indelPileReceiver.receive(finalPile);
+                    else for ( SAMRecord r : finalPile ) nonindelReceiver.receive(r);
+                    finalPile.clear();
+                    finalTrain.clear();
+                    curr_stop = -1;
+                } // ELSE: otherwise we have reads that overlap with both previous and current indel, so we just continue
+            } else indel = null;
+        }
+
+        setWaitState();
+	}
+		
+	
+	/** Looks for indels in the cigar and, if finds any, updates list of indels in the current train ans sets
+     * the state to 'active'. If cigar contains no indels, this method does not do anything (it does <i>not</i>
+     * set state back to 'wait' either!). If this method finds any indels in the cigar, it first tries to find them
+     * in the list of previously seen indels. If the indel was already seen before, its counter is updated (indels
+     * are stored in the list as counted objects), oherwise indel is added to the list with initial count of 1.
+	 *
+	 * @param c alignment cigar; if it contains no indels, nothing will be done
+	 * @param start position, at which the alignment represented by cigar <code>c</code> starts on the reference
+	 */
+	private void extractIndelsAndUpdateState(final Cigar c, final int start) {
+		//
+		// firstpos,lastpos span of the indel will be interpreted as follows:
+		// any alignment that ends strictly before firstpos or starts strictly after lastpos
+		// on the *reference* (not inclusive!) does not overlap with an indel; in the case of 
+		// insertion it will result in firstpos > lastpos!
+		//         lastpos
+		//         |   firstpos 
+		//         |   |
+		//         v   v
+		// ---------III----- Ref  Insertion: bases I are not in the ref; any alignment that starts
+		//                        after lastpos or ends before firstpos *on the reference*
+		//                        is completely over the reference bases to the right or to
+		//                        the left, respectively, of the insertion site
+		//
+		//      firstpos
+		//      | lastpos
+		//      | |
+		//      v v
+		//------------------ Ref   Deletion: any alignment that ends before firstpos or starts after lastpos
+		// -----DDD--- alignment   on the reference does not overlap with the deletion 
+		int runninglength = start; // position on the original reference; start = alignment start position
+
+        if ( c.numCigarElements() == 1 ) return; // most of the reads have no indels, save a few cycles by returning early
+
+		for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+			
+			final CigarElement ce = c.getCigarElement(i);
+            Indel indel = null;
+
+			switch(ce.getOperator()) {
+    		case I: indel = new Indel(runninglength, ce.getLength(), IndelType.I); break;
+    		case D: indel = new Indel(runninglength, ce.getLength(), IndelType.D);
+    				runninglength += ce.getLength();
+    				break;
+    		case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
+    		default :
+    			throw new IllegalArgumentException("Unexpected operator in cigar string");
+			}
+
+			if ( indel == null ) continue; // element was not an indel, go grab next element...
+
+            mState = ACTIVE_STATE; // this is redundant and will be executed unnecessarily many times, but it's cheap...
+
+            CountedObject<Indel> indelWithCount = new CountedObject<Indel>(indel);
+            CountedObject<Indel> found = mAllIndels.floor(indelWithCount);
+            
+            if ( indelWithCount.equals( found ) ) found.increment(); // we did find our indel, advance the counter
+            else mAllIndels.add(indelWithCount); // this is a new indel. Add it.
+		} // end for loop over all alignment cigar elements
+		
+	} // end extractIndels() method
+
+
+
+	/** Counts the size of the passed <indel> argument into the appropriate size histogram 
+	 * 
+	 * @param indel size of this indel will be counted in
+	 */
+	private void addToSizeHistogram(Indel indel) {
+		// count this indel's size into the appropriate bin of the appropriate histogram
+		// (we count insertions and deletions separately), resizing the histogram array if needed:
+		List<Integer> histogram;
+		if ( indel.getType() == Indel.IndelType.D ) {
+			histogram = mIndelLengthHistD; 
+		} else if ( indel.getType() == Indel.IndelType.I ) {
+			histogram = mIndelLengthHistI; 
+		} else {
+			throw new RuntimeException("Indel of unknown type");
+		}
+		if( indel.getIndelLength() > histogram.size() ) {
+			for ( int j = histogram.size() ; j < indel.getIndelLength() ; j++ ) histogram.add(0);
+			histogram.set((int)indel.getIndelLength()-1, 1); // we are seeing this length for the first time, so count == 1
+		} else {
+			int n = histogram.get((int)indel.getIndelLength()-1);
+			histogram.set((int)indel.getIndelLength()-1, n+1);
+		}		
+	}
+	
+	/** Adds sizes of the indels from the list that pass some filters to the histograms
+	 * 
+	 * @param indels collection of indels with counts
+	 */
+	private void HistogramAsNeeded(Collection<CountedObject<Indel>> indels) {
+		for ( CountedObject<Indel> o : indels ) {
+			if ( o.getCount() >= 2 ) addToSizeHistogram(o.getObject());
+		}
+	}
+	
+	/** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
+	 * if it contains at least one indel onbserved more than once.
+	 * @param indels list of indels with counts to check for being acceptable
+	 * @return true if the indel run has to be printed
+	 */
+	private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
+		for ( CountedObject<Indel> o :  indels ) {
+			if ( o.getCount() >= 2 ) return true;
+		}
+		return false;
+	}
+
+	private String formatRange(List<CountedObject<Indel>> indels) {
+		StringBuffer b = new StringBuffer();
+		StringBuffer all = new StringBuffer();
+		
+		long min = 1000000000;
+		long max = 0;
+		
+		for ( CountedObject<Indel> o :  indels ) {
+			if ( o.getCount() < 2 ) continue;
+            all.append(" ");
+            all.append(o.getObject().getIndelLength());
+			if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength();
+			if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength();
+		}
+		b.append(" min: ");
+		b.append(min);
+		b.append(" max: ");
+		b.append(max);
+		b.append(all);
+		return b.toString();
+	}
+
+    public void printLengthHistograms() {
+        if ( mIndelLengthHistD.size() < mIndelLengthHistI.size() ) {
+            for ( int i = mIndelLengthHistD.size(); i < mIndelLengthHistI.size(); i++ ) mIndelLengthHistD.add(0);
+        }
+        if ( mIndelLengthHistI.size() < mIndelLengthHistD.size() ) {
+            for ( int i = mIndelLengthHistI.size(); i < mIndelLengthHistD.size(); i++ ) mIndelLengthHistI.add(0);
+        }
+        System.out.println("length n_insertions n_deletions");
+        for ( int i = 0 ; i < mIndelLengthHistD.size(); i++  ) {
+            System.out.println((i+1)+" "+mIndelLengthHistI.get(i)+" "+mIndelLengthHistD.get(i));
+        }
+    }
+
+    /** Returns true iff the SAM record (or, strictly speaking, its cigar) has at least one insertion or deletion
+     *
+     * @param r record to analyze
+     * @return true if cigar contains at least one I or D element, false otherwise
+     */
+//    private boolean hasIndel(SAMRecord r) {
+//        Cigar c = r.getCigar();
+//        for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+//            CigarOperator co = c.getCigarElement(i).getOperator();
+//            if ( co.equals(CigarOperator.I) || co.equals(CigarOperator.D) ) {
+//                // we got an indel!
+//                return true;
+//            }
+//        }
+//        return false;
+//    }
+}