From 3565b50ff54e6833256144adaa0021762f783200 Mon Sep 17 00:00:00 2001 From: asivache Date: Fri, 20 Mar 2009 05:18:04 +0000 Subject: [PATCH] main class (argument processing and traversing the reference) and implementation of all the Receiver functionality for building read piles over indels git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@112 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/indels/IndelInspector.java | 274 ++++++++++ .../indels/IndelRecordPileCollector.java | 492 ++++++++++++++++++ 2 files changed, 766 insertions(+) create mode 100755 playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java create mode 100755 playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java new file mode 100755 index 000000000..7abe7cd24 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java @@ -0,0 +1,274 @@ +package org.broadinstitute.sting.indels; + +import java.io.File; +import java.util.List; +import java.util.Map; +import java.util.HashMap; + + +import javax.swing.JFileChooser; +import javax.swing.filechooser.FileNameExtensionFilter; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker; +import edu.mit.broad.picard.reference.ReferenceSequence; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLoc; + +public class IndelInspector extends CommandLineProgram { + + // Usage and parameters + @Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n"; + @Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE; + @Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION; + @Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE; + @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; +// @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; + + /** Required main method implementation. */ + public static void main(final String[] argv) { + System.exit(new IndelInspector().instanceMain(argv)); + } + + protected int doWork() { + + GenomeLoc location = null; + if ( GENOME_LOCATION != null ) { + location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); + } + + if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) { + System.out.println("Unknown value specified for ERR_MODE"); + return 1; + } + + final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1kG/")); + + setContigOrdering(samReader); + + ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker( + new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") + ); + + + ReferenceSequence contig_seq = null; + + IndelRecordPileCollector col = null; + try { + col = new IndelRecordPileCollector(); + } catch(Exception e) { System.err.println(e.getMessage()); } + if ( col == null ) return 1; + + String cur_contig = null; + int counter = 0; + + for ( SAMRecord r : samReader ) { + + if ( r.getReferenceName() != cur_contig) { + cur_contig = r.getReferenceName(); + System.out.println("Contig "+cur_contig); + // if contig is specified and we are past that contig, we are done: + if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break; + if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) { + contig_seq = reference.get(r.getReferenceIndex()); + System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString()); + } + } + + // if contig is specified and wqe did not reach it yet, skip the records until we reach that contig: + if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue; + + // if stop position is specified and we are past that, stop reading: + if ( location != null && r.getAlignmentStart() > location.getStop() ) break; + + if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now + + int err = -1; +/* + System.out.println("MM: "+numMismatches(r)); + System.out.println("direct: "+numMismatchesDirect(r,contig_seq)); + System.out.print(" "); + for ( int i = r.getAlignmentStart() - 1 ; i < r.getAlignmentEnd() ; i++ ) System.out.print((char)contig_seq.getBases()[i]); + System.out.println(); + System.out.println((r.getReadNegativeStrandFlag()?"<-":"->")+r.getReadString()); + System.out.println("cigar: "+r.getCigarString()); + System.out.println(); + if (counter++ == 20 ) break; + continue; +*/ + + if ( ERR_MODE.equals("MM")) err = numMismatches(r); + else if ( ERR_MODE.equals("ERR")) err = numErrors(r); + else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r); + if ( err > MAX_ERRS.intValue() ) continue; + counter++; + if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); + col.receive(r); + + } + System.out.println("done."); + col.printLengthHistograms(); + samReader.close(); + return 0; + } + + /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD + * + * @param r SAM record that must specify an alignment + * @return number of errors (number of mismatches plus total length of all insertions/deletions + * @throws RuntimeException + */ + private static int numErrors(SAMRecord r) throws RuntimeException { + + // NM currently stores the total number of mismatches in all blocks + 1 + int errs = numMismatches(r); + + // now we have to add the total length of all indels: + Cigar c = r.getCigar(); + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + switch( ce.getOperator()) { + case M : break; // we already have correct number of mismatches + case I : + case D : + errs += ce.getLength(); + break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + } + return errs; + } + + /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD + * + * @param r SAM record that must specify an alignment + * @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or + * deletion will be counted as a single error regardless of the length) + * @throws RuntimeException + */ + private static int numMismatchesGaps(SAMRecord r) throws RuntimeException { + + // NM currently stores the total number of mismatches in all blocks + 1 + int errs = numMismatches(r); + + // now we have to add the total length of all indels: + Cigar c = r.getCigar(); + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + switch( ce.getOperator()) { + case M : break; // we already have correct number of mismatches + case I : + case D : + errs++; + break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + } + return errs; + } + + private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) { + int i_ref = r.getAlignmentStart()-1; // position on the ref + int i_read = 0; // position on the read + int mm_count = 0; // number of mismatches + Cigar c = r.getCigar(); + for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { + CigarElement ce = c.getCigarElement(k); + switch( ce.getOperator() ) { + case M: + for ( int l = 0 ; l < ce.getLength() ; l++ ) { + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != + Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++; + i_ref++; + i_read++; + } + break; + case I: i_read += ce.getLength(); break; + case D: i_ref += ce.getLength(); break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + + } + return mm_count; + } + + /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */ + private static int numMismatches(SAMRecord r) throws RuntimeException { + + // NM currently stores the total number of mismatches in all blocks + 1 + return ((Integer)r.getAttribute("NM")).intValue() - 1; + + } + + /** Trivial utility method that goes some distance trying to ensure that the input file is there; + * the only purpose is reducing clutter in main(). Receives a default + * input file argument, does a few checks (e.g. that it is non-null and exists), if they fail tries + * to fire up a file chooser dialog using start_folder as initial directory, etc. + * @param default_arg some "default" input file; if it is non-null and exists, nothing else will be done, + * and the same default_arg objetc will be returned; otherwise the method will try to ask for a "better" input. + * @param start_folder should file open dialog be fired up, it will initially display this directory. + * @return File object that is not null and does exist (there is no check that it is a valid SAM/BAM file though). + */ + private File getInputFile(File default_arg, String start_folder) { + File f = default_arg; + if ( f==null || ! f.exists() ) { + JFileChooser fc = new JFileChooser(start_folder); + FileNameExtensionFilter ff = new FileNameExtensionFilter("SAM and BAM files","sam","bam"); + fc.setFileFilter(ff); + fc.setFileSelectionMode(JFileChooser.FILES_ONLY); + + int ret = fc.showOpenDialog(null); + f = fc.getSelectedFile(); + if ( ret != JFileChooser.APPROVE_OPTION ) { + System.out.println("No input file specified. Exiting..."); + System.exit(1); + } + } + + if ( f == null || ! f.exists() ) { + System.out.println("SAM or BAM input file must be specified. Exiting..."); + System.exit(1); + } + + return f; + } + + /** Auxiliary method to remove some clutter from main(); gets called only once and tries to get + * contig ordering from the header provided by opened SAM reader; if no header info is available + * falls back to default ordering; whichever ordering is used, it is set for GenomeLoc class. + * @param r sam reader to get header from + */ + private void setContigOrdering(SAMFileReader r) { + SAMFileHeader h = r.getFileHeader(); + if ( h == null ) { + System.out.println("No header found in SAM file, falling back to default contig ordering"); + setDefaultContigOrdering(); + return; + } + List seqs = h.getSequences(); + if ( seqs == null ) { + System.out.println("No reference sequence records found in SAM file header, " + + "falling back to default contig ordering"); + setDefaultContigOrdering(); + return; + } + int i = 0; + Map rco = new HashMap(); + for ( SAMSequenceRecord sr : seqs) { + rco.put(sr.getSequenceName(),i++); + } + GenomeLoc.setContigOrdering(rco); + } + + private void setDefaultContigOrdering() { + Map rco = new HashMap(); + rco.put("chrM",0); + for ( int i = 1 ; i <= 22 ; i++ ) rco.put("chr"+i,i); + rco.put("chrX",23); + rco.put("chrY",24); + } +} diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java new file mode 100755 index 000000000..c07ab1056 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java @@ -0,0 +1,492 @@ +package org.broadinstitute.sting.indels; + +import java.util.*; + + +import org.broadinstitute.sting.indels.Indel.IndelType; +import org.broadinstitute.sting.utils.*; +import net.sf.samtools.*; + +/** Ultimately, this class is a splitter for a stream of alignment records. It detects putative indels, or + * trains of sufficiently close indels, and sends the alignments two-way: those that do not overlap with any + * detected indels or trains of indels, and those that do. The latters are emitted in finished piles of + * all alignments that overlap with genomic interval of interest. This collector should be bound to + * and driven by an alignment traversal engine that sends alignment records one by one. + * + * NOTE 1: alignments must be sent to the collector strictly in the order + * of non-decreasing reference start position. + * + * NOTE 2: a train of indels is defined as a sequence of (putative) indels such that each pair of adjacent indels + * is overlapped by at least one alignment (that alignment does not have to have both indels in it, but only + * to span over both positions). A "genomic region of interest" is defined as the smallest interval + * containing all indels in the train, and all the alignments that overlap with that region will be collected + * into one pile. For instance, if reads of different length are present in the dataset, it is possible that two + * adjacent indels are overlapped by a single longer read (which stitches them into the train), but there are + * shorter reads that fall completely into the in-between region (so that they technically do not overlap with any + * indels in the train). According to the above definition, these shorter reads will be still emitted into the pile, + * since they overlap with the "region of interest". + * + * NOTE 3: due to performance/memory issues, the collector may refuse to assemble a pile over pathologically long + * train of indels. In this case, it will keep detecting the indel train in order to be able to understand what is + * going on and to recover later, but the reads will be sent to the "non-overlapping" output channel. + * + * In order to abstract and decouple the operation of emitting records, the collector expects to be bound to an + * implementation of RecordEmitter interface. It is the emitter's implementation that decides what to do with + * alignments of the two types (not related to indels vs. piles of alignments overlapping with indels). While + * this collector has some delay between receiving an alignment and being able to decide which way it should go, + * no records are ever discarded. + * + * Implementation note: + * + * In order to achive its goal, the collector has a state ('wait' or 'active') and always + * keeps a variable size "backlog" pile of alignments that were sent to it most recently. In 'wait' state collector + * has not detected any putative indels just yet. The backlog pile contains only alignments of "undecided fate": those + * that still might overlap with an indel should it be detected in the future. All alignments that end before the + * current position on the genome have their fate determined (as not overlapping with any indels) and emitted. + * When an indel is encountered, the collector flips into the 'active' state and from that moment on keeps all + * the alignments in the pile and collects information on the indels (their positions on the reference and numbers + * of observations). + * + * Since only alignments are sorted, but not indels (an indel in a later read may occur closer + * to its start and thus before a previously seen indel), and also because it is relatively difficult (TO_DO?)to break a + * pile in the middle immediately when it becomes clear that two adjacent indels might have been overlapped by a + * single read, but no such read ever surfaced, the collector is conservative at this stage and keeps + * accumulating the pile (and indel train) until it moves sufficiently far away from the last indel seen (full + * maximum read length is recommended). Then it switches back into wait state and performs post-processing + * of the indel train and the collected pile: only at this stage the preliminary pile is closely examined and if + * there are pairs of adjacent indels not spanned by any read, the pile is broken into smaller piles + * that conform to the contract outlined above. These piles are directed into the RecordEmitter, and the + * reads that fall in between the piles, if any (i.e. those that do not overlap with final indel trains + * determined at the post-processing stage) are relabeled as "not interesting" and redirected + * to the appropriate output channel. + * + * @author asivache + * + */ +public class IndelRecordPileCollector implements RecordReceiver { + + private final int WAIT_STATE = 0; + private final int ACTIVE_STATE = 1; + + private boolean avoiding_region; // some regions are too funky (contain very long indel trains)- + // we will determine their span and report them, + // but we won't be counting any indels there or building piles + + private List mRecordPile; // here we keep the records before we decide how we want to emit them + private TreeSet > mAllIndels; ///< individual indels encountered, with observation counts + private int mLastContig ; ///< keeps the index of the contig last alignment was on + private int mLastStartOnRef; ///< keeps the start position of the last alignment + private int mState; ///< WAIT_STATE or ACTIVE_STATE + private int mIndelSeparation; ///< Indels that are farther away from one another than this value + ///< will be emitted separately; trains of indels with less then + ///< mIndelSeparation bases between each adjacent pair will be emitted + ///< as one pile. + + // we will build histograms (distributions) of encountered indel lengths on the fly + private List mIndelLengthHistI; + private List mIndelLengthHistD; + + private RecordReceiver nonindelReceiver; // we will send there records that do not overlap with regions of interest + private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there + + public String memStatsString() { + String s = "mRecordPile: "; + return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef; + //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop; + } + + public IndelRecordPileCollector() throws java.io.IOException { + mRecordPile = new LinkedList(); + mAllIndels = new TreeSet >( + new CountedObjectComparatorAdapter(new IntervalComparator())); + mLastContig = -1; + mLastStartOnRef = -1; + mIndelSeparation = 51; + mIndelLengthHistI = new ArrayList(); + mIndelLengthHistD = new ArrayList(); + for ( int i = 0 ; i < 5 ; i++ ) { + mIndelLengthHistI.add(0); + mIndelLengthHistD.add(0); + } + nonindelReceiver = new DiscardingReceiver(); + indelPileReceiver = new DiscardingPileReceiver(); + setWaitState(); + } + + /** Fully reinitializes wait state: clears record pile and indel list, resets flags and states. + * Does not emit records, just clears/resets the variables. + */ + private void setWaitState() { + mRecordPile.clear(); + mAllIndels.clear(); +// mIndelRegionStart = 1000000000; +// mIndelRegionStop = -1; + avoiding_region = false; + mState = WAIT_STATE; // got to do this if we were in avoid_region state + } + + /** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile + * all the consequtive records with alignment end positions less than or equal to the specified + * position pos, until the first record is encountered that does not meet this condition. Note that + * there might be more alignments that end at or before pos later on in the pile, but + * they will nit be emitted/removed by this method. + * @param pos all leading records with alignments ending before or at this position will be purged from the pile, + * up to the first record that does not end at or before pos. + */ + protected void purgeRecordsEndingAtOrBefore(final long pos) { + Iterator i = mRecordPile.iterator(); + while ( i.hasNext() ) { + SAMRecord r = i.next(); + if ( r.getAlignmentEnd() <= pos ) { + nonindelReceiver.receive(r); + i.remove(); + } else break; + } + } + + /** A utility method: purges from the currently held SAM record pile all the records with alignment + * start positions greater than or equal to the specified position pos + * @param pos all records with alignments starting at or after this position will be purged from the pile + */ + protected void purgeRecordsStartingAtOrAfter(final int pos) { + Iterator i = mRecordPile.iterator(); + while ( i.hasNext() ) { + SAMRecord r = i.next(); + if ( r.getAlignmentStart() >= pos ) { + nonindelReceiver.receive(r); + i.remove(); + } else break; + } + } + + /** This is the main interface method of the collector: it receives alignments, inspects them, detects indels, + * updates and purges the read pile it keeps and emits alignments as needed. + * Depending on the state, the following behaviors are possible + * + *
    + *
  • If the collector is in wait state (no indels seen recently): all + * alignments that end prior to the start of currently inspected alignment can not overlap + * with any future indels, including those that may be present in the current alignment; these records + * get purged from the pile and emitted immediately. Current alignment gets added to the pile. + * If current alignment has indels, collector switches into 'active' state. + *
  • in active state: if the current alignment starts sufficiently far away from the last indel seen, + * examine the currently held pile closely, split into a few separate piles/indel trains if needed, emit and + * completely purge the pile, add alignment to the pile, switch to wait state if alignment has no indels or + * stay in active state if it does. Otherwise (alignment too close to last indel), + * just add alignment to the pile, since it is yet impossible to tell whether new indels are coming soon and + * indel train will need to be extended; if alignment does have indels of its own, add them + * to the current indel train + *
+ * + * This method checks that records arrive in reference-sorted order and throws RuntimeException if out-of-order + * record arrives. + * + * @param r + * @throws RuntimeException + */ + @Override + public void receive(final SAMRecord r) throws RuntimeException { + + if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do + + int currContig = r.getReferenceIndex(); + int currPos = r.getAlignmentStart(); + + if ( currContig < mLastContig ) throw new RuntimeException("SAM file is not ordered by contigs"); + if ( currContig == mLastContig && currPos < mLastStartOnRef ) throw new RuntimeException("SAM file is not ordered by start positions"); + + if ( currContig > mLastContig ) { + // we jumped onto a new contig; emit everything we might have been building and purge the piles: + emit(); + } else { // still on the same contig: + + switch (mState) { + // everything ending up to currPos is guaranteed to have no overlaps with indels yet to come + case WAIT_STATE: purgeRecordsEndingAtOrBefore(currPos); break; + + // next indel can start only after currPos (whether it is in the current read or in the + // reads yet to come). If it is far enough from the last indel we have seen, we can emit + case ACTIVE_STATE: if ( currPos - mAllIndels.last().getObject().getStop() > mIndelSeparation ) emit(); break; + default: throw new RuntimeException("Unknown state"); + } + } + + // does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active' + extractIndelsAndUpdateState(r.getCigar(),currPos); + + if ( ! avoiding_region && mAllIndels.size() > 20 ) avoiding_region = true; + + if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region + + mLastContig = currContig; + mLastStartOnRef = currPos; + + } + + /** Emits all reads from the currently held pile, cleans the pile and fully reinitializes wait state + * (clears indel list etc). + * + * If the current state is 'wait', simply sends all the records from the pile to nonindelReceiver before + * the cleanup. If the state is 'active', then performs final inspection of the pile built over a train of indels, + * splits the train (and the pile) into multiple trains/piles as needed (i.e. if there are pairs of adjacent + * indels that are not overlapped by any read), and emits the final piles of records into indelReceiver. + */ + private void emit() { + + if ( mState == WAIT_STATE || avoiding_region ) { + if ( avoiding_region ) { + long start = mAllIndels.first().getObject().getStart(); + long stop = mAllIndels.last().getObject().getStop(); + System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop + + " was ignored: "+mAllIndels.size() +" unique indels with average distance of "+ + ((double)(stop - start))/((double)mAllIndels.size()-1) + + " bases between indels"); + } + + // no indels or avoiding indels in bad region: send all records to nonindelReceiver and clear the pile + for ( SAMRecord r : mRecordPile ) nonindelReceiver.receive(r); + setWaitState(); + return; + } + + // last minute cleanup: + // at this stage we have all the indels collected conservatively (in a sense + // that they can be farther away than it is needed) - this means that there actually + // can be more than one pile in what we have stored. Also, we can still have gapless reads + // at the ends of the piles that do not really overlap with indel sites. + + if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels"); + + HistogramAsNeeded(mAllIndels); + + + // indels are in a sorted map, and reads were added to the pile in the order they were received (also sorted). + // we will traverse the two collections in parallel and detect exactly where we can break the indel train into + // subtrains + Iterator > i_iter = mAllIndels.iterator(); + + // will keep list of indels and list of records, respectively, in one final train + List< CountedObject > finalTrain = new ArrayList>(); + List< SAMRecord > finalPile = new ArrayList(); + + long curr_stop = -1; // the rightmost stop position among all the alignments seen so far + + CountedObject indel = i_iter.next(); // we checked that list of indels contains at least one element! + + SAMRecord record ; + + while ( indel != null ) { + + // first, if we just started new indel train, then emit into nonindelReceiver all alignments + // that end prior to the first indel in the train: + if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1); + + finalTrain.add(indel); + + Iterator r_iter = mRecordPile.iterator(); + + if ( r_iter.hasNext() ) record = r_iter.next(); + else record = null; + + // record now contains first alignment that ends in or after the indel, or null if there are no more records + + // now collect all the alignments that overlap with the current indel (start before or inside) and + // record the rightmost alignment stop position: + while ( record != null && record.getAlignmentStart() <= indel.getObject().getStop() ) { + finalPile.add(record); + r_iter.remove(); // remove from the original pile the record we just moved to the current final pile + curr_stop = Math.max(curr_stop, record.getAlignmentEnd()); + if ( r_iter.hasNext() ) record = r_iter.next(); + else record = null; + } + + // record is now the first alignment that starts after the indel, or null if there are no more records + + // we are done with current indel, get next one if any: + if ( i_iter.hasNext() ) { + indel = i_iter.next(); + if ( curr_stop < indel.getObject().getStart() ) { + // all alignments that overlapped with the previous indel ended before the current indel started, + // this means that the current train and pile of reads overlapping with it are fully built; + // emit into indel receiver if the train is interesting enough, or into the nonindel receiver: + + System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" + + finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " + + finalTrain.size() + " indels"); + System.out.println(formatRange(finalTrain)); + + if ( shouldAcceptForOutput(finalTrain ) ) indelPileReceiver.receive(finalPile); + else for ( SAMRecord r : finalPile ) nonindelReceiver.receive(r); + finalPile.clear(); + finalTrain.clear(); + curr_stop = -1; + } // ELSE: otherwise we have reads that overlap with both previous and current indel, so we just continue + } else indel = null; + } + + setWaitState(); + } + + + /** Looks for indels in the cigar and, if finds any, updates list of indels in the current train ans sets + * the state to 'active'. If cigar contains no indels, this method does not do anything (it does not + * set state back to 'wait' either!). If this method finds any indels in the cigar, it first tries to find them + * in the list of previously seen indels. If the indel was already seen before, its counter is updated (indels + * are stored in the list as counted objects), oherwise indel is added to the list with initial count of 1. + * + * @param c alignment cigar; if it contains no indels, nothing will be done + * @param start position, at which the alignment represented by cigar c starts on the reference + */ + private void extractIndelsAndUpdateState(final Cigar c, final int start) { + // + // firstpos,lastpos span of the indel will be interpreted as follows: + // any alignment that ends strictly before firstpos or starts strictly after lastpos + // on the *reference* (not inclusive!) does not overlap with an indel; in the case of + // insertion it will result in firstpos > lastpos! + // lastpos + // | firstpos + // | | + // v v + // ---------III----- Ref Insertion: bases I are not in the ref; any alignment that starts + // after lastpos or ends before firstpos *on the reference* + // is completely over the reference bases to the right or to + // the left, respectively, of the insertion site + // + // firstpos + // | lastpos + // | | + // v v + //------------------ Ref Deletion: any alignment that ends before firstpos or starts after lastpos + // -----DDD--- alignment on the reference does not overlap with the deletion + int runninglength = start; // position on the original reference; start = alignment start position + + if ( c.numCigarElements() == 1 ) return; // most of the reads have no indels, save a few cycles by returning early + + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + + final CigarElement ce = c.getCigarElement(i); + Indel indel = null; + + switch(ce.getOperator()) { + case I: indel = new Indel(runninglength, ce.getLength(), IndelType.I); break; + case D: indel = new Indel(runninglength, ce.getLength(), IndelType.D); + runninglength += ce.getLength(); + break; + case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment + default : + throw new IllegalArgumentException("Unexpected operator in cigar string"); + } + + if ( indel == null ) continue; // element was not an indel, go grab next element... + + mState = ACTIVE_STATE; // this is redundant and will be executed unnecessarily many times, but it's cheap... + + CountedObject indelWithCount = new CountedObject(indel); + CountedObject found = mAllIndels.floor(indelWithCount); + + if ( indelWithCount.equals( found ) ) found.increment(); // we did find our indel, advance the counter + else mAllIndels.add(indelWithCount); // this is a new indel. Add it. + } // end for loop over all alignment cigar elements + + } // end extractIndels() method + + + + /** Counts the size of the passed argument into the appropriate size histogram + * + * @param indel size of this indel will be counted in + */ + private void addToSizeHistogram(Indel indel) { + // count this indel's size into the appropriate bin of the appropriate histogram + // (we count insertions and deletions separately), resizing the histogram array if needed: + List histogram; + if ( indel.getType() == Indel.IndelType.D ) { + histogram = mIndelLengthHistD; + } else if ( indel.getType() == Indel.IndelType.I ) { + histogram = mIndelLengthHistI; + } else { + throw new RuntimeException("Indel of unknown type"); + } + if( indel.getIndelLength() > histogram.size() ) { + for ( int j = histogram.size() ; j < indel.getIndelLength() ; j++ ) histogram.add(0); + histogram.set((int)indel.getIndelLength()-1, 1); // we are seeing this length for the first time, so count == 1 + } else { + int n = histogram.get((int)indel.getIndelLength()-1); + histogram.set((int)indel.getIndelLength()-1, n+1); + } + } + + /** Adds sizes of the indels from the list that pass some filters to the histograms + * + * @param indels collection of indels with counts + */ + private void HistogramAsNeeded(Collection> indels) { + for ( CountedObject o : indels ) { + if ( o.getCount() >= 2 ) addToSizeHistogram(o.getObject()); + } + } + + /** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable + * if it contains at least one indel onbserved more than once. + * @param indels list of indels with counts to check for being acceptable + * @return true if the indel run has to be printed + */ + private boolean shouldAcceptForOutput(List> indels) { + for ( CountedObject o : indels ) { + if ( o.getCount() >= 2 ) return true; + } + return false; + } + + private String formatRange(List> indels) { + StringBuffer b = new StringBuffer(); + StringBuffer all = new StringBuffer(); + + long min = 1000000000; + long max = 0; + + for ( CountedObject o : indels ) { + if ( o.getCount() < 2 ) continue; + all.append(" "); + all.append(o.getObject().getIndelLength()); + if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength(); + if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength(); + } + b.append(" min: "); + b.append(min); + b.append(" max: "); + b.append(max); + b.append(all); + return b.toString(); + } + + public void printLengthHistograms() { + if ( mIndelLengthHistD.size() < mIndelLengthHistI.size() ) { + for ( int i = mIndelLengthHistD.size(); i < mIndelLengthHistI.size(); i++ ) mIndelLengthHistD.add(0); + } + if ( mIndelLengthHistI.size() < mIndelLengthHistD.size() ) { + for ( int i = mIndelLengthHistI.size(); i < mIndelLengthHistD.size(); i++ ) mIndelLengthHistI.add(0); + } + System.out.println("length n_insertions n_deletions"); + for ( int i = 0 ; i < mIndelLengthHistD.size(); i++ ) { + System.out.println((i+1)+" "+mIndelLengthHistI.get(i)+" "+mIndelLengthHistD.get(i)); + } + } + + /** Returns true iff the SAM record (or, strictly speaking, its cigar) has at least one insertion or deletion + * + * @param r record to analyze + * @return true if cigar contains at least one I or D element, false otherwise + */ +// private boolean hasIndel(SAMRecord r) { +// Cigar c = r.getCigar(); +// for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { +// CigarOperator co = c.getCigarElement(i).getOperator(); +// if ( co.equals(CigarOperator.I) || co.equals(CigarOperator.D) ) { +// // we got an indel! +// return true; +// } +// } +// return false; +// } +}