main class (argument processing and traversing the reference) and implementation of all the Receiver functionality for building read piles over indels

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@112 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-03-20 05:18:04 +00:00
parent 4c3b92b860
commit 3565b50ff5
2 changed files with 766 additions and 0 deletions

View File

@ -0,0 +1,274 @@
package org.broadinstitute.sting.indels;
import java.io.File;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import javax.swing.JFileChooser;
import javax.swing.filechooser.FileNameExtensionFilter;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
import edu.mit.broad.picard.reference.ReferenceSequence;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLoc;
public class IndelInspector extends CommandLineProgram {
// Usage and parameters
@Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n";
@Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE;
@Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION;
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
// @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
/** Required main method implementation. */
public static void main(final String[] argv) {
System.exit(new IndelInspector().instanceMain(argv));
}
protected int doWork() {
GenomeLoc location = null;
if ( GENOME_LOCATION != null ) {
location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
}
if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) {
System.out.println("Unknown value specified for ERR_MODE");
return 1;
}
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1kG/"));
setContigOrdering(samReader);
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
);
ReferenceSequence contig_seq = null;
IndelRecordPileCollector col = null;
try {
col = new IndelRecordPileCollector();
} catch(Exception e) { System.err.println(e.getMessage()); }
if ( col == null ) return 1;
String cur_contig = null;
int counter = 0;
for ( SAMRecord r : samReader ) {
if ( r.getReferenceName() != cur_contig) {
cur_contig = r.getReferenceName();
System.out.println("Contig "+cur_contig);
// if contig is specified and we are past that contig, we are done:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
contig_seq = reference.get(r.getReferenceIndex());
System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
}
}
// if contig is specified and wqe did not reach it yet, skip the records until we reach that contig:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
// if stop position is specified and we are past that, stop reading:
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now
int err = -1;
/*
System.out.println("MM: "+numMismatches(r));
System.out.println("direct: "+numMismatchesDirect(r,contig_seq));
System.out.print(" ");
for ( int i = r.getAlignmentStart() - 1 ; i < r.getAlignmentEnd() ; i++ ) System.out.print((char)contig_seq.getBases()[i]);
System.out.println();
System.out.println((r.getReadNegativeStrandFlag()?"<-":"->")+r.getReadString());
System.out.println("cigar: "+r.getCigarString());
System.out.println();
if (counter++ == 20 ) break;
continue;
*/
if ( ERR_MODE.equals("MM")) err = numMismatches(r);
else if ( ERR_MODE.equals("ERR")) err = numErrors(r);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r);
if ( err > MAX_ERRS.intValue() ) continue;
counter++;
if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
col.receive(r);
}
System.out.println("done.");
col.printLengthHistograms();
samReader.close();
return 0;
}
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
*
* @param r SAM record that must specify an alignment
* @return number of errors (number of mismatches plus total length of all insertions/deletions
* @throws RuntimeException
*/
private static int numErrors(SAMRecord r) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
int errs = numMismatches(r);
// now we have to add the total length of all indels:
Cigar c = r.getCigar();
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator()) {
case M : break; // we already have correct number of mismatches
case I :
case D :
errs += ce.getLength();
break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return errs;
}
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
*
* @param r SAM record that must specify an alignment
* @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or
* deletion will be counted as a single error regardless of the length)
* @throws RuntimeException
*/
private static int numMismatchesGaps(SAMRecord r) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
int errs = numMismatches(r);
// now we have to add the total length of all indels:
Cigar c = r.getCigar();
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator()) {
case M : break; // we already have correct number of mismatches
case I :
case D :
errs++;
break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return errs;
}
private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) {
int i_ref = r.getAlignmentStart()-1; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++;
i_ref++;
i_read++;
}
break;
case I: i_read += ce.getLength(); break;
case D: i_ref += ce.getLength(); break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return mm_count;
}
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
private static int numMismatches(SAMRecord r) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
return ((Integer)r.getAttribute("NM")).intValue() - 1;
}
/** Trivial utility method that goes some distance trying to ensure that the input file is there;
* the only purpose is reducing clutter in main(). Receives a default
* input file argument, does a few checks (e.g. that it is non-null and exists), if they fail tries
* to fire up a file chooser dialog using start_folder as initial directory, etc.
* @param default_arg some "default" input file; if it is non-null and exists, nothing else will be done,
* and the same default_arg objetc will be returned; otherwise the method will try to ask for a "better" input.
* @param start_folder should file open dialog be fired up, it will initially display this directory.
* @return File object that is not null and does exist (there is no check that it is a valid SAM/BAM file though).
*/
private File getInputFile(File default_arg, String start_folder) {
File f = default_arg;
if ( f==null || ! f.exists() ) {
JFileChooser fc = new JFileChooser(start_folder);
FileNameExtensionFilter ff = new FileNameExtensionFilter("SAM and BAM files","sam","bam");
fc.setFileFilter(ff);
fc.setFileSelectionMode(JFileChooser.FILES_ONLY);
int ret = fc.showOpenDialog(null);
f = fc.getSelectedFile();
if ( ret != JFileChooser.APPROVE_OPTION ) {
System.out.println("No input file specified. Exiting...");
System.exit(1);
}
}
if ( f == null || ! f.exists() ) {
System.out.println("SAM or BAM input file must be specified. Exiting...");
System.exit(1);
}
return f;
}
/** Auxiliary method to remove some clutter from main(); gets called only once and tries to get
* contig ordering from the header provided by opened SAM reader; if no header info is available
* falls back to default ordering; whichever ordering is used, it is set for GenomeLoc class.
* @param r sam reader to get header from
*/
private void setContigOrdering(SAMFileReader r) {
SAMFileHeader h = r.getFileHeader();
if ( h == null ) {
System.out.println("No header found in SAM file, falling back to default contig ordering");
setDefaultContigOrdering();
return;
}
List<SAMSequenceRecord> seqs = h.getSequences();
if ( seqs == null ) {
System.out.println("No reference sequence records found in SAM file header, " +
"falling back to default contig ordering");
setDefaultContigOrdering();
return;
}
int i = 0;
Map<String,Integer> rco = new HashMap<String,Integer>();
for ( SAMSequenceRecord sr : seqs) {
rco.put(sr.getSequenceName(),i++);
}
GenomeLoc.setContigOrdering(rco);
}
private void setDefaultContigOrdering() {
Map<String,Integer> rco = new HashMap<String,Integer>();
rco.put("chrM",0);
for ( int i = 1 ; i <= 22 ; i++ ) rco.put("chr"+i,i);
rco.put("chrX",23);
rco.put("chrY",24);
}
}

View File

@ -0,0 +1,492 @@
package org.broadinstitute.sting.indels;
import java.util.*;
import org.broadinstitute.sting.indels.Indel.IndelType;
import org.broadinstitute.sting.utils.*;
import net.sf.samtools.*;
/** Ultimately, this class is a splitter for a stream of alignment records. It detects putative indels, or
* trains of sufficiently close indels, and sends the alignments two-way: those that do not overlap with any
* detected indels or trains of indels, and those that do. The latters are emitted in finished piles of
* all alignments that overlap with genomic interval of interest. This collector should be bound to
* and driven by an alignment traversal engine that sends alignment records one by one.
*
* NOTE 1: alignments must be sent to the collector strictly in the order
* of non-decreasing reference start position.
*
* NOTE 2: a train of indels is defined as a sequence of (putative) indels such that each pair of adjacent indels
* is overlapped by at least one alignment (that alignment does not have to have both indels in it, but only
* to span over both positions). A "genomic region of interest" is defined as the smallest interval
* containing all indels in the train, and <i>all</i> the alignments that overlap with that region will be collected
* into one pile. For instance, if reads of different length are present in the dataset, it is possible that two
* adjacent indels are overlapped by a single longer read (which stitches them into the train), but there are
* shorter reads that fall completely into the in-between region (so that they technically do not overlap with any
* indels in the train). According to the above definition, these shorter reads will be still emitted into the pile,
* since they overlap with the "region of interest".
*
* NOTE 3: due to performance/memory issues, the collector may refuse to assemble a pile over pathologically long
* train of indels. In this case, it will keep detecting the indel train in order to be able to understand what is
* going on and to recover later, but the reads will be sent to the "non-overlapping" output channel.
*
* In order to abstract and decouple the operation of emitting records, the collector expects to be bound to an
* implementation of RecordEmitter interface. It is the emitter's implementation that decides what to do with
* alignments of the two types (not related to indels vs. piles of alignments overlapping with indels). While
* this collector has some delay between receiving an alignment and being able to decide which way it should go,
* no records are ever discarded.
*
* Implementation note:
*
* In order to achive its goal, the collector has a state ('wait' or 'active') and always
* keeps a variable size "backlog" pile of alignments that were sent to it most recently. In 'wait' state collector
* has not detected any putative indels just yet. The backlog pile contains only alignments of "undecided fate": those
* that still might overlap with an indel should it be detected in the future. All alignments that end before the
* current position on the genome have their fate determined (as not overlapping with any indels) and emitted.
* When an indel is encountered, the collector flips into the 'active' state and from that moment on keeps all
* the alignments in the pile and collects information on the indels (their positions on the reference and numbers
* of observations).
*
* Since only alignments are sorted, but not indels (an indel in a later read may occur closer
* to its start and thus before a previously seen indel), and also because it is relatively difficult (TO_DO?)to break a
* pile in the middle immediately when it becomes clear that two adjacent indels might have been overlapped by a
* single read, but no such read ever surfaced, the collector is conservative at this stage and keeps
* accumulating the pile (and indel train) until it moves sufficiently far away from the last indel seen (full
* maximum read length is recommended). Then it switches back into wait state and performs post-processing
* of the indel train and the collected pile: only at this stage the preliminary pile is closely examined and if
* there are pairs of adjacent indels not spanned by any read, the pile is broken into smaller piles
* that conform to the contract outlined above. These piles are directed into the RecordEmitter, and the
* reads that fall in between the piles, if any (i.e. those that do not overlap with final indel trains
* determined at the post-processing stage) are relabeled as "not interesting" and redirected
* to the appropriate output channel.
*
* @author asivache
*
*/
public class IndelRecordPileCollector implements RecordReceiver {
private final int WAIT_STATE = 0;
private final int ACTIVE_STATE = 1;
private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
// we will determine their span and report them,
// but we won't be counting any indels there or building piles
private List<SAMRecord> mRecordPile; // here we keep the records before we decide how we want to emit them
private TreeSet<CountedObject<Indel> > mAllIndels; ///< individual indels encountered, with observation counts
private int mLastContig ; ///< keeps the index of the contig last alignment was on
private int mLastStartOnRef; ///< keeps the start position of the last alignment
private int mState; ///< WAIT_STATE or ACTIVE_STATE
private int mIndelSeparation; ///< Indels that are farther away from one another than this value
///< will be emitted separately; trains of indels with less then
///< mIndelSeparation bases between each adjacent pair will be emitted
///< as one pile.
// we will build histograms (distributions) of encountered indel lengths on the fly
private List<Integer> mIndelLengthHistI;
private List<Integer> mIndelLengthHistD;
private RecordReceiver nonindelReceiver; // we will send there records that do not overlap with regions of interest
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
public String memStatsString() {
String s = "mRecordPile: ";
return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef;
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
}
public IndelRecordPileCollector() throws java.io.IOException {
mRecordPile = new LinkedList<SAMRecord>();
mAllIndels = new TreeSet<CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
mLastContig = -1;
mLastStartOnRef = -1;
mIndelSeparation = 51;
mIndelLengthHistI = new ArrayList<Integer>();
mIndelLengthHistD = new ArrayList<Integer>();
for ( int i = 0 ; i < 5 ; i++ ) {
mIndelLengthHistI.add(0);
mIndelLengthHistD.add(0);
}
nonindelReceiver = new DiscardingReceiver();
indelPileReceiver = new DiscardingPileReceiver();
setWaitState();
}
/** Fully reinitializes wait state: clears record pile and indel list, resets flags and states.
* Does not emit records, just clears/resets the variables.
*/
private void setWaitState() {
mRecordPile.clear();
mAllIndels.clear();
// mIndelRegionStart = 1000000000;
// mIndelRegionStop = -1;
avoiding_region = false;
mState = WAIT_STATE; // got to do this if we were in avoid_region state
}
/** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile
* all the consequtive records with alignment end positions less than or equal to the specified
* position <code>pos</code>, until the first record is encountered that does not meet this condition. Note that
* there might be more alignments that end at or before <code>pos</code> later on in the pile, but
* they will <i> nit</i> be emitted/removed by this method.
* @param pos all leading records with alignments ending before or at this position will be purged from the pile,
* up to the first record that does not end at or before pos.
*/
protected void purgeRecordsEndingAtOrBefore(final long pos) {
Iterator<SAMRecord> i = mRecordPile.iterator();
while ( i.hasNext() ) {
SAMRecord r = i.next();
if ( r.getAlignmentEnd() <= pos ) {
nonindelReceiver.receive(r);
i.remove();
} else break;
}
}
/** A utility method: purges from the currently held SAM record pile all the records with alignment
* start positions greater than or equal to the specified position <code>pos</code>
* @param pos all records with alignments starting at or after this position will be purged from the pile
*/
protected void purgeRecordsStartingAtOrAfter(final int pos) {
Iterator<SAMRecord> i = mRecordPile.iterator();
while ( i.hasNext() ) {
SAMRecord r = i.next();
if ( r.getAlignmentStart() >= pos ) {
nonindelReceiver.receive(r);
i.remove();
} else break;
}
}
/** This is the main interface method of the collector: it receives alignments, inspects them, detects indels,
* updates and purges the read pile it keeps and emits alignments as needed.
* Depending on the state, the following behaviors are possible
*
* <ul>
* <li> If the collector is in wait state (no indels seen recently): all
* alignments that end prior to the start of currently inspected alignment can not overlap
* with any future indels, including those that may be present in the current alignment; these records
* get purged from the pile and emitted immediately. Current alignment gets added to the pile.
* If current alignment has indels, collector switches into 'active' state.
* <li> in active state: if the current alignment starts sufficiently far away from the last indel seen,
* examine the currently held pile closely, split into a few separate piles/indel trains if needed, emit and
* completely purge the pile, add alignment to the pile, switch to wait state if alignment has no indels or
* stay in active state if it does. Otherwise (alignment too close to last indel),
* just add alignment to the pile, since it is yet impossible to tell whether new indels are coming soon and
* indel train will need to be extended; if alignment does have indels of its own, add them
* to the current indel train
* </ul>
*
* This method checks that records arrive in reference-sorted order and throws RuntimeException if out-of-order
* record arrives.
*
* @param r
* @throws RuntimeException
*/
@Override
public void receive(final SAMRecord r) throws RuntimeException {
if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do
int currContig = r.getReferenceIndex();
int currPos = r.getAlignmentStart();
if ( currContig < mLastContig ) throw new RuntimeException("SAM file is not ordered by contigs");
if ( currContig == mLastContig && currPos < mLastStartOnRef ) throw new RuntimeException("SAM file is not ordered by start positions");
if ( currContig > mLastContig ) {
// we jumped onto a new contig; emit everything we might have been building and purge the piles:
emit();
} else { // still on the same contig:
switch (mState) {
// everything ending up to currPos is guaranteed to have no overlaps with indels yet to come
case WAIT_STATE: purgeRecordsEndingAtOrBefore(currPos); break;
// next indel can start only after currPos (whether it is in the current read or in the
// reads yet to come). If it is far enough from the last indel we have seen, we can emit
case ACTIVE_STATE: if ( currPos - mAllIndels.last().getObject().getStop() > mIndelSeparation ) emit(); break;
default: throw new RuntimeException("Unknown state");
}
}
// does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active'
extractIndelsAndUpdateState(r.getCigar(),currPos);
if ( ! avoiding_region && mAllIndels.size() > 20 ) avoiding_region = true;
if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region
mLastContig = currContig;
mLastStartOnRef = currPos;
}
/** Emits all reads from the currently held pile, cleans the pile and fully reinitializes wait state
* (clears indel list etc).
*
* If the current state is 'wait', simply sends all the records from the pile to nonindelReceiver before
* the cleanup. If the state is 'active', then performs final inspection of the pile built over a train of indels,
* splits the train (and the pile) into multiple trains/piles as needed (i.e. if there are pairs of adjacent
* indels that are not overlapped by any read), and emits the final piles of records into indelReceiver.
*/
private void emit() {
if ( mState == WAIT_STATE || avoiding_region ) {
if ( avoiding_region ) {
long start = mAllIndels.first().getObject().getStart();
long stop = mAllIndels.last().getObject().getStop();
System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
" was ignored: "+mAllIndels.size() +" unique indels with average distance of "+
((double)(stop - start))/((double)mAllIndels.size()-1) +
" bases between indels");
}
// no indels or avoiding indels in bad region: send all records to nonindelReceiver and clear the pile
for ( SAMRecord r : mRecordPile ) nonindelReceiver.receive(r);
setWaitState();
return;
}
// last minute cleanup:
// at this stage we have all the indels collected conservatively (in a sense
// that they can be farther away than it is needed) - this means that there actually
// can be more than one pile in what we have stored. Also, we can still have gapless reads
// at the ends of the piles that do not really overlap with indel sites.
if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
HistogramAsNeeded(mAllIndels);
// indels are in a sorted map, and reads were added to the pile in the order they were received (also sorted).
// we will traverse the two collections in parallel and detect exactly where we can break the indel train into
// subtrains
Iterator<CountedObject<Indel> > i_iter = mAllIndels.iterator();
// will keep list of indels and list of records, respectively, in one final train
List< CountedObject<Indel> > finalTrain = new ArrayList<CountedObject<Indel>>();
List< SAMRecord > finalPile = new ArrayList<SAMRecord>();
long curr_stop = -1; // the rightmost stop position among all the alignments seen so far
CountedObject<Indel> indel = i_iter.next(); // we checked that list of indels contains at least one element!
SAMRecord record ;
while ( indel != null ) {
// first, if we just started new indel train, then emit into nonindelReceiver all alignments
// that end prior to the first indel in the train:
if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1);
finalTrain.add(indel);
Iterator<SAMRecord> r_iter = mRecordPile.iterator();
if ( r_iter.hasNext() ) record = r_iter.next();
else record = null;
// record now contains first alignment that ends in or after the indel, or null if there are no more records
// now collect all the alignments that overlap with the current indel (start before or inside) and
// record the rightmost alignment stop position:
while ( record != null && record.getAlignmentStart() <= indel.getObject().getStop() ) {
finalPile.add(record);
r_iter.remove(); // remove from the original pile the record we just moved to the current final pile
curr_stop = Math.max(curr_stop, record.getAlignmentEnd());
if ( r_iter.hasNext() ) record = r_iter.next();
else record = null;
}
// record is now the first alignment that starts after the indel, or null if there are no more records
// we are done with current indel, get next one if any:
if ( i_iter.hasNext() ) {
indel = i_iter.next();
if ( curr_stop < indel.getObject().getStart() ) {
// all alignments that overlapped with the previous indel ended before the current indel started,
// this means that the current train and pile of reads overlapping with it are fully built;
// emit into indel receiver if the train is interesting enough, or into the nonindel receiver:
System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" +
finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " +
finalTrain.size() + " indels");
System.out.println(formatRange(finalTrain));
if ( shouldAcceptForOutput(finalTrain ) ) indelPileReceiver.receive(finalPile);
else for ( SAMRecord r : finalPile ) nonindelReceiver.receive(r);
finalPile.clear();
finalTrain.clear();
curr_stop = -1;
} // ELSE: otherwise we have reads that overlap with both previous and current indel, so we just continue
} else indel = null;
}
setWaitState();
}
/** Looks for indels in the cigar and, if finds any, updates list of indels in the current train ans sets
* the state to 'active'. If cigar contains no indels, this method does not do anything (it does <i>not</i>
* set state back to 'wait' either!). If this method finds any indels in the cigar, it first tries to find them
* in the list of previously seen indels. If the indel was already seen before, its counter is updated (indels
* are stored in the list as counted objects), oherwise indel is added to the list with initial count of 1.
*
* @param c alignment cigar; if it contains no indels, nothing will be done
* @param start position, at which the alignment represented by cigar <code>c</code> starts on the reference
*/
private void extractIndelsAndUpdateState(final Cigar c, final int start) {
//
// firstpos,lastpos span of the indel will be interpreted as follows:
// any alignment that ends strictly before firstpos or starts strictly after lastpos
// on the *reference* (not inclusive!) does not overlap with an indel; in the case of
// insertion it will result in firstpos > lastpos!
// lastpos
// | firstpos
// | |
// v v
// ---------III----- Ref Insertion: bases I are not in the ref; any alignment that starts
// after lastpos or ends before firstpos *on the reference*
// is completely over the reference bases to the right or to
// the left, respectively, of the insertion site
//
// firstpos
// | lastpos
// | |
// v v
//------------------ Ref Deletion: any alignment that ends before firstpos or starts after lastpos
// -----DDD--- alignment on the reference does not overlap with the deletion
int runninglength = start; // position on the original reference; start = alignment start position
if ( c.numCigarElements() == 1 ) return; // most of the reads have no indels, save a few cycles by returning early
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
final CigarElement ce = c.getCigarElement(i);
Indel indel = null;
switch(ce.getOperator()) {
case I: indel = new Indel(runninglength, ce.getLength(), IndelType.I); break;
case D: indel = new Indel(runninglength, ce.getLength(), IndelType.D);
runninglength += ce.getLength();
break;
case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
default :
throw new IllegalArgumentException("Unexpected operator in cigar string");
}
if ( indel == null ) continue; // element was not an indel, go grab next element...
mState = ACTIVE_STATE; // this is redundant and will be executed unnecessarily many times, but it's cheap...
CountedObject<Indel> indelWithCount = new CountedObject<Indel>(indel);
CountedObject<Indel> found = mAllIndels.floor(indelWithCount);
if ( indelWithCount.equals( found ) ) found.increment(); // we did find our indel, advance the counter
else mAllIndels.add(indelWithCount); // this is a new indel. Add it.
} // end for loop over all alignment cigar elements
} // end extractIndels() method
/** Counts the size of the passed <indel> argument into the appropriate size histogram
*
* @param indel size of this indel will be counted in
*/
private void addToSizeHistogram(Indel indel) {
// count this indel's size into the appropriate bin of the appropriate histogram
// (we count insertions and deletions separately), resizing the histogram array if needed:
List<Integer> histogram;
if ( indel.getType() == Indel.IndelType.D ) {
histogram = mIndelLengthHistD;
} else if ( indel.getType() == Indel.IndelType.I ) {
histogram = mIndelLengthHistI;
} else {
throw new RuntimeException("Indel of unknown type");
}
if( indel.getIndelLength() > histogram.size() ) {
for ( int j = histogram.size() ; j < indel.getIndelLength() ; j++ ) histogram.add(0);
histogram.set((int)indel.getIndelLength()-1, 1); // we are seeing this length for the first time, so count == 1
} else {
int n = histogram.get((int)indel.getIndelLength()-1);
histogram.set((int)indel.getIndelLength()-1, n+1);
}
}
/** Adds sizes of the indels from the list that pass some filters to the histograms
*
* @param indels collection of indels with counts
*/
private void HistogramAsNeeded(Collection<CountedObject<Indel>> indels) {
for ( CountedObject<Indel> o : indels ) {
if ( o.getCount() >= 2 ) addToSizeHistogram(o.getObject());
}
}
/** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
* if it contains at least one indel onbserved more than once.
* @param indels list of indels with counts to check for being acceptable
* @return true if the indel run has to be printed
*/
private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
for ( CountedObject<Indel> o : indels ) {
if ( o.getCount() >= 2 ) return true;
}
return false;
}
private String formatRange(List<CountedObject<Indel>> indels) {
StringBuffer b = new StringBuffer();
StringBuffer all = new StringBuffer();
long min = 1000000000;
long max = 0;
for ( CountedObject<Indel> o : indels ) {
if ( o.getCount() < 2 ) continue;
all.append(" ");
all.append(o.getObject().getIndelLength());
if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength();
if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength();
}
b.append(" min: ");
b.append(min);
b.append(" max: ");
b.append(max);
b.append(all);
return b.toString();
}
public void printLengthHistograms() {
if ( mIndelLengthHistD.size() < mIndelLengthHistI.size() ) {
for ( int i = mIndelLengthHistD.size(); i < mIndelLengthHistI.size(); i++ ) mIndelLengthHistD.add(0);
}
if ( mIndelLengthHistI.size() < mIndelLengthHistD.size() ) {
for ( int i = mIndelLengthHistI.size(); i < mIndelLengthHistD.size(); i++ ) mIndelLengthHistI.add(0);
}
System.out.println("length n_insertions n_deletions");
for ( int i = 0 ; i < mIndelLengthHistD.size(); i++ ) {
System.out.println((i+1)+" "+mIndelLengthHistI.get(i)+" "+mIndelLengthHistD.get(i));
}
}
/** Returns true iff the SAM record (or, strictly speaking, its cigar) has at least one insertion or deletion
*
* @param r record to analyze
* @return true if cigar contains at least one I or D element, false otherwise
*/
// private boolean hasIndel(SAMRecord r) {
// Cigar c = r.getCigar();
// for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
// CigarOperator co = c.getCigarElement(i).getOperator();
// if ( co.equals(CigarOperator.I) || co.equals(CigarOperator.D) ) {
// // we got an indel!
// return true;
// }
// }
// return false;
// }
}