diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspectorMain.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspectorMain.java index b066b1c35..83e69c035 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspectorMain.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspectorMain.java @@ -31,6 +31,7 @@ public class IndelInspectorMain extends CommandLineProgram { @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; @Option(doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)", optional=true) public Integer MAX_READ_LENGTH; + @Option(doc="Realignment will be attempted around trains of indels with at least of them observed COUNT_CUTOFF times or more",optional=true) public Integer COUNT_CUTOFF; /** Required main method implementation. */ public static void main(final String[] argv) { @@ -41,6 +42,9 @@ public class IndelInspectorMain extends CommandLineProgram { int discarded_cigar_count = 0; int discarded_long_read_count = 0; + int discarded_maxerr = 0; + int reads_accepted = 0; + int reads_with_indels_accepted = 0; ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker( REF_FILE @@ -84,6 +88,7 @@ public class IndelInspectorMain extends CommandLineProgram { if ( col == null ) return 1; col.setControlRun(CONTROL_RUN); + col.setIndelCountAcceptanceCutoff(COUNT_CUTOFF); if ( ! CONTROL_RUN ) { if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT"); @@ -97,8 +102,9 @@ public class IndelInspectorMain extends CommandLineProgram { } String cur_contig = null; - int counter = 0; - + long t=0,tc=System.currentTimeMillis(); // time + boolean done_printing = false; + for ( SAMRecord r : samReader ) { if ( r.getReadUnmappedFlag() ) continue; @@ -108,19 +114,25 @@ public class IndelInspectorMain extends CommandLineProgram { // if contig is specified and we are past that contig, we are done: if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break; if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) { + if ( location != null ) System.out.println("Time spent to scroll input bam file to the specified chromosome: "+ ((System.currentTimeMillis()-tc)/1000) + " seconds."); + tc = System.currentTimeMillis(); contig_seq = reference.get(r.getReferenceIndex()); + t = System.currentTimeMillis(); String refstr = new String(contig_seq.getBases()); if (!CONTROL_RUN) pileBuilder.setReferenceSequence(refstr); - System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString()); + System.out.println("Contig "+cur_contig+" (index="+r.getReferenceIndex()+") loaded in "+ ((t-tc)/1000) +" seconds; length="+contig_seq.getBases().length+" tst="+contig_seq.toString()); } } // if contig is specified and we did not reach it yet, skip the records until we reach that contig: if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue; - if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue; + if ( location != null && ! done_printing ) { + System.out.println("Time spent to scroll input bam file to the specified location on the chromosome: " + ((System.currentTimeMillis()-t)/1000)+" seconds."); + done_printing = true; + } // if stop position is specified and we are past that, stop reading: if ( location != null && r.getAlignmentStart() > location.getStop() ) break; @@ -130,12 +142,14 @@ public class IndelInspectorMain extends CommandLineProgram { // let's just skip the reads that contain those other elements (clipped reads?) Cigar c = r.getCigar(); boolean cigar_acceptable = true; + boolean has_indel = false; + for ( int z = 0 ; z < c.numCigarElements() ; z++ ) { CigarElement ce = c.getCigarElement(z); switch ( ce.getOperator() ) { - case M: + case M: break; case I: - case D: break; + case D: has_indel = true; break; default: cigar_acceptable = false; } @@ -168,7 +182,13 @@ public class IndelInspectorMain extends CommandLineProgram { else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq); else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq); else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq); - if ( err > MAX_ERRS.intValue() ) continue; + if ( err > MAX_ERRS.intValue() ) { + discarded_maxerr++; + continue; + } + + reads_accepted++; + if ( has_indel ) reads_with_indels_accepted++; // counter++; // if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); col.receive(r); @@ -182,6 +202,8 @@ public class IndelInspectorMain extends CommandLineProgram { System.out.println("done."); System.out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count); System.out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count); + System.out.println("Discarded reads with error counts above "+MAX_ERRS+ ": "+ discarded_maxerr); + System.out.println("Reads passed to realigner: "+ reads_accepted+" total; "+reads_with_indels_accepted+" with indel(s)"); System.out.println(); col.printLengthHistograms(); samReader.close(); diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java index bad34be3a..4d720af66 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java @@ -67,6 +67,7 @@ public class IndelRecordPileCollector implements RecordReceiver { private final int WAIT_STATE = 0; private final int ACTIVE_STATE = 1; + private int INDEL_COUNT_CUTOFF = 2; private boolean avoiding_region; // some regions are too funky (contain very long indel trains)- // we will determine their span and report them, @@ -97,7 +98,8 @@ public class IndelRecordPileCollector implements RecordReceiver { //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop; } - + public void setIndelCountAcceptanceCutoff(int n) { INDEL_COUNT_CUTOFF = n; } + public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException { mRecordPile = new LinkedList(); mAllIndels = new TreeSet >( @@ -259,7 +261,8 @@ public class IndelRecordPileCollector implements RecordReceiver { private void emit() { if ( mState == WAIT_STATE || avoiding_region ) { - if ( avoiding_region ) { + // System.out.println("Emitting uninteresting pile"); + if ( avoiding_region ) { long start = mAllIndels.first().getObject().getStart(); long stop = mAllIndels.last().getObject().getStop(); System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop + @@ -284,7 +287,7 @@ public class IndelRecordPileCollector implements RecordReceiver { // can be more than one pile in what we have stored. Also, we can still have gapless reads // at the ends of the piles that do not really overlap with indel sites. - + // System.out.println("Emitting pile with indels ("+mRecordPile.size()+" reads, "+mAllIndels.size()+" indels)"); if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels"); HistogramAsNeeded(mAllIndels); @@ -468,14 +471,15 @@ public class IndelRecordPileCollector implements RecordReceiver { } } - /** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable + /** Returns true if an attempt should be made to clean alignments around the + * specified indel train; currently, indel run is acceptable * if it contains at least one indel onbserved more than once. * @param indels list of indels with counts to check for being acceptable * @return true if the indel run has to be printed */ private boolean shouldAcceptForOutput(List> indels) { for ( CountedObject o : indels ) { - if ( o.getCount() >= 2 ) return true; + if ( o.getCount() >= INDEL_COUNT_CUTOFF ) return true; } return false; }