added COUNT_CUTOFF arg: it is nor possible to tell the code to try to realign all read piles over trains of nearby indels with at least one indel observed in COUNT_CUTOFF or more different alignments (set the arg to 1 to realign around all indels); also, some diagnostic printouts added to the output (time spent on loading the reference, time spent on scrolling through the input bam file, counts of discarded reads)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@611 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-05-06 21:59:33 +00:00
parent 1fe8155111
commit 072808858e
2 changed files with 38 additions and 12 deletions

View File

@ -31,6 +31,7 @@ public class IndelInspectorMain extends CommandLineProgram {
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
@Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
@Option(doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)", optional=true) public Integer MAX_READ_LENGTH;
@Option(doc="Realignment will be attempted around trains of indels with at least of them observed COUNT_CUTOFF times or more",optional=true) public Integer COUNT_CUTOFF;
/** Required main method implementation. */
public static void main(final String[] argv) {
@ -41,6 +42,9 @@ public class IndelInspectorMain extends CommandLineProgram {
int discarded_cigar_count = 0;
int discarded_long_read_count = 0;
int discarded_maxerr = 0;
int reads_accepted = 0;
int reads_with_indels_accepted = 0;
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
REF_FILE
@ -84,6 +88,7 @@ public class IndelInspectorMain extends CommandLineProgram {
if ( col == null ) return 1;
col.setControlRun(CONTROL_RUN);
col.setIndelCountAcceptanceCutoff(COUNT_CUTOFF);
if ( ! CONTROL_RUN ) {
if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT");
@ -97,8 +102,9 @@ public class IndelInspectorMain extends CommandLineProgram {
}
String cur_contig = null;
int counter = 0;
long t=0,tc=System.currentTimeMillis(); // time
boolean done_printing = false;
for ( SAMRecord r : samReader ) {
if ( r.getReadUnmappedFlag() ) continue;
@ -108,19 +114,25 @@ public class IndelInspectorMain extends CommandLineProgram {
// if contig is specified and we are past that contig, we are done:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
if ( location != null ) System.out.println("Time spent to scroll input bam file to the specified chromosome: "+ ((System.currentTimeMillis()-tc)/1000) + " seconds.");
tc = System.currentTimeMillis();
contig_seq = reference.get(r.getReferenceIndex());
t = System.currentTimeMillis();
String refstr = new String(contig_seq.getBases());
if (!CONTROL_RUN) pileBuilder.setReferenceSequence(refstr);
System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
System.out.println("Contig "+cur_contig+" (index="+r.getReferenceIndex()+") loaded in "+ ((t-tc)/1000) +" seconds; length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
}
}
// if contig is specified and we did not reach it yet, skip the records until we reach that contig:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;
if ( location != null && ! done_printing ) {
System.out.println("Time spent to scroll input bam file to the specified location on the chromosome: " + ((System.currentTimeMillis()-t)/1000)+" seconds.");
done_printing = true;
}
// if stop position is specified and we are past that, stop reading:
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
@ -130,12 +142,14 @@ public class IndelInspectorMain extends CommandLineProgram {
// let's just skip the reads that contain those other elements (clipped reads?)
Cigar c = r.getCigar();
boolean cigar_acceptable = true;
boolean has_indel = false;
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
CigarElement ce = c.getCigarElement(z);
switch ( ce.getOperator() ) {
case M:
case M: break;
case I:
case D: break;
case D: has_indel = true; break;
default:
cigar_acceptable = false;
}
@ -168,7 +182,13 @@ public class IndelInspectorMain extends CommandLineProgram {
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq);
else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq);
if ( err > MAX_ERRS.intValue() ) continue;
if ( err > MAX_ERRS.intValue() ) {
discarded_maxerr++;
continue;
}
reads_accepted++;
if ( has_indel ) reads_with_indels_accepted++;
// counter++;
// if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
col.receive(r);
@ -182,6 +202,8 @@ public class IndelInspectorMain extends CommandLineProgram {
System.out.println("done.");
System.out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count);
System.out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count);
System.out.println("Discarded reads with error counts above "+MAX_ERRS+ ": "+ discarded_maxerr);
System.out.println("Reads passed to realigner: "+ reads_accepted+" total; "+reads_with_indels_accepted+" with indel(s)");
System.out.println();
col.printLengthHistograms();
samReader.close();

View File

@ -67,6 +67,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
private final int WAIT_STATE = 0;
private final int ACTIVE_STATE = 1;
private int INDEL_COUNT_CUTOFF = 2;
private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
// we will determine their span and report them,
@ -97,7 +98,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
}
public void setIndelCountAcceptanceCutoff(int n) { INDEL_COUNT_CUTOFF = n; }
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
mRecordPile = new LinkedList<SAMRecord>();
mAllIndels = new TreeSet<CountedObject<Indel> >(
@ -259,7 +261,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
private void emit() {
if ( mState == WAIT_STATE || avoiding_region ) {
if ( avoiding_region ) {
// System.out.println("Emitting uninteresting pile");
if ( avoiding_region ) {
long start = mAllIndels.first().getObject().getStart();
long stop = mAllIndels.last().getObject().getStop();
System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
@ -284,7 +287,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
// can be more than one pile in what we have stored. Also, we can still have gapless reads
// at the ends of the piles that do not really overlap with indel sites.
// System.out.println("Emitting pile with indels ("+mRecordPile.size()+" reads, "+mAllIndels.size()+" indels)");
if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
HistogramAsNeeded(mAllIndels);
@ -468,14 +471,15 @@ public class IndelRecordPileCollector implements RecordReceiver {
}
}
/** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
/** Returns true if an attempt should be made to clean alignments around the
* specified indel train; currently, indel run is acceptable
* if it contains at least one indel onbserved more than once.
* @param indels list of indels with counts to check for being acceptable
* @return true if the indel run has to be printed
*/
private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
for ( CountedObject<Indel> o : indels ) {
if ( o.getCount() >= 2 ) return true;
if ( o.getCount() >= INDEL_COUNT_CUTOFF ) return true;
}
return false;
}