added COUNT_CUTOFF arg: it is nor possible to tell the code to try to realign all read piles over trains of nearby indels with at least one indel observed in COUNT_CUTOFF or more different alignments (set the arg to 1 to realign around all indels); also, some diagnostic printouts added to the output (time spent on loading the reference, time spent on scrolling through the input bam file, counts of discarded reads)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@611 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1fe8155111
commit
072808858e
|
|
@ -31,6 +31,7 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
|
||||
@Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
|
||||
@Option(doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)", optional=true) public Integer MAX_READ_LENGTH;
|
||||
@Option(doc="Realignment will be attempted around trains of indels with at least of them observed COUNT_CUTOFF times or more",optional=true) public Integer COUNT_CUTOFF;
|
||||
|
||||
/** Required main method implementation. */
|
||||
public static void main(final String[] argv) {
|
||||
|
|
@ -41,6 +42,9 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
|
||||
int discarded_cigar_count = 0;
|
||||
int discarded_long_read_count = 0;
|
||||
int discarded_maxerr = 0;
|
||||
int reads_accepted = 0;
|
||||
int reads_with_indels_accepted = 0;
|
||||
|
||||
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
|
||||
REF_FILE
|
||||
|
|
@ -84,6 +88,7 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
if ( col == null ) return 1;
|
||||
|
||||
col.setControlRun(CONTROL_RUN);
|
||||
col.setIndelCountAcceptanceCutoff(COUNT_CUTOFF);
|
||||
|
||||
if ( ! CONTROL_RUN ) {
|
||||
if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT");
|
||||
|
|
@ -97,8 +102,9 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
}
|
||||
|
||||
String cur_contig = null;
|
||||
int counter = 0;
|
||||
|
||||
long t=0,tc=System.currentTimeMillis(); // time
|
||||
boolean done_printing = false;
|
||||
|
||||
for ( SAMRecord r : samReader ) {
|
||||
|
||||
if ( r.getReadUnmappedFlag() ) continue;
|
||||
|
|
@ -108,19 +114,25 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
// if contig is specified and we are past that contig, we are done:
|
||||
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
|
||||
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
|
||||
if ( location != null ) System.out.println("Time spent to scroll input bam file to the specified chromosome: "+ ((System.currentTimeMillis()-tc)/1000) + " seconds.");
|
||||
tc = System.currentTimeMillis();
|
||||
contig_seq = reference.get(r.getReferenceIndex());
|
||||
t = System.currentTimeMillis();
|
||||
String refstr = new String(contig_seq.getBases());
|
||||
if (!CONTROL_RUN) pileBuilder.setReferenceSequence(refstr);
|
||||
System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
|
||||
System.out.println("Contig "+cur_contig+" (index="+r.getReferenceIndex()+") loaded in "+ ((t-tc)/1000) +" seconds; length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// if contig is specified and we did not reach it yet, skip the records until we reach that contig:
|
||||
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
|
||||
|
||||
|
||||
if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;
|
||||
|
||||
if ( location != null && ! done_printing ) {
|
||||
System.out.println("Time spent to scroll input bam file to the specified location on the chromosome: " + ((System.currentTimeMillis()-t)/1000)+" seconds.");
|
||||
done_printing = true;
|
||||
}
|
||||
// if stop position is specified and we are past that, stop reading:
|
||||
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
|
||||
|
||||
|
|
@ -130,12 +142,14 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
// let's just skip the reads that contain those other elements (clipped reads?)
|
||||
Cigar c = r.getCigar();
|
||||
boolean cigar_acceptable = true;
|
||||
boolean has_indel = false;
|
||||
|
||||
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
|
||||
CigarElement ce = c.getCigarElement(z);
|
||||
switch ( ce.getOperator() ) {
|
||||
case M:
|
||||
case M: break;
|
||||
case I:
|
||||
case D: break;
|
||||
case D: has_indel = true; break;
|
||||
default:
|
||||
cigar_acceptable = false;
|
||||
}
|
||||
|
|
@ -168,7 +182,13 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq);
|
||||
else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq);
|
||||
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq);
|
||||
if ( err > MAX_ERRS.intValue() ) continue;
|
||||
if ( err > MAX_ERRS.intValue() ) {
|
||||
discarded_maxerr++;
|
||||
continue;
|
||||
}
|
||||
|
||||
reads_accepted++;
|
||||
if ( has_indel ) reads_with_indels_accepted++;
|
||||
// counter++;
|
||||
// if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
|
||||
col.receive(r);
|
||||
|
|
@ -182,6 +202,8 @@ public class IndelInspectorMain extends CommandLineProgram {
|
|||
System.out.println("done.");
|
||||
System.out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count);
|
||||
System.out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count);
|
||||
System.out.println("Discarded reads with error counts above "+MAX_ERRS+ ": "+ discarded_maxerr);
|
||||
System.out.println("Reads passed to realigner: "+ reads_accepted+" total; "+reads_with_indels_accepted+" with indel(s)");
|
||||
System.out.println();
|
||||
col.printLengthHistograms();
|
||||
samReader.close();
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
|||
|
||||
private final int WAIT_STATE = 0;
|
||||
private final int ACTIVE_STATE = 1;
|
||||
private int INDEL_COUNT_CUTOFF = 2;
|
||||
|
||||
private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
|
||||
// we will determine their span and report them,
|
||||
|
|
@ -97,7 +98,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
|||
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
|
||||
}
|
||||
|
||||
|
||||
public void setIndelCountAcceptanceCutoff(int n) { INDEL_COUNT_CUTOFF = n; }
|
||||
|
||||
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
|
||||
mRecordPile = new LinkedList<SAMRecord>();
|
||||
mAllIndels = new TreeSet<CountedObject<Indel> >(
|
||||
|
|
@ -259,7 +261,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
|||
private void emit() {
|
||||
|
||||
if ( mState == WAIT_STATE || avoiding_region ) {
|
||||
if ( avoiding_region ) {
|
||||
// System.out.println("Emitting uninteresting pile");
|
||||
if ( avoiding_region ) {
|
||||
long start = mAllIndels.first().getObject().getStart();
|
||||
long stop = mAllIndels.last().getObject().getStop();
|
||||
System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
|
||||
|
|
@ -284,7 +287,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
|||
// can be more than one pile in what we have stored. Also, we can still have gapless reads
|
||||
// at the ends of the piles that do not really overlap with indel sites.
|
||||
|
||||
|
||||
// System.out.println("Emitting pile with indels ("+mRecordPile.size()+" reads, "+mAllIndels.size()+" indels)");
|
||||
if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
|
||||
|
||||
HistogramAsNeeded(mAllIndels);
|
||||
|
|
@ -468,14 +471,15 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
|||
}
|
||||
}
|
||||
|
||||
/** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
|
||||
/** Returns true if an attempt should be made to clean alignments around the
|
||||
* specified indel train; currently, indel run is acceptable
|
||||
* if it contains at least one indel onbserved more than once.
|
||||
* @param indels list of indels with counts to check for being acceptable
|
||||
* @return true if the indel run has to be printed
|
||||
*/
|
||||
private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
|
||||
for ( CountedObject<Indel> o : indels ) {
|
||||
if ( o.getCount() >= 2 ) return true;
|
||||
if ( o.getCount() >= INDEL_COUNT_CUTOFF ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue