added COUNT_CUTOFF arg: it is nor possible to tell the code to try to realign all read piles over trains of nearby indels with at least one indel observed in COUNT_CUTOFF or more different alignments (set the arg to 1 to realign around all indels); also, some diagnostic printouts added to the output (time spent on loading the reference, time spent on scrolling through the input bam file, counts of discarded reads)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@611 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1fe8155111
commit
072808858e
|
|
@ -31,6 +31,7 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
|
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
|
||||||
@Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
|
@Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
|
||||||
@Option(doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)", optional=true) public Integer MAX_READ_LENGTH;
|
@Option(doc="Ignore reads that are longer than the specified cutoff (not a good way to do things but might be necessary because of performance issues)", optional=true) public Integer MAX_READ_LENGTH;
|
||||||
|
@Option(doc="Realignment will be attempted around trains of indels with at least of them observed COUNT_CUTOFF times or more",optional=true) public Integer COUNT_CUTOFF;
|
||||||
|
|
||||||
/** Required main method implementation. */
|
/** Required main method implementation. */
|
||||||
public static void main(final String[] argv) {
|
public static void main(final String[] argv) {
|
||||||
|
|
@ -41,6 +42,9 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
|
|
||||||
int discarded_cigar_count = 0;
|
int discarded_cigar_count = 0;
|
||||||
int discarded_long_read_count = 0;
|
int discarded_long_read_count = 0;
|
||||||
|
int discarded_maxerr = 0;
|
||||||
|
int reads_accepted = 0;
|
||||||
|
int reads_with_indels_accepted = 0;
|
||||||
|
|
||||||
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
|
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
|
||||||
REF_FILE
|
REF_FILE
|
||||||
|
|
@ -84,6 +88,7 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
if ( col == null ) return 1;
|
if ( col == null ) return 1;
|
||||||
|
|
||||||
col.setControlRun(CONTROL_RUN);
|
col.setControlRun(CONTROL_RUN);
|
||||||
|
col.setIndelCountAcceptanceCutoff(COUNT_CUTOFF);
|
||||||
|
|
||||||
if ( ! CONTROL_RUN ) {
|
if ( ! CONTROL_RUN ) {
|
||||||
if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT");
|
if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT");
|
||||||
|
|
@ -97,8 +102,9 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
}
|
}
|
||||||
|
|
||||||
String cur_contig = null;
|
String cur_contig = null;
|
||||||
int counter = 0;
|
long t=0,tc=System.currentTimeMillis(); // time
|
||||||
|
boolean done_printing = false;
|
||||||
|
|
||||||
for ( SAMRecord r : samReader ) {
|
for ( SAMRecord r : samReader ) {
|
||||||
|
|
||||||
if ( r.getReadUnmappedFlag() ) continue;
|
if ( r.getReadUnmappedFlag() ) continue;
|
||||||
|
|
@ -108,19 +114,25 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
// if contig is specified and we are past that contig, we are done:
|
// if contig is specified and we are past that contig, we are done:
|
||||||
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
|
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
|
||||||
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
|
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
|
||||||
|
if ( location != null ) System.out.println("Time spent to scroll input bam file to the specified chromosome: "+ ((System.currentTimeMillis()-tc)/1000) + " seconds.");
|
||||||
|
tc = System.currentTimeMillis();
|
||||||
contig_seq = reference.get(r.getReferenceIndex());
|
contig_seq = reference.get(r.getReferenceIndex());
|
||||||
|
t = System.currentTimeMillis();
|
||||||
String refstr = new String(contig_seq.getBases());
|
String refstr = new String(contig_seq.getBases());
|
||||||
if (!CONTROL_RUN) pileBuilder.setReferenceSequence(refstr);
|
if (!CONTROL_RUN) pileBuilder.setReferenceSequence(refstr);
|
||||||
System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
|
System.out.println("Contig "+cur_contig+" (index="+r.getReferenceIndex()+") loaded in "+ ((t-tc)/1000) +" seconds; length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if contig is specified and we did not reach it yet, skip the records until we reach that contig:
|
// if contig is specified and we did not reach it yet, skip the records until we reach that contig:
|
||||||
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
|
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
|
||||||
|
|
||||||
|
|
||||||
if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;
|
if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;
|
||||||
|
|
||||||
|
if ( location != null && ! done_printing ) {
|
||||||
|
System.out.println("Time spent to scroll input bam file to the specified location on the chromosome: " + ((System.currentTimeMillis()-t)/1000)+" seconds.");
|
||||||
|
done_printing = true;
|
||||||
|
}
|
||||||
// if stop position is specified and we are past that, stop reading:
|
// if stop position is specified and we are past that, stop reading:
|
||||||
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
|
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
|
||||||
|
|
||||||
|
|
@ -130,12 +142,14 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
// let's just skip the reads that contain those other elements (clipped reads?)
|
// let's just skip the reads that contain those other elements (clipped reads?)
|
||||||
Cigar c = r.getCigar();
|
Cigar c = r.getCigar();
|
||||||
boolean cigar_acceptable = true;
|
boolean cigar_acceptable = true;
|
||||||
|
boolean has_indel = false;
|
||||||
|
|
||||||
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
|
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
|
||||||
CigarElement ce = c.getCigarElement(z);
|
CigarElement ce = c.getCigarElement(z);
|
||||||
switch ( ce.getOperator() ) {
|
switch ( ce.getOperator() ) {
|
||||||
case M:
|
case M: break;
|
||||||
case I:
|
case I:
|
||||||
case D: break;
|
case D: has_indel = true; break;
|
||||||
default:
|
default:
|
||||||
cigar_acceptable = false;
|
cigar_acceptable = false;
|
||||||
}
|
}
|
||||||
|
|
@ -168,7 +182,13 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq);
|
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq);
|
||||||
else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq);
|
else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq);
|
||||||
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq);
|
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq);
|
||||||
if ( err > MAX_ERRS.intValue() ) continue;
|
if ( err > MAX_ERRS.intValue() ) {
|
||||||
|
discarded_maxerr++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
reads_accepted++;
|
||||||
|
if ( has_indel ) reads_with_indels_accepted++;
|
||||||
// counter++;
|
// counter++;
|
||||||
// if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
|
// if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
|
||||||
col.receive(r);
|
col.receive(r);
|
||||||
|
|
@ -182,6 +202,8 @@ public class IndelInspectorMain extends CommandLineProgram {
|
||||||
System.out.println("done.");
|
System.out.println("done.");
|
||||||
System.out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count);
|
System.out.println("Discarded reads with non-M,I,D cigar elements: "+ discarded_cigar_count);
|
||||||
System.out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count);
|
System.out.println("Discarded long reads (above "+MAX_READ_LENGTH+" bp): "+ discarded_long_read_count);
|
||||||
|
System.out.println("Discarded reads with error counts above "+MAX_ERRS+ ": "+ discarded_maxerr);
|
||||||
|
System.out.println("Reads passed to realigner: "+ reads_accepted+" total; "+reads_with_indels_accepted+" with indel(s)");
|
||||||
System.out.println();
|
System.out.println();
|
||||||
col.printLengthHistograms();
|
col.printLengthHistograms();
|
||||||
samReader.close();
|
samReader.close();
|
||||||
|
|
|
||||||
|
|
@ -67,6 +67,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
|
|
||||||
private final int WAIT_STATE = 0;
|
private final int WAIT_STATE = 0;
|
||||||
private final int ACTIVE_STATE = 1;
|
private final int ACTIVE_STATE = 1;
|
||||||
|
private int INDEL_COUNT_CUTOFF = 2;
|
||||||
|
|
||||||
private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
|
private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
|
||||||
// we will determine their span and report them,
|
// we will determine their span and report them,
|
||||||
|
|
@ -97,7 +98,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
|
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setIndelCountAcceptanceCutoff(int n) { INDEL_COUNT_CUTOFF = n; }
|
||||||
|
|
||||||
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
|
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
|
||||||
mRecordPile = new LinkedList<SAMRecord>();
|
mRecordPile = new LinkedList<SAMRecord>();
|
||||||
mAllIndels = new TreeSet<CountedObject<Indel> >(
|
mAllIndels = new TreeSet<CountedObject<Indel> >(
|
||||||
|
|
@ -259,7 +261,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
private void emit() {
|
private void emit() {
|
||||||
|
|
||||||
if ( mState == WAIT_STATE || avoiding_region ) {
|
if ( mState == WAIT_STATE || avoiding_region ) {
|
||||||
if ( avoiding_region ) {
|
// System.out.println("Emitting uninteresting pile");
|
||||||
|
if ( avoiding_region ) {
|
||||||
long start = mAllIndels.first().getObject().getStart();
|
long start = mAllIndels.first().getObject().getStart();
|
||||||
long stop = mAllIndels.last().getObject().getStop();
|
long stop = mAllIndels.last().getObject().getStop();
|
||||||
System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
|
System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
|
||||||
|
|
@ -284,7 +287,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
// can be more than one pile in what we have stored. Also, we can still have gapless reads
|
// can be more than one pile in what we have stored. Also, we can still have gapless reads
|
||||||
// at the ends of the piles that do not really overlap with indel sites.
|
// at the ends of the piles that do not really overlap with indel sites.
|
||||||
|
|
||||||
|
// System.out.println("Emitting pile with indels ("+mRecordPile.size()+" reads, "+mAllIndels.size()+" indels)");
|
||||||
if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
|
if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
|
||||||
|
|
||||||
HistogramAsNeeded(mAllIndels);
|
HistogramAsNeeded(mAllIndels);
|
||||||
|
|
@ -468,14 +471,15 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
|
/** Returns true if an attempt should be made to clean alignments around the
|
||||||
|
* specified indel train; currently, indel run is acceptable
|
||||||
* if it contains at least one indel onbserved more than once.
|
* if it contains at least one indel onbserved more than once.
|
||||||
* @param indels list of indels with counts to check for being acceptable
|
* @param indels list of indels with counts to check for being acceptable
|
||||||
* @return true if the indel run has to be printed
|
* @return true if the indel run has to be printed
|
||||||
*/
|
*/
|
||||||
private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
|
private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
|
||||||
for ( CountedObject<Indel> o : indels ) {
|
for ( CountedObject<Indel> o : indels ) {
|
||||||
if ( o.getCount() >= 2 ) return true;
|
if ( o.getCount() >= INDEL_COUNT_CUTOFF ) return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue