Now iterate over a large set of tiny intervals efficiently.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@130 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
df2a7039cb
commit
149ac3d96c
|
|
@ -1,17 +1,21 @@
|
||||||
package org.broadinstitute.sting.gatk;
|
package org.broadinstitute.sting.gatk;
|
||||||
|
|
||||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||||
import edu.mit.broad.picard.cmdline.Usage;
|
import edu.mit.broad.picard.cmdline.Usage;
|
||||||
import edu.mit.broad.picard.cmdline.Option;
|
import edu.mit.broad.picard.cmdline.Option;
|
||||||
|
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.HashMap;
|
import java.util.*;
|
||||||
|
|
||||||
public class GenomeAnalysisTK extends CommandLineProgram {
|
public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
// Usage and parameters
|
// Usage and parameters
|
||||||
|
|
@ -70,6 +74,17 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
|
|
||||||
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
|
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
|
||||||
|
|
||||||
|
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||||
|
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||||
|
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary().getSequences();
|
||||||
|
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||||
|
int i = 0;
|
||||||
|
for ( SAMSequenceRecord contig : refContigs ) {
|
||||||
|
refContigOrdering.put(contig.getSequenceName(), i);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
GenomeLoc.setContigOrdering(refContigOrdering);
|
||||||
|
|
||||||
ValidationStringency strictness;
|
ValidationStringency strictness;
|
||||||
if ( STRICTNESS_ARG == null ) {
|
if ( STRICTNESS_ARG == null ) {
|
||||||
strictness = ValidationStringency.STRICT;
|
strictness = ValidationStringency.STRICT;
|
||||||
|
|
|
||||||
|
|
@ -203,10 +203,7 @@ public class TraversalEngine {
|
||||||
GenomeLoc[] locs = (GenomeLoc[])result.toArray(new GenomeLoc[0]);
|
GenomeLoc[] locs = (GenomeLoc[])result.toArray(new GenomeLoc[0]);
|
||||||
|
|
||||||
Arrays.sort(locs);
|
Arrays.sort(locs);
|
||||||
for ( GenomeLoc l : locs )
|
System.out.printf(" Locations are: %s%n", Utils.join("\n", Functions.map( Operators.toString, Arrays.asList(locs) ) ) );
|
||||||
System.out.printf(" -> %s%n", l);
|
|
||||||
|
|
||||||
System.out.printf(" Locations are: %s%n", Utils.join(" ", Functions.map( Operators.toString, Arrays.asList(locs) ) ) );
|
|
||||||
|
|
||||||
return locs;
|
return locs;
|
||||||
}
|
}
|
||||||
|
|
@ -220,9 +217,13 @@ public class TraversalEngine {
|
||||||
*/
|
*/
|
||||||
public boolean inLocations( GenomeLoc curr ) {
|
public boolean inLocations( GenomeLoc curr ) {
|
||||||
if ( this.locs == null )
|
if ( this.locs == null )
|
||||||
|
{
|
||||||
return true;
|
return true;
|
||||||
else {
|
}
|
||||||
for ( GenomeLoc loc : this.locs ) {
|
else
|
||||||
|
{
|
||||||
|
for ( GenomeLoc loc : this.locs )
|
||||||
|
{
|
||||||
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
//System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr));
|
||||||
if ( loc.overlapsP(curr) )
|
if ( loc.overlapsP(curr) )
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -537,21 +538,59 @@ public class TraversalEngine {
|
||||||
boolean done = false;
|
boolean done = false;
|
||||||
GenomeLoc prevLoc = null;
|
GenomeLoc prevLoc = null;
|
||||||
|
|
||||||
while ( iter.hasNext() && ! done ) {
|
int current_interval_index = -1;
|
||||||
|
int current_interval_offset = -1;
|
||||||
|
|
||||||
|
while ( iter.hasNext() && ! done )
|
||||||
|
{
|
||||||
this.nRecords++;
|
this.nRecords++;
|
||||||
|
|
||||||
// actually get the read and hand it to the walker
|
// actually get the read and hand it to the walker
|
||||||
final LocusContext locus = iter.next();
|
LocusContext locus = iter.next();
|
||||||
|
|
||||||
// Poor man's version of index LOL
|
// Poor man's version of index LOL
|
||||||
// HALP! I HAZ 10K INTERVALS 2 INDX
|
// HALP! I HAZ 10K INTERVALS 2 INDX
|
||||||
GenomeLoc curLoc = locus.getLocation();
|
if ( ((this.locs != null) && (this.locs.length != 0)) && ((current_interval_index == -1) || (!locus.getLocation().overlapsP(this.locs[current_interval_index]))))
|
||||||
if ( inLocations(curLoc) ) {
|
{
|
||||||
if ( prevLoc != null && curLoc.compareContigs(prevLoc) != 0 )
|
// Advance to the next locus.
|
||||||
System.out.printf("Traversing to next chromosome...%n");
|
current_interval_index += 1;
|
||||||
|
current_interval_offset = 0;
|
||||||
|
|
||||||
|
if (this.locs.length <= current_interval_index) { done = true; break; }
|
||||||
|
|
||||||
|
//System.out.format("DEBUG Seeking from %s to %s\n", locus.getLocation().toString(), this.locs[current_interval_index].toString());
|
||||||
|
|
||||||
|
while ((!locus.getLocation().overlapsP(this.locs[current_interval_index])) && (iter.hasNext()))
|
||||||
|
{
|
||||||
|
switch (locus.getLocation().compareTo(this.locs[current_interval_index]))
|
||||||
|
{
|
||||||
|
case -1 :
|
||||||
|
locus = iter.next();
|
||||||
|
//System.out.format("DEBUG at %s\n", locus.getLocation().toString());
|
||||||
|
break;
|
||||||
|
case 0 :
|
||||||
|
break;
|
||||||
|
case 1 :
|
||||||
|
current_interval_index += 1;
|
||||||
|
current_interval_offset = 0;
|
||||||
|
//System.out.format("DEBUG Giving up on old locus, Seeking from %s to %s\n", locus.getLocation().toString(), this.locs[current_interval_index].toString());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.format("DEBUG Got there.\n");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
current_interval_offset += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||||
|
|
||||||
// Jump forward in the reference to this locus location
|
// Jump forward in the reference to this locus location
|
||||||
final ReferenceIterator refSite = refIter.seekForward(locus.getLocation());
|
final ReferenceIterator refSite;
|
||||||
|
refSite = refIter.seekForward(locus.getLocation());
|
||||||
final char refBase = refSite.getBaseAsChar();
|
final char refBase = refSite.getBaseAsChar();
|
||||||
locus.setReferenceContig(refSite.getCurrentContig());
|
locus.setReferenceContig(refSite.getCurrentContig());
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue