ByReference traversals!

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@281 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-03 13:23:18 +00:00
parent e3ac0cb500
commit f031d882c6
8 changed files with 162 additions and 11 deletions

View File

@ -16,10 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseByReads;
import org.broadinstitute.sting.gatk.traversals.TraverseByLoci;
import org.broadinstitute.sting.gatk.traversals.TraverseByLociByReference;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
@ -35,7 +32,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
public static GenomeAnalysisTK Instance = null;
// parameters and their defaults
public File INPUT_FILE;
public File INPUT_FILE = null;
public String MAX_READS_ARG = "-1";
public String STRICTNESS_ARG = "strict";
public File REF_FILE_ARG = null;
@ -98,7 +95,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
* Flags don't take an argument, the associated Boolean gets set to true if the flag appears on the command line.
*/
protected void setupArgs() {
m_parser.addRequiredArg("input_file", "I", "SAM or BAM file", "INPUT_FILE");
m_parser.addOptionalArg("input_file", "I", "SAM or BAM file", "INPUT_FILE");
//m_parser.addRequiredArg("input_file", "I", "SAM or BAM file", "INPUT_FILE");
m_parser.addOptionalArg("maximum_reads", "M", "Maximum number of reads to process before exiting", "MAX_READS_ARG");
m_parser.addOptionalArg("validation_strictness", "S", "How strict should we be with validation (lenient|silent|strict)", "STRICTNESS_ARG");
m_parser.addOptionalArg("reference_sequence", "R", "Reference sequence file", "REF_FILE_ARG");
@ -196,10 +194,21 @@ public class GenomeAnalysisTK extends CommandLineProgram {
// Try to get the walker specified
try {
LocusWalker<?, ?> walker = (LocusWalker<?, ?>) my_walker;
if ( WALK_ALL_LOCI )
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
else
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
if ( INPUT_FILE == null ) {
if ( walker.requiresReads() )
Utils.scareUser(String.format("Analysis %s requires reads, but none were given", Analysis_Name));
this.engine = new TraverseByReference(null, REF_FILE_ARG, rods);
} else {
if ( walker.cannotHandleReads() )
Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE));
if ( WALK_ALL_LOCI )
this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods);
else
this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods);
}
}
catch (java.lang.ClassCastException e) {
// I guess we're a read walker LOL

View File

@ -38,7 +38,8 @@ public class TraverseByLoci extends TraversalEngine {
public <M,T> T traverse(Walker<M,T> walker, ArrayList<GenomeLoc> locations) {
if ( walker instanceof LocusWalker ) {
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
return (T)this.traverseByLoci(locusWalker, locations);
T sum = traverseByLoci(locusWalker, locations);
return sum;
} else {
throw new IllegalArgumentException("Walker isn't a loci walker!");
}

View File

@ -0,0 +1,110 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
import net.sf.samtools.SAMRecord;
/**
* A simple, short-term solution to iterating over all reference positions over a series of
* genomic locations. Simply overloads the superclass traverse function to go over the entire
* interval's reference positions.
*/
public class TraverseByReference extends TraverseByLoci {
public TraverseByReference(File reads, File ref, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
super(reads, ref, rods);
logger.debug("Creating TraverseByReference");
if ( reads != null )
throw new IllegalArgumentException("By reference traversal doesn't support reads, but file was given " + reads);
if ( ref == null )
throw new IllegalArgumentException("By reference traversal requires reference file but none was given");
}
/**
* Traverse by loci -- the key driver of linearly ordered traversal of loci. Provides reads, RODs, and
* the reference base for each locus in the reference to the LocusWalker walker. Supports all of the
* interaction contract implied by the locus walker
*
* @param walker A locus walker object
* @param <M> MapType -- the result of calling map() on walker
* @param <T> ReduceType -- the result of calling reduce() on the walker
* @return 0 on success
*/
protected <M, T> T traverseByLoci(LocusWalker<M, T> walker, ArrayList<GenomeLoc> locations) {
logger.debug("Entering traverseByReference");
// initialize the walker object
walker.initialize();
T sum = walker.reduceInit();
if ( ! locations.isEmpty() ) {
logger.debug("Doing interval-based traversal");
// we are doing interval-based traversals
for ( GenomeLoc interval : locations ) {
logger.debug(String.format("Processing locus %s", interval.toString()));
sum = carryWalkerOverReference(walker, sum, interval);
}
}
else {
// We aren't locus oriented
logger.debug("Doing non-interval-based traversal");
sum = carryWalkerOverReference(walker, sum, null);
}
printOnTraversalDone("reference", sum);
walker.onTraversalDone(sum);
return sum;
}
protected <M, T> T carryWalkerOverReference( LocusWalker<M, T> walker,
T sum,
GenomeLoc interval ) {
logger.debug(String.format("traverseByReference.carryWalkerOverReference Genomic interval is %s", interval));
boolean done = false;
List<SAMRecord> NO_READS = new ArrayList<SAMRecord>();
List<Integer> NO_OFFSETS = new ArrayList<Integer>();
ReferenceIterator refSite = null;
if ( interval != null )
refSite = refIter.seekForward(interval); // jump to the first reference site
else
refSite = refIter.next();
// We keep processing while the next reference location is within the interval
while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) {
this.nRecords++;
GenomeLoc current = refSite.getLocation();
// Iterate forward to get all reference ordered data covering this locus
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, current);
LocusContext locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
locus.setReferenceContig(refSite.getCurrentContig());
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords));
done = true;
}
//printProgress("ref", locus.getLocation());
refSite = refIter.next(); // update our location
}
return sum;
}
}

View File

@ -18,6 +18,13 @@ public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, R
return true; // We are keeping all the reads
}
/**
* These two functions state whether we're don't make any sense without reads (requiresRead())
* or whether we can't take any reads at all (cannotHandleRead())
*/
public boolean requiresReads() { return false; }
public boolean cannotHandleReads() { return false; }
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context);

View File

@ -0,0 +1,18 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.LocusContext;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 2:52:28 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class RefWalker<MapType, ReduceType> extends LocusWalker<MapType, ReduceType> {
public boolean requiresReads() { return false; }
public boolean cannotHandleReads() { return true; }
}

View File

@ -29,6 +29,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
Random random;
PrintStream output;
public boolean requiresReads() { return true; }
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context)
{
// Convert context data into bases and 4-base quals

View File

@ -19,6 +19,8 @@ public class CoverageBySample extends LocusWalker<String, String>
{
List<String> sample_names = null;
public boolean requiresReads() { return true; }
public void initialize()
{
GenomeAnalysisTK toolkit = this.getToolkit();

View File

@ -17,6 +17,8 @@ public class SingleSampleGenotyper extends LocusWalker<Integer, Integer> {
return true; // We are keeping all the reads
}
public boolean requiresReads() { return true; }
protected class GenotypeLikelihoods
{
public double[] likelihoods;