From f031d882c68a014d90cadb71ed1b23d5d72b4aa4 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 3 Apr 2009 13:23:18 +0000 Subject: [PATCH] ByReference traversals! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@281 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 29 +++-- .../sting/gatk/traversals/TraverseByLoci.java | 3 +- .../gatk/traversals/TraverseByReference.java | 110 ++++++++++++++++++ .../sting/gatk/walkers/LocusWalker.java | 7 ++ .../sting/gatk/walkers/RefWalker.java | 18 +++ .../gatk/walkers/AlleleFrequencyWalker.java | 2 + .../gatk/walkers/CoverageBySample.java | 2 + .../gatk/walkers/SingleSampleGenotyper.java | 2 + 8 files changed, 162 insertions(+), 11 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java create mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 8d42f12b3..b57716781 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -16,10 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.traversals.TraversalEngine; -import org.broadinstitute.sting.gatk.traversals.TraverseByReads; -import org.broadinstitute.sting.gatk.traversals.TraverseByLoci; -import org.broadinstitute.sting.gatk.traversals.TraverseByLociByReference; +import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.utils.FastaSequenceFile2; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -35,7 +32,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { public static GenomeAnalysisTK Instance = null; // parameters and their defaults - public File INPUT_FILE; + public File INPUT_FILE = null; public String MAX_READS_ARG = "-1"; public String STRICTNESS_ARG = "strict"; public File REF_FILE_ARG = null; @@ -98,7 +95,8 @@ public class GenomeAnalysisTK extends CommandLineProgram { * Flags don't take an argument, the associated Boolean gets set to true if the flag appears on the command line. */ protected void setupArgs() { - m_parser.addRequiredArg("input_file", "I", "SAM or BAM file", "INPUT_FILE"); + m_parser.addOptionalArg("input_file", "I", "SAM or BAM file", "INPUT_FILE"); + //m_parser.addRequiredArg("input_file", "I", "SAM or BAM file", "INPUT_FILE"); m_parser.addOptionalArg("maximum_reads", "M", "Maximum number of reads to process before exiting", "MAX_READS_ARG"); m_parser.addOptionalArg("validation_strictness", "S", "How strict should we be with validation (lenient|silent|strict)", "STRICTNESS_ARG"); m_parser.addOptionalArg("reference_sequence", "R", "Reference sequence file", "REF_FILE_ARG"); @@ -196,10 +194,21 @@ public class GenomeAnalysisTK extends CommandLineProgram { // Try to get the walker specified try { LocusWalker walker = (LocusWalker) my_walker; - if ( WALK_ALL_LOCI ) - this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods); - else - this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); + + if ( INPUT_FILE == null ) { + if ( walker.requiresReads() ) + Utils.scareUser(String.format("Analysis %s requires reads, but none were given", Analysis_Name)); + this.engine = new TraverseByReference(null, REF_FILE_ARG, rods); + } else { + if ( walker.cannotHandleReads() ) + Utils.scareUser(String.format("Analysis %s doesn't support SAM/BAM reads, but a read file %s was provided", Analysis_Name, INPUT_FILE)); + + + if ( WALK_ALL_LOCI ) + this.engine = new TraverseByLociByReference(INPUT_FILE, REF_FILE_ARG, rods); + else + this.engine = new TraverseByLoci(INPUT_FILE, REF_FILE_ARG, rods); + } } catch (java.lang.ClassCastException e) { // I guess we're a read walker LOL diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java index 5b248ee9e..43702f45b 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java @@ -38,7 +38,8 @@ public class TraverseByLoci extends TraversalEngine { public T traverse(Walker walker, ArrayList locations) { if ( walker instanceof LocusWalker ) { LocusWalker locusWalker = (LocusWalker)walker; - return (T)this.traverseByLoci(locusWalker, locations); + T sum = traverseByLoci(locusWalker, locations); + return sum; } else { throw new IllegalArgumentException("Walker isn't a loci walker!"); } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java new file mode 100644 index 000000000..db9484fa6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReference.java @@ -0,0 +1,110 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; +import java.util.ArrayList; +import java.io.File; + +import net.sf.samtools.SAMRecord; + +/** + * A simple, short-term solution to iterating over all reference positions over a series of + * genomic locations. Simply overloads the superclass traverse function to go over the entire + * interval's reference positions. + */ +public class TraverseByReference extends TraverseByLoci { + + public TraverseByReference(File reads, File ref, List> rods) { + super(reads, ref, rods); + + logger.debug("Creating TraverseByReference"); + + if ( reads != null ) + throw new IllegalArgumentException("By reference traversal doesn't support reads, but file was given " + reads); + if ( ref == null ) + throw new IllegalArgumentException("By reference traversal requires reference file but none was given"); + } + + /** + * Traverse by loci -- the key driver of linearly ordered traversal of loci. Provides reads, RODs, and + * the reference base for each locus in the reference to the LocusWalker walker. Supports all of the + * interaction contract implied by the locus walker + * + * @param walker A locus walker object + * @param MapType -- the result of calling map() on walker + * @param ReduceType -- the result of calling reduce() on the walker + * @return 0 on success + */ + protected T traverseByLoci(LocusWalker walker, ArrayList locations) { + logger.debug("Entering traverseByReference"); + + // initialize the walker object + walker.initialize(); + + T sum = walker.reduceInit(); + if ( ! locations.isEmpty() ) { + logger.debug("Doing interval-based traversal"); + + // we are doing interval-based traversals + for ( GenomeLoc interval : locations ) { + logger.debug(String.format("Processing locus %s", interval.toString())); + sum = carryWalkerOverReference(walker, sum, interval); + } + } + else { + // We aren't locus oriented + logger.debug("Doing non-interval-based traversal"); + sum = carryWalkerOverReference(walker, sum, null); + } + + printOnTraversalDone("reference", sum); + walker.onTraversalDone(sum); + return sum; + } + + protected T carryWalkerOverReference( LocusWalker walker, + T sum, + GenomeLoc interval ) { + logger.debug(String.format("traverseByReference.carryWalkerOverReference Genomic interval is %s", interval)); + + boolean done = false; + + List NO_READS = new ArrayList(); + List NO_OFFSETS = new ArrayList(); + + ReferenceIterator refSite = null; + if ( interval != null ) + refSite = refIter.seekForward(interval); // jump to the first reference site + else + refSite = refIter.next(); + + // We keep processing while the next reference location is within the interval + while ( (interval == null || interval.containsP(refSite.getLocation())) && ! done ) { + this.nRecords++; + GenomeLoc current = refSite.getLocation(); + + // Iterate forward to get all reference ordered data covering this locus + final List rodData = getReferenceOrderedDataAtLocus(rodIters, current); + + LocusContext locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads + locus.setReferenceContig(refSite.getCurrentContig()); + sum = walkAtLocus( walker, sum, locus, refSite, rodData ); + + if (this.maxReads > 0 && this.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); + done = true; + } + + //printProgress("ref", locus.getLocation()); + refSite = refIter.next(); // update our location + } + + return sum; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index 67abd1b0a..895b5c476 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -18,6 +18,13 @@ public abstract class LocusWalker extends Walker rodData, char ref, LocusContext context); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java new file mode 100644 index 000000000..f76226df2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class RefWalker extends LocusWalker { + public boolean requiresReads() { return false; } + public boolean cannotHandleReads() { return true; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java index 0f70e04db..8a9a447bc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java @@ -29,6 +29,8 @@ public class AlleleFrequencyWalker extends LocusWalker rodData, char ref, LocusContext context) { // Convert context data into bases and 4-base quals diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java index 87a6c88f3..5f03e80ab 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java @@ -19,6 +19,8 @@ public class CoverageBySample extends LocusWalker { List sample_names = null; + public boolean requiresReads() { return true; } + public void initialize() { GenomeAnalysisTK toolkit = this.getToolkit(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java index 259c41d9c..171b94c7d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java @@ -17,6 +17,8 @@ public class SingleSampleGenotyper extends LocusWalker { return true; // We are keeping all the reads } + public boolean requiresReads() { return true; } + protected class GenotypeLikelihoods { public double[] likelihoods;