From 102b38c05526ef870ebbded989ea9ae10ee07ed4 Mon Sep 17 00:00:00 2001 From: hanna Date: Thu, 25 Jun 2009 18:20:56 +0000 Subject: [PATCH] Sketch of new version of TraverseByLocusWindow, and a flag to conditionally turn it on. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1097 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GATKArgumentCollection.java | 4 + .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../providers/LocusReferenceView.java | 13 +++ .../gatk/traversals/TraverseLocusWindows.java | 97 +++++++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index a192ef2aa..9bb5720e3 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -140,6 +140,10 @@ public class GATKArgumentCollection { @Argument(fullName = "disablethreading", shortName = "dt", doc = "Disable experimental threading support.", required = false) public Boolean disableThreading = false; + @Element(required = false) + @Argument(fullName = "use_new_tblw", shortName= "tblw", doc="Use the legacy traverse by locus window.", required = false) + public Boolean useNewTraverseByLocusWindow = false; + /** How many threads should be allocated to this analysis. */ @Element(required = false) @Argument(fullName = "numthreads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ac40b1236..4d57e9afe 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -116,7 +116,7 @@ public class GenomeAnalysisEngine { // if we're a read or a locus walker, we use the new system. Right now we have complicated // branching based on the input data, but this should disapear when all the traversals are switched over - if (my_walker instanceof LocusWalker || my_walker instanceof ReadWalker || my_walker instanceof DuplicateWalker) { + if (!(my_walker instanceof LocusWindowWalker) && !args.useNewTraverseByLocusWindow) { microScheduler = createMicroscheduler(my_walker, rods); } else { // we have an old style traversal, once we're done return legacyTraversal(my_walker, rods); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java index a8a7232ca..60a73b582 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java @@ -54,6 +54,19 @@ public class LocusReferenceView extends ReferenceView { return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0); } + /** + * Allow the user to pull reference info from any arbitrary region of the reference. + * Assume the user has already performed all necessary bounds checking. + * TODO: This function is nearly identical to that in the ReadReferenceView. Merge the common functionality. + * @param genomeLoc The locus. + * @return A list of the bases starting at the start of the locus (inclusive) and ending + * at the end of the locus (inclusive). + */ + public char[] getReferenceBases( GenomeLoc genomeLoc ) { + ReferenceSequence subsequence = reference.getSubsequenceAt(genomeLoc.getContig(),genomeLoc.getStart(),genomeLoc.getStop()); + return StringUtil.bytesToString(subsequence.getBases()).toCharArray(); + } + /** * Validates that the genomeLoc is one base wide and is in the reference sequence. * @param genomeLoc location to verify. diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java new file mode 100755 index 000000000..dc1f62f21 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java @@ -0,0 +1,97 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.datasources.shards.Shard; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.*; +import java.io.File; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Apr 23, 2009 + * Time: 10:26:03 AM + * To change this template use File | Settings | File Templates. + */ +public class TraverseLocusWindows extends TraversalEngine { + + public TraverseLocusWindows(List reads, File ref, List> rods) { + super(reads, ref, rods); + } + + public T traverse( Walker walker, + Shard shard, + ShardDataProvider dataProvider, + T sum ) { + + if ( !(walker instanceof LocusWindowWalker) ) + throw new IllegalArgumentException("Walker isn't a locus window walker!"); + + LocusWindowWalker locusWindowWalker = (LocusWindowWalker)walker; + + GenomeLoc interval = shard.getGenomeLoc(); + + ReadView readView = new ReadView( dataProvider ); + LocusReferenceView referenceView = new LocusReferenceView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + + LocusContext locus = getLocusContext(readView.iterator(), interval); + + String referenceSubsequence = new String(referenceView.getReferenceBases(interval)); + + // Iterate forward to get all reference ordered data covering this interval + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); + + // + // Execute our contract with the walker. Call filter, map, and reduce + // + final boolean keepMeP = locusWindowWalker.filter(tracker, referenceSubsequence, locus); + if (keepMeP) { + M x = locusWindowWalker.map(tracker, referenceSubsequence, locus); + sum = locusWindowWalker.reduce(x, sum); + } + + printProgress("intervals", locus.getLocation()); + + return sum; + } + + private LocusContext getLocusContext(Iterator readIter, GenomeLoc interval) { + ArrayList reads = new ArrayList(); + boolean done = false; + long leftmostIndex = interval.getStart(), + rightmostIndex = interval.getStop(); + while (readIter.hasNext() && !done) { + TraversalStatistics.nRecords++; + + SAMRecord read = readIter.next(); + reads.add(read); + if ( read.getAlignmentStart() < leftmostIndex ) + leftmostIndex = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > rightmostIndex ) + rightmostIndex = read.getAlignmentEnd(); + if ( this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads ) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); + done = true; + } + } + + GenomeLoc window = GenomeLocParser.createGenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex); + LocusContext locus = new LocusContext(window, reads, null); + if ( DOWNSAMPLE_BY_COVERAGE ) + locus.downsampleToCoverage(downsamplingCoverage); + + return locus; + } + +}