From 48713e154c4efb332fced0d2dbbb118b1b29f96a Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 5 Aug 2009 16:29:15 +0000 Subject: [PATCH] Windowed access to the reference. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1383 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/contexts/ReferenceContext.java | 61 +++++++++++- .../providers/LocusReferenceView.java | 98 ++++++++++++++++--- .../sting/gatk/traversals/TraverseLoci.java | 4 +- .../gatk/traversals/TraverseLocusWindows.java | 2 +- .../gatk/walkers/PrintLocusContextWalker.java | 2 +- .../sting/gatk/walkers/RMD.java | 1 - .../sting/gatk/walkers/Reference.java | 47 +++++++++ .../sting/gatk/walkers/Window.java | 54 ++++++++++ .../broadinstitute/sting/utils/GenomeLoc.java | 9 +- .../providers/LocusReferenceViewTest.java | 6 +- 10 files changed, 258 insertions(+), 26 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/Reference.java create mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/Window.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index ab7879b76..912338d5a 100644 --- a/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -25,6 +25,9 @@ package org.broadinstitute.sting.gatk.contexts; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; + /** * The section of the reference that overlaps with the given * read / locus. @@ -34,17 +37,65 @@ package org.broadinstitute.sting.gatk.contexts; */ public class ReferenceContext { - private char base; + /** + * The locus. + */ + private GenomeLoc locus; - public ReferenceContext( char base ) { - this.base = base; + /** + * The window of reference information around the current locus. + */ + private GenomeLoc window; + + /** + * The bases in the window around the current locus. + */ + private char[] bases; + + /** + * Contructor for a simple, windowless reference context. + * @param locus locus of interest. + * @param base reference base at that locus. + */ + public ReferenceContext( GenomeLoc locus, char base ) { + this( locus, locus, new char[] { base } ); + } + + public ReferenceContext( GenomeLoc locus, GenomeLoc window, char[] bases ) { + if( !window.containsP(locus) ) + throw new StingException("Invalid locus or window; window does not contain locus"); + + this.locus = locus; + this.window = window; + this.bases = bases; + } + + /** + * The locus currently being examined. + * @return The current locus. + */ + public GenomeLoc getLocus() { + return locus; + } + + public GenomeLoc getWindow() { + return window; } /** * Get the base at the given locus. - * @return The base at the given locus from the reference.§ + * @return The base at the given locus from the reference. */ public char getBase() { - return base; + return bases[(int)(locus.getStart() - window.getStart())]; + } + + /** + * All the bases in the window currently being examined. + * @return All bases available. If the window is of size [0,0], the array will + * contain only the base at the given locus. + */ + public char[] getBases() { + return bases; } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java index 69c50f2cd..b1eae4a0a 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java @@ -1,10 +1,14 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.util.StringUtil; -import net.sf.samtools.SAMSequenceRecord; /* * Copyright (c) 2009 The Broad Institute * @@ -39,6 +43,19 @@ public class LocusReferenceView extends ReferenceView { */ private final GenomeLoc bounds; + /** + * Start of the expanded window for which the reference context should be provided, + * relative to the locus in question. + */ + private final int windowStart; + + + /** + * Start of the expanded window for which the reference context should be provided, + * relative to the locus in question. + */ + private final int windowStop; + /** * Track the reference sequence and the last point accessed. Used to * track state when traversing over the reference. @@ -46,26 +63,65 @@ public class LocusReferenceView extends ReferenceView { private ReferenceSequence referenceSequence; /** - * Create a new locus reference view. - * @param provider source for locus data. + * Create a LocusReferenceView given no other contextual information about + * the walkers, etc. + * @param provider source for locus data. */ public LocusReferenceView( ShardDataProvider provider ) { - super( provider ); + super(provider); bounds = provider.getShard().getGenomeLoc(); - this.referenceSequence = reference.getSubsequenceAt( bounds.getContig(), - bounds.getStart(), - bounds.getStop() ); + windowStart = windowStop = 0; + initializeReferenceSequence(bounds); } /** - * Gets the reference base associated with this particular point on the genome. + * Create a new locus reference view. + * @param provider source for locus data. + */ + public LocusReferenceView( Walker walker, ShardDataProvider provider ) { + super( provider ); + bounds = provider.getShard().getGenomeLoc(); + + // Retrieve information about the window being accessed. + if( walker.getClass().isAnnotationPresent(Reference.class) ) { + Window window = walker.getClass().getAnnotation(Reference.class).window(); + + if( window.start() > 0 ) throw new StingException( "Reference window starts after current locus" ); + if( window.stop() < 0 ) throw new StingException( "Reference window ends before current locus" ); + + windowStart = window.start(); + windowStop = window.stop(); + } + else { + windowStart = 0; + windowStop = 0; + } + + long expandedStart = getWindowStart( bounds ); + long expandedStop = getWindowStop( bounds ); + + initializeReferenceSequence(GenomeLocParser.createGenomeLoc(bounds.getContig(), expandedStart, expandedStop)); + } + + /** + * Initialize reference sequence data using the given locus. + * @param locus + */ + private void initializeReferenceSequence( GenomeLoc locus ) { + this.referenceSequence = reference.getSubsequenceAt( locus.getContig(), locus.getStart(), locus.getStop() ); + } + + /** + * Gets the reference context associated with this particular point on the genome. * @param genomeLoc Region for which to retrieve the base. GenomeLoc must represent a 1-base region. * @return The base at the position represented by this genomeLoc. */ - public char getReferenceBase( GenomeLoc genomeLoc ) { + public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) { validateLocation( genomeLoc ); - int offset = (int)(genomeLoc.getStart() - bounds.getStart()); - return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0); + + GenomeLoc window = GenomeLocParser.createGenomeLoc( genomeLoc.getContig(), getWindowStart(genomeLoc), getWindowStop(genomeLoc) ); + char[] bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)(window.getStart() - getWindowStart(bounds)), (int)window.size() ).toCharArray(); + return new ReferenceContext( genomeLoc, window, bases ); } /** @@ -91,4 +147,22 @@ public class LocusReferenceView extends ReferenceView { throw new InvalidPositionException( String.format("Requested position %s not within interval %s", genomeLoc, bounds)); } + + /** + * Gets the start of the expanded window, bounded if necessary by the contig. + * @param locus The locus to expand. + * @return The expanded window. + */ + private long getWindowStart( GenomeLoc locus ) { + return Math.max( locus.getStart() + windowStart, 1 ); + } + + /** + * Gets the stop of the expanded window, bounded if necessary by the contig. + * @param locus The locus to expand. + * @return The expanded window. + */ + private long getWindowStop( GenomeLoc locus ) { + return Math.min( locus.getStop() + windowStop, reference.getSequenceDictionary().getSequence(locus.getContig()).getSequenceLength() ); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java index 436454e5d..1d9038c83 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java @@ -45,7 +45,7 @@ public class TraverseLoci extends TraversalEngine { LocusWalker locusWalker = (LocusWalker)walker; LocusView locusView = getLocusView( walker, dataProvider ); - LocusReferenceView referenceView = new LocusReferenceView( dataProvider ); + LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); // We keep processing while the next reference location is within the interval @@ -57,7 +57,7 @@ public class TraverseLoci extends TraversalEngine { // Iterate forward to get all reference ordered data covering this locus final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); - ReferenceContext refContext = new ReferenceContext( referenceView.getReferenceBase(locus.getLocation()) ); + ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation()); final boolean keepMeP = locusWalker.filter(tracker, refContext, locus); if (keepMeP) { diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java index d08fa7b28..80d04cdb3 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java @@ -38,7 +38,7 @@ public class TraverseLocusWindows extends TraversalEngine { GenomeLoc interval = shard.getGenomeLoc(); ReadView readView = new ReadView( dataProvider ); - LocusReferenceView referenceView = new LocusReferenceView( dataProvider ); + LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); AlignmentContext locus = getLocusContext(readView.iterator(), interval); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PrintLocusContextWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PrintLocusContextWalker.java index 1eac9bade..f4dbca939 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PrintLocusContextWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PrintLocusContextWalker.java @@ -18,7 +18,7 @@ import net.sf.samtools.SAMRecord; */ public class PrintLocusContextWalker extends LocusWalker implements TreeReducible { public AlignmentContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - out.printf( "In map: ref = %s, loc = %s, reads = %s%n", ref, + out.printf( "In map: ref = %s, loc = %s, reads = %s%n", ref.getBase(), context.getLocation(), Arrays.deepToString( getReadNames(context.getReads()) ) ); return context; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java b/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java index db5e21d6d..004f0a2ba 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java @@ -22,7 +22,6 @@ import java.lang.annotation.*; @Documented @Inherited @Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) public @interface RMD { String name(); Class type(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java b/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java new file mode 100644 index 000000000..4f2a2f2f9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers; + +import java.lang.annotation.*; + +/** + * Describes presentation, capabilities, and limitations of the reference + * provided to the GATK. + * + * @author mhanna + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Reference { + /** + * Specifies the window expansion for the current walker. + * @return The window to which the reference should be expanded. Defaults to [0,0] (no expansion). + */ + public Window window() default @Window; +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/Window.java b/java/src/org/broadinstitute/sting/gatk/walkers/Window.java new file mode 100644 index 000000000..0b718071d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/Window.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers; + +import java.lang.annotation.*; + +/** + * Describes the size of the window into the genome. Has differing semantics based on + * the data this annotation is used to describe. + * + * @author mhanna + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +public @interface Window { + /** + * Controls where the window should start and stop relative to + * the locus currently being processed. + * @return start point; default is 0, indicating 'supply only the reference base at the current locus'. + */ + public int start() default 0; + + /** + * Controls where the window should start and stop relative to + * the locus currently being processed. + * @return stop point; default is 0, indicating 'supply only the reference base at the current locus'. + */ + public int stop() default 0; +} diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index eaf369ba4..d2fff00af 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -271,6 +271,13 @@ public class GenomeLoc implements Comparable, Cloneable { return 0; } - + /** + * How many BPs are covered by this locus? + * @return Number of BPs covered by this locus. According to the semantics of GenomeLoc, this should + * never be < 1. + */ + public long size() { + return stop - start + 1; + } } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewTest.java index a5bc268f2..6cc560d3f 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewTest.java @@ -47,7 +47,7 @@ public class LocusReferenceViewTest extends ReferenceViewTemplate { ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null); LocusReferenceView view = new LocusReferenceView(dataProvider); - view.getReferenceBase(shard.getGenomeLoc()); + view.getReferenceContext(shard.getGenomeLoc()).getBase(); } @Test @@ -74,7 +74,7 @@ public class LocusReferenceViewTest extends ReferenceViewTemplate { ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null); LocusReferenceView view = new LocusReferenceView(dataProvider); - view.getReferenceBase(GenomeLocParser.createGenomeLoc(0, 51)); + view.getReferenceContext(GenomeLocParser.createGenomeLoc(0, 51)).getBase(); } @@ -95,7 +95,7 @@ public class LocusReferenceViewTest extends ReferenceViewTemplate { ReferenceSequence expectedAsSeq = sequenceFile.getSubsequenceAt(locus.getContig(), locus.getStart(), locus.getStop()); char expected = StringUtil.bytesToString(expectedAsSeq.getBases()).charAt(0); - char actual = view.getReferenceBase(locus); + char actual = view.getReferenceContext(locus).getBase(); Assert.assertEquals(String.format("Value of base at position %s in shard %s does not match expected", locus.toString(), shard.getGenomeLoc()), expected,