Windowed access to the reference.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1383 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-08-05 16:29:15 +00:00
parent 65e9dcf5b7
commit 48713e154c
10 changed files with 258 additions and 26 deletions

View File

@ -25,6 +25,9 @@
package org.broadinstitute.sting.gatk.contexts;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
/**
* The section of the reference that overlaps with the given
* read / locus.
@ -34,17 +37,65 @@ package org.broadinstitute.sting.gatk.contexts;
*/
public class ReferenceContext {
private char base;
/**
* The locus.
*/
private GenomeLoc locus;
public ReferenceContext( char base ) {
this.base = base;
/**
* The window of reference information around the current locus.
*/
private GenomeLoc window;
/**
* The bases in the window around the current locus.
*/
private char[] bases;
/**
* Contructor for a simple, windowless reference context.
* @param locus locus of interest.
* @param base reference base at that locus.
*/
public ReferenceContext( GenomeLoc locus, char base ) {
this( locus, locus, new char[] { base } );
}
public ReferenceContext( GenomeLoc locus, GenomeLoc window, char[] bases ) {
if( !window.containsP(locus) )
throw new StingException("Invalid locus or window; window does not contain locus");
this.locus = locus;
this.window = window;
this.bases = bases;
}
/**
* The locus currently being examined.
* @return The current locus.
*/
public GenomeLoc getLocus() {
return locus;
}
public GenomeLoc getWindow() {
return window;
}
/**
* Get the base at the given locus.
* @return The base at the given locus from the reference.§
* @return The base at the given locus from the reference.
*/
public char getBase() {
return base;
return bases[(int)(locus.getStart() - window.getStart())];
}
/**
* All the bases in the window currently being examined.
* @return All bases available. If the window is of size [0,0], the array will
* contain only the base at the given locus.
*/
public char[] getBases() {
return bases;
}
}

View File

@ -1,10 +1,14 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
import net.sf.samtools.SAMSequenceRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -39,6 +43,19 @@ public class LocusReferenceView extends ReferenceView {
*/
private final GenomeLoc bounds;
/**
* Start of the expanded window for which the reference context should be provided,
* relative to the locus in question.
*/
private final int windowStart;
/**
* Start of the expanded window for which the reference context should be provided,
* relative to the locus in question.
*/
private final int windowStop;
/**
* Track the reference sequence and the last point accessed. Used to
* track state when traversing over the reference.
@ -46,26 +63,65 @@ public class LocusReferenceView extends ReferenceView {
private ReferenceSequence referenceSequence;
/**
* Create a new locus reference view.
* @param provider source for locus data.
* Create a LocusReferenceView given no other contextual information about
* the walkers, etc.
* @param provider source for locus data.
*/
public LocusReferenceView( ShardDataProvider provider ) {
super( provider );
super(provider);
bounds = provider.getShard().getGenomeLoc();
this.referenceSequence = reference.getSubsequenceAt( bounds.getContig(),
bounds.getStart(),
bounds.getStop() );
windowStart = windowStop = 0;
initializeReferenceSequence(bounds);
}
/**
* Gets the reference base associated with this particular point on the genome.
* Create a new locus reference view.
* @param provider source for locus data.
*/
public LocusReferenceView( Walker walker, ShardDataProvider provider ) {
super( provider );
bounds = provider.getShard().getGenomeLoc();
// Retrieve information about the window being accessed.
if( walker.getClass().isAnnotationPresent(Reference.class) ) {
Window window = walker.getClass().getAnnotation(Reference.class).window();
if( window.start() > 0 ) throw new StingException( "Reference window starts after current locus" );
if( window.stop() < 0 ) throw new StingException( "Reference window ends before current locus" );
windowStart = window.start();
windowStop = window.stop();
}
else {
windowStart = 0;
windowStop = 0;
}
long expandedStart = getWindowStart( bounds );
long expandedStop = getWindowStop( bounds );
initializeReferenceSequence(GenomeLocParser.createGenomeLoc(bounds.getContig(), expandedStart, expandedStop));
}
/**
* Initialize reference sequence data using the given locus.
* @param locus
*/
private void initializeReferenceSequence( GenomeLoc locus ) {
this.referenceSequence = reference.getSubsequenceAt( locus.getContig(), locus.getStart(), locus.getStop() );
}
/**
* Gets the reference context associated with this particular point on the genome.
* @param genomeLoc Region for which to retrieve the base. GenomeLoc must represent a 1-base region.
* @return The base at the position represented by this genomeLoc.
*/
public char getReferenceBase( GenomeLoc genomeLoc ) {
public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) {
validateLocation( genomeLoc );
int offset = (int)(genomeLoc.getStart() - bounds.getStart());
return StringUtil.bytesToString( referenceSequence.getBases(), offset, 1 ).charAt(0);
GenomeLoc window = GenomeLocParser.createGenomeLoc( genomeLoc.getContig(), getWindowStart(genomeLoc), getWindowStop(genomeLoc) );
char[] bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)(window.getStart() - getWindowStart(bounds)), (int)window.size() ).toCharArray();
return new ReferenceContext( genomeLoc, window, bases );
}
/**
@ -91,4 +147,22 @@ public class LocusReferenceView extends ReferenceView {
throw new InvalidPositionException(
String.format("Requested position %s not within interval %s", genomeLoc, bounds));
}
/**
* Gets the start of the expanded window, bounded if necessary by the contig.
* @param locus The locus to expand.
* @return The expanded window.
*/
private long getWindowStart( GenomeLoc locus ) {
return Math.max( locus.getStart() + windowStart, 1 );
}
/**
* Gets the stop of the expanded window, bounded if necessary by the contig.
* @param locus The locus to expand.
* @return The expanded window.
*/
private long getWindowStop( GenomeLoc locus ) {
return Math.min( locus.getStop() + windowStop, reference.getSequenceDictionary().getSequence(locus.getContig()).getSequenceLength() );
}
}

View File

@ -45,7 +45,7 @@ public class TraverseLoci extends TraversalEngine {
LocusWalker<M, T> locusWalker = (LocusWalker<M, T>)walker;
LocusView locusView = getLocusView( walker, dataProvider );
LocusReferenceView referenceView = new LocusReferenceView( dataProvider );
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
// We keep processing while the next reference location is within the interval
@ -57,7 +57,7 @@ public class TraverseLoci extends TraversalEngine {
// Iterate forward to get all reference ordered data covering this locus
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation());
ReferenceContext refContext = new ReferenceContext( referenceView.getReferenceBase(locus.getLocation()) );
ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation());
final boolean keepMeP = locusWalker.filter(tracker, refContext, locus);
if (keepMeP) {

View File

@ -38,7 +38,7 @@ public class TraverseLocusWindows extends TraversalEngine {
GenomeLoc interval = shard.getGenomeLoc();
ReadView readView = new ReadView( dataProvider );
LocusReferenceView referenceView = new LocusReferenceView( dataProvider );
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
AlignmentContext locus = getLocusContext(readView.iterator(), interval);

View File

@ -18,7 +18,7 @@ import net.sf.samtools.SAMRecord;
*/
public class PrintLocusContextWalker extends LocusWalker<AlignmentContext, Integer> implements TreeReducible<Integer> {
public AlignmentContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
out.printf( "In map: ref = %s, loc = %s, reads = %s%n", ref,
out.printf( "In map: ref = %s, loc = %s, reads = %s%n", ref.getBase(),
context.getLocation(),
Arrays.deepToString( getReadNames(context.getReads()) ) );
return context;

View File

@ -22,7 +22,6 @@ import java.lang.annotation.*;
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface RMD {
String name();
Class<? extends ReferenceOrderedDatum> type();

View File

@ -0,0 +1,47 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers;
import java.lang.annotation.*;
/**
* Describes presentation, capabilities, and limitations of the reference
* provided to the GATK.
*
* @author mhanna
* @version 0.1
*/
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface Reference {
/**
* Specifies the window expansion for the current walker.
* @return The window to which the reference should be expanded. Defaults to [0,0] (no expansion).
*/
public Window window() default @Window;
}

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers;
import java.lang.annotation.*;
/**
* Describes the size of the window into the genome. Has differing semantics based on
* the data this annotation is used to describe.
*
* @author mhanna
* @version 0.1
*/
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
public @interface Window {
/**
* Controls where the window should start and stop relative to
* the locus currently being processed.
* @return start point; default is 0, indicating 'supply only the reference base at the current locus'.
*/
public int start() default 0;
/**
* Controls where the window should start and stop relative to
* the locus currently being processed.
* @return stop point; default is 0, indicating 'supply only the reference base at the current locus'.
*/
public int stop() default 0;
}

View File

@ -271,6 +271,13 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
return 0;
}
/**
* How many BPs are covered by this locus?
* @return Number of BPs covered by this locus. According to the semantics of GenomeLoc, this should
* never be < 1.
*/
public long size() {
return stop - start + 1;
}
}

View File

@ -47,7 +47,7 @@ public class LocusReferenceViewTest extends ReferenceViewTemplate {
ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null);
LocusReferenceView view = new LocusReferenceView(dataProvider);
view.getReferenceBase(shard.getGenomeLoc());
view.getReferenceContext(shard.getGenomeLoc()).getBase();
}
@Test
@ -74,7 +74,7 @@ public class LocusReferenceViewTest extends ReferenceViewTemplate {
ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null);
LocusReferenceView view = new LocusReferenceView(dataProvider);
view.getReferenceBase(GenomeLocParser.createGenomeLoc(0, 51));
view.getReferenceContext(GenomeLocParser.createGenomeLoc(0, 51)).getBase();
}
@ -95,7 +95,7 @@ public class LocusReferenceViewTest extends ReferenceViewTemplate {
ReferenceSequence expectedAsSeq = sequenceFile.getSubsequenceAt(locus.getContig(), locus.getStart(), locus.getStop());
char expected = StringUtil.bytesToString(expectedAsSeq.getBases()).charAt(0);
char actual = view.getReferenceBase(locus);
char actual = view.getReferenceContext(locus).getBase();
Assert.assertEquals(String.format("Value of base at position %s in shard %s does not match expected", locus.toString(), shard.getGenomeLoc()),
expected,