From 20db00a3e85e0bed467bb7dd626c758195409ce3 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 16 Aug 2010 13:46:22 +0000 Subject: [PATCH] Lazy reference loading; the engine doesn't fetch the reference bases until you actually call ref.getBases(). With the new hidden --dontUpdateUG to table recalibrator this is 2-3x faster than before. Enabled for locus, read, and rod walkers. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4042 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/contexts/ReferenceContext.java | 69 +++++++++++++++---- .../providers/LocusReferenceView.java | 40 +++++------ .../providers/ReadReferenceView.java | 38 +++++----- .../datasources/providers/ReferenceView.java | 9 +-- .../TableRecalibrationWalker.java | 7 +- 5 files changed, 99 insertions(+), 64 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index 84425be72..1acd4082f 100644 --- a/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -52,34 +52,77 @@ public class ReferenceContext { private GenomeLoc window; /** - * The bases in the window around the current locus. + * The bases in the window around the current locus. If null, then bases haven't been fetched yet */ - private byte[] bases; + private byte[] basesCache = null; + + /** + * Lazy loader to fetch reference bases + */ + private ReferenceContextRefProvider basesProvider; /** * A cache of the bases converted to characters for walkers not yet using byte[] interface */ private char[] basesAsCharCached = null; + /** + * Interface to create byte[] contexts for lazy loading of the reference + */ + public static interface ReferenceContextRefProvider { + /** + * You must provide a routine that gets the byte[] bases that would have been passed into the + * ReferenceContext. The RC will handling caching. The value of this interface and routine is + * that it is only called when the bytes are actually requested by the walker, not up front. So + * if the walker doesn't need the refBases for whatever reason, there's no overhead to + * provide them. + * + * @return + */ + public byte[] getBases(); + } + + private static class ForwardingProvider implements ReferenceContextRefProvider { + byte[] bases; + + public ForwardingProvider( byte base ) { + this(new byte[] { base }); + } + + public ForwardingProvider( byte[] bases ) { + this.bases = bases; + } + + public byte[] getBases() { return bases; } + } + /** * Contructor for a simple, windowless reference context. * @param locus locus of interest. * @param base reference base at that locus. */ public ReferenceContext( GenomeLoc locus, byte base ) { - this( locus, locus, new byte[] { UPPERCASE_REFERENCE ? StringUtil.toUpperCase(base) : base } ); + this( locus, locus, new ForwardingProvider(base) ); } - // todo -- this really should take the referenceview as an option and only grab the bases if necessary public ReferenceContext( GenomeLoc locus, GenomeLoc window, byte[] bases ) { + this( locus, window, new ForwardingProvider(bases) ); + } + + public ReferenceContext( GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) { // if( !window.containsP(locus) ) // throw new StingException("Invalid locus or window; window does not contain locus"); this.locus = locus; this.window = window; - this.bases = bases; + this.basesProvider = basesProvider; + } - if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(bases); + private void fetchBasesFromProvider() { + if ( basesCache == null ) { + basesCache = basesProvider.getBases(); + if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache); + } } /** @@ -99,7 +142,7 @@ public class ReferenceContext { * @return The base at the given locus from the reference. */ public byte getBase() { - return bases[(int)(locus.getStart() - window.getStart())]; + return getBases()[(int)(locus.getStart() - window.getStart())]; } @Deprecated @@ -121,13 +164,14 @@ public class ReferenceContext { * contain only the base at the given locus. */ public byte[] getBases() { - return bases; + fetchBasesFromProvider(); + return basesCache; } @Deprecated public char[] getBasesAsChars() { if ( basesAsCharCached == null ) - basesAsCharCached = new String(bases).toCharArray(); + basesAsCharCached = new String(getBases()).toCharArray(); return basesAsCharCached; } @@ -140,7 +184,7 @@ public class ReferenceContext { * @return */ public byte[] getBasesAtLocus(int n) { - + byte[] bases = getBases(); int start = (int)(locus.getStart()-window.getStart()); int stop = ( n==(-1) ? bases.length : start+n ); @@ -153,9 +197,4 @@ public class ReferenceContext { for ( int j = start ; j < stop ; j++) b[i++]=bases[j]; return b; } - - @Deprecated - public char[] getBasesAtLocusAsChar(int n) { - return new String(getBasesAtLocus(n)).toCharArray(); - } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java index ff29d9317..e17eccac3 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java @@ -165,6 +165,22 @@ public class LocusReferenceView extends ReferenceView { return l; } + public class Provider implements ReferenceContext.ReferenceContextRefProvider { + int refStart, len; + + public Provider( int refStart, int len ) { + this.refStart = refStart; + this.len = len; + } + + public byte[] getBases() { + //System.out.printf("Getting bases for location%n"); + byte[] bases = new byte[len]; + System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len); + return bases; + } + } + /** * Gets the reference context associated with this particular point or extended interval on the genome. * @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beoynd current bounds, it will be trimmed down. @@ -186,32 +202,10 @@ public class LocusReferenceView extends ReferenceView { refStart = (int)window.getStart()-1; } - // todo -- how often is this copy unnecessary? int len = (int)window.size(); - byte[] bases = new byte[len]; - System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len); - return new ReferenceContext( genomeLoc, window, bases ); + return new ReferenceContext( genomeLoc, window, new Provider(refStart, len)); } -// public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) { -// //validateLocation( genomeLoc ); -// -// GenomeLoc window = GenomeLocParser.createGenomeLoc( genomeLoc.getContig(), getWindowStart(genomeLoc), getWindowStop(genomeLoc) ); -// char[] bases = null; -// -// if(bounds != null) { -// window = trimToBounds(window); -// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)(window.getStart() - getWindowStart(bounds)), (int)window.size() ).toCharArray(); -// } -// else { -// if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex()) -// referenceSequence = reference.getSequence(genomeLoc.getContig()); -// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)window.getStart()-1, (int)window.size()).toCharArray(); -// } -// return new ReferenceContext( genomeLoc, window, bases ); -// } - - /** * Allow the user to pull reference info from any arbitrary region of the reference. * @param genomeLoc The locus. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java index 6384f7c39..7b52d3711 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import net.sf.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -52,26 +53,29 @@ public class ReadReferenceView extends ReferenceView { super(provider); } - /** - * Gets the bases of the reference that are aligned to the given read. - * - * @param read the read for which to extract reference information. - * - * @return The bases corresponding to this read, or null if the read is unmapped. - * If the alignment goes off the end of the contig, return just the portion - * mapped to the reference, followed by X's coresponding to the rest of the read. - * This indicates that the rest lies off the end of the contig. - */ -// public char[] getReferenceBases( SAMRecord read ) { -// if (read.getReadUnmappedFlag()) -// return null; -// return getReferenceBases( GenomeLocParser.createGenomeLoc(read) ); -// } + protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) { + return new Provider(genomeLoc); + } + + public class Provider implements ReferenceContext.ReferenceContextRefProvider { + GenomeLoc loc; + + public Provider( GenomeLoc loc ) { + this.loc = loc; + } + + public byte[] getBases() { +// System.out.printf("Getting bases for location %s%n", loc); +// throw new StingException("x"); + return getReferenceBases(loc); + } + } public ReferenceContext getReferenceContext( SAMRecord read ) { GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); - byte[] bases = super.getReferenceBases(loc); - return new ReferenceContext( loc, loc, bases ); +// byte[] bases = super.getReferenceBases(loc); +// return new ReferenceContext( loc, loc, bases ); + return new ReferenceContext( loc, loc, getReferenceBasesProvider(loc) ); } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java index 1ce1886bf..be10a8b88 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; @@ -64,14 +65,6 @@ public class ReferenceView implements View { * @return A list of the bases starting at the start of the locus (inclusive) and ending * at the end of the locus (inclusive). */ -// protected char[] getReferenceBasesAsChars( GenomeLoc genomeLoc ) { -//// SAMSequenceRecord sequenceInfo = reference.getSequenceDictionary().getSequence(genomeLoc.getContig()); -//// long stop = Math.min( genomeLoc.getStop(), sequenceInfo.getSequenceLength() ); -//// ReferenceSequence subsequence = reference.getSubsequenceAt(genomeLoc.getContig(),genomeLoc.getStart(),stop); -//// return (StringUtil.bytesToString(subsequence.getBases()) + Utils.dupString('X', (int)(genomeLoc.getStop() - stop)) ).toCharArray(); -// return new String(getReferenceBases(genomeLoc)).toCharArray(); -// } - final static int BUFFER = 10000; final static byte[] Xs = new byte[BUFFER]; static { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index b3d3ab4fd..f64dc2794 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -102,6 +102,11 @@ public class TableRecalibrationWalker extends ReadWalker