Lazy reference loading; the engine doesn't fetch the reference bases until you actually call ref.getBases(). With the new hidden --dontUpdateUG to table recalibrator this is 2-3x faster than before. Enabled for locus, read, and rod walkers.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4042 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-08-16 13:46:22 +00:00
parent 9ab647b730
commit 20db00a3e8
5 changed files with 99 additions and 64 deletions

View File

@ -52,34 +52,77 @@ public class ReferenceContext {
private GenomeLoc window;
/**
* The bases in the window around the current locus.
* The bases in the window around the current locus. If null, then bases haven't been fetched yet
*/
private byte[] bases;
private byte[] basesCache = null;
/**
* Lazy loader to fetch reference bases
*/
private ReferenceContextRefProvider basesProvider;
/**
* A cache of the bases converted to characters for walkers not yet using byte[] interface
*/
private char[] basesAsCharCached = null;
/**
* Interface to create byte[] contexts for lazy loading of the reference
*/
public static interface ReferenceContextRefProvider {
/**
* You must provide a routine that gets the byte[] bases that would have been passed into the
* ReferenceContext. The RC will handling caching. The value of this interface and routine is
* that it is only called when the bytes are actually requested by the walker, not up front. So
* if the walker doesn't need the refBases for whatever reason, there's no overhead to
* provide them.
*
* @return
*/
public byte[] getBases();
}
private static class ForwardingProvider implements ReferenceContextRefProvider {
byte[] bases;
public ForwardingProvider( byte base ) {
this(new byte[] { base });
}
public ForwardingProvider( byte[] bases ) {
this.bases = bases;
}
public byte[] getBases() { return bases; }
}
/**
* Contructor for a simple, windowless reference context.
* @param locus locus of interest.
* @param base reference base at that locus.
*/
public ReferenceContext( GenomeLoc locus, byte base ) {
this( locus, locus, new byte[] { UPPERCASE_REFERENCE ? StringUtil.toUpperCase(base) : base } );
this( locus, locus, new ForwardingProvider(base) );
}
// todo -- this really should take the referenceview as an option and only grab the bases if necessary
public ReferenceContext( GenomeLoc locus, GenomeLoc window, byte[] bases ) {
this( locus, window, new ForwardingProvider(bases) );
}
public ReferenceContext( GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) {
// if( !window.containsP(locus) )
// throw new StingException("Invalid locus or window; window does not contain locus");
this.locus = locus;
this.window = window;
this.bases = bases;
this.basesProvider = basesProvider;
}
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(bases);
private void fetchBasesFromProvider() {
if ( basesCache == null ) {
basesCache = basesProvider.getBases();
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache);
}
}
/**
@ -99,7 +142,7 @@ public class ReferenceContext {
* @return The base at the given locus from the reference.
*/
public byte getBase() {
return bases[(int)(locus.getStart() - window.getStart())];
return getBases()[(int)(locus.getStart() - window.getStart())];
}
@Deprecated
@ -121,13 +164,14 @@ public class ReferenceContext {
* contain only the base at the given locus.
*/
public byte[] getBases() {
return bases;
fetchBasesFromProvider();
return basesCache;
}
@Deprecated
public char[] getBasesAsChars() {
if ( basesAsCharCached == null )
basesAsCharCached = new String(bases).toCharArray();
basesAsCharCached = new String(getBases()).toCharArray();
return basesAsCharCached;
}
@ -140,7 +184,7 @@ public class ReferenceContext {
* @return
*/
public byte[] getBasesAtLocus(int n) {
byte[] bases = getBases();
int start = (int)(locus.getStart()-window.getStart());
int stop = ( n==(-1) ? bases.length : start+n );
@ -153,9 +197,4 @@ public class ReferenceContext {
for ( int j = start ; j < stop ; j++) b[i++]=bases[j];
return b;
}
@Deprecated
public char[] getBasesAtLocusAsChar(int n) {
return new String(getBasesAtLocus(n)).toCharArray();
}
}

View File

@ -165,6 +165,22 @@ public class LocusReferenceView extends ReferenceView {
return l;
}
public class Provider implements ReferenceContext.ReferenceContextRefProvider {
int refStart, len;
public Provider( int refStart, int len ) {
this.refStart = refStart;
this.len = len;
}
public byte[] getBases() {
//System.out.printf("Getting bases for location%n");
byte[] bases = new byte[len];
System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len);
return bases;
}
}
/**
* Gets the reference context associated with this particular point or extended interval on the genome.
* @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beoynd current bounds, it will be trimmed down.
@ -186,32 +202,10 @@ public class LocusReferenceView extends ReferenceView {
refStart = (int)window.getStart()-1;
}
// todo -- how often is this copy unnecessary?
int len = (int)window.size();
byte[] bases = new byte[len];
System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len);
return new ReferenceContext( genomeLoc, window, bases );
return new ReferenceContext( genomeLoc, window, new Provider(refStart, len));
}
// public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) {
// //validateLocation( genomeLoc );
//
// GenomeLoc window = GenomeLocParser.createGenomeLoc( genomeLoc.getContig(), getWindowStart(genomeLoc), getWindowStop(genomeLoc) );
// char[] bases = null;
//
// if(bounds != null) {
// window = trimToBounds(window);
// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)(window.getStart() - getWindowStart(bounds)), (int)window.size() ).toCharArray();
// }
// else {
// if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex())
// referenceSequence = reference.getSequence(genomeLoc.getContig());
// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)window.getStart()-1, (int)window.size()).toCharArray();
// }
// return new ReferenceContext( genomeLoc, window, bases );
// }
/**
* Allow the user to pull reference info from any arbitrary region of the reference.
* @param genomeLoc The locus.

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -52,26 +53,29 @@ public class ReadReferenceView extends ReferenceView {
super(provider);
}
/**
* Gets the bases of the reference that are aligned to the given read.
*
* @param read the read for which to extract reference information.
*
* @return The bases corresponding to this read, or null if the read is unmapped.
* If the alignment goes off the end of the contig, return just the portion
* mapped to the reference, followed by X's coresponding to the rest of the read.
* This indicates that the rest lies off the end of the contig.
*/
// public char[] getReferenceBases( SAMRecord read ) {
// if (read.getReadUnmappedFlag())
// return null;
// return getReferenceBases( GenomeLocParser.createGenomeLoc(read) );
// }
protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) {
return new Provider(genomeLoc);
}
public class Provider implements ReferenceContext.ReferenceContextRefProvider {
GenomeLoc loc;
public Provider( GenomeLoc loc ) {
this.loc = loc;
}
public byte[] getBases() {
// System.out.printf("Getting bases for location %s%n", loc);
// throw new StingException("x");
return getReferenceBases(loc);
}
}
public ReferenceContext getReferenceContext( SAMRecord read ) {
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
byte[] bases = super.getReferenceBases(loc);
return new ReferenceContext( loc, loc, bases );
// byte[] bases = super.getReferenceBases(loc);
// return new ReferenceContext( loc, loc, bases );
return new ReferenceContext( loc, loc, getReferenceBasesProvider(loc) );
}
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
@ -64,14 +65,6 @@ public class ReferenceView implements View {
* @return A list of the bases starting at the start of the locus (inclusive) and ending
* at the end of the locus (inclusive).
*/
// protected char[] getReferenceBasesAsChars( GenomeLoc genomeLoc ) {
//// SAMSequenceRecord sequenceInfo = reference.getSequenceDictionary().getSequence(genomeLoc.getContig());
//// long stop = Math.min( genomeLoc.getStop(), sequenceInfo.getSequenceLength() );
//// ReferenceSequence subsequence = reference.getSubsequenceAt(genomeLoc.getContig(),genomeLoc.getStart(),stop);
//// return (StringUtil.bytesToString(subsequence.getBases()) + Utils.dupString('X', (int)(genomeLoc.getStop() - stop)) ).toCharArray();
// return new String(getReferenceBases(genomeLoc)).toCharArray();
// }
final static int BUFFER = 10000;
final static byte[] Xs = new byte[BUFFER];
static {

View File

@ -102,6 +102,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
private boolean REQUIRE_EOF = false;
@Hidden
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -375,7 +380,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
}
if (read.getAttribute(SAMTag.UQ.name()) != null && refBases != null) {
if (read.getAttribute(SAMTag.UQ.name()) != null && refBases != null && ! skipUQUpdate ) {
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
}