Lazy reference loading; the engine doesn't fetch the reference bases until you actually call ref.getBases(). With the new hidden --dontUpdateUG to table recalibrator this is 2-3x faster than before. Enabled for locus, read, and rod walkers.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4042 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-08-16 13:46:22 +00:00
parent 9ab647b730
commit 20db00a3e8
5 changed files with 99 additions and 64 deletions

View File

@ -52,34 +52,77 @@ public class ReferenceContext {
private GenomeLoc window; private GenomeLoc window;
/** /**
* The bases in the window around the current locus. * The bases in the window around the current locus. If null, then bases haven't been fetched yet
*/ */
private byte[] bases; private byte[] basesCache = null;
/**
* Lazy loader to fetch reference bases
*/
private ReferenceContextRefProvider basesProvider;
/** /**
* A cache of the bases converted to characters for walkers not yet using byte[] interface * A cache of the bases converted to characters for walkers not yet using byte[] interface
*/ */
private char[] basesAsCharCached = null; private char[] basesAsCharCached = null;
/**
* Interface to create byte[] contexts for lazy loading of the reference
*/
public static interface ReferenceContextRefProvider {
/**
* You must provide a routine that gets the byte[] bases that would have been passed into the
* ReferenceContext. The RC will handling caching. The value of this interface and routine is
* that it is only called when the bytes are actually requested by the walker, not up front. So
* if the walker doesn't need the refBases for whatever reason, there's no overhead to
* provide them.
*
* @return
*/
public byte[] getBases();
}
private static class ForwardingProvider implements ReferenceContextRefProvider {
byte[] bases;
public ForwardingProvider( byte base ) {
this(new byte[] { base });
}
public ForwardingProvider( byte[] bases ) {
this.bases = bases;
}
public byte[] getBases() { return bases; }
}
/** /**
* Contructor for a simple, windowless reference context. * Contructor for a simple, windowless reference context.
* @param locus locus of interest. * @param locus locus of interest.
* @param base reference base at that locus. * @param base reference base at that locus.
*/ */
public ReferenceContext( GenomeLoc locus, byte base ) { public ReferenceContext( GenomeLoc locus, byte base ) {
this( locus, locus, new byte[] { UPPERCASE_REFERENCE ? StringUtil.toUpperCase(base) : base } ); this( locus, locus, new ForwardingProvider(base) );
} }
// todo -- this really should take the referenceview as an option and only grab the bases if necessary
public ReferenceContext( GenomeLoc locus, GenomeLoc window, byte[] bases ) { public ReferenceContext( GenomeLoc locus, GenomeLoc window, byte[] bases ) {
this( locus, window, new ForwardingProvider(bases) );
}
public ReferenceContext( GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) {
// if( !window.containsP(locus) ) // if( !window.containsP(locus) )
// throw new StingException("Invalid locus or window; window does not contain locus"); // throw new StingException("Invalid locus or window; window does not contain locus");
this.locus = locus; this.locus = locus;
this.window = window; this.window = window;
this.bases = bases; this.basesProvider = basesProvider;
}
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(bases); private void fetchBasesFromProvider() {
if ( basesCache == null ) {
basesCache = basesProvider.getBases();
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache);
}
} }
/** /**
@ -99,7 +142,7 @@ public class ReferenceContext {
* @return The base at the given locus from the reference. * @return The base at the given locus from the reference.
*/ */
public byte getBase() { public byte getBase() {
return bases[(int)(locus.getStart() - window.getStart())]; return getBases()[(int)(locus.getStart() - window.getStart())];
} }
@Deprecated @Deprecated
@ -121,13 +164,14 @@ public class ReferenceContext {
* contain only the base at the given locus. * contain only the base at the given locus.
*/ */
public byte[] getBases() { public byte[] getBases() {
return bases; fetchBasesFromProvider();
return basesCache;
} }
@Deprecated @Deprecated
public char[] getBasesAsChars() { public char[] getBasesAsChars() {
if ( basesAsCharCached == null ) if ( basesAsCharCached == null )
basesAsCharCached = new String(bases).toCharArray(); basesAsCharCached = new String(getBases()).toCharArray();
return basesAsCharCached; return basesAsCharCached;
} }
@ -140,7 +184,7 @@ public class ReferenceContext {
* @return * @return
*/ */
public byte[] getBasesAtLocus(int n) { public byte[] getBasesAtLocus(int n) {
byte[] bases = getBases();
int start = (int)(locus.getStart()-window.getStart()); int start = (int)(locus.getStart()-window.getStart());
int stop = ( n==(-1) ? bases.length : start+n ); int stop = ( n==(-1) ? bases.length : start+n );
@ -153,9 +197,4 @@ public class ReferenceContext {
for ( int j = start ; j < stop ; j++) b[i++]=bases[j]; for ( int j = start ; j < stop ; j++) b[i++]=bases[j];
return b; return b;
} }
@Deprecated
public char[] getBasesAtLocusAsChar(int n) {
return new String(getBasesAtLocus(n)).toCharArray();
}
} }

View File

@ -165,6 +165,22 @@ public class LocusReferenceView extends ReferenceView {
return l; return l;
} }
public class Provider implements ReferenceContext.ReferenceContextRefProvider {
int refStart, len;
public Provider( int refStart, int len ) {
this.refStart = refStart;
this.len = len;
}
public byte[] getBases() {
//System.out.printf("Getting bases for location%n");
byte[] bases = new byte[len];
System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len);
return bases;
}
}
/** /**
* Gets the reference context associated with this particular point or extended interval on the genome. * Gets the reference context associated with this particular point or extended interval on the genome.
* @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beoynd current bounds, it will be trimmed down. * @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beoynd current bounds, it will be trimmed down.
@ -186,32 +202,10 @@ public class LocusReferenceView extends ReferenceView {
refStart = (int)window.getStart()-1; refStart = (int)window.getStart()-1;
} }
// todo -- how often is this copy unnecessary?
int len = (int)window.size(); int len = (int)window.size();
byte[] bases = new byte[len]; return new ReferenceContext( genomeLoc, window, new Provider(refStart, len));
System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len);
return new ReferenceContext( genomeLoc, window, bases );
} }
// public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) {
// //validateLocation( genomeLoc );
//
// GenomeLoc window = GenomeLocParser.createGenomeLoc( genomeLoc.getContig(), getWindowStart(genomeLoc), getWindowStop(genomeLoc) );
// char[] bases = null;
//
// if(bounds != null) {
// window = trimToBounds(window);
// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)(window.getStart() - getWindowStart(bounds)), (int)window.size() ).toCharArray();
// }
// else {
// if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex())
// referenceSequence = reference.getSequence(genomeLoc.getContig());
// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)window.getStart()-1, (int)window.size()).toCharArray();
// }
// return new ReferenceContext( genomeLoc, window, bases );
// }
/** /**
* Allow the user to pull reference info from any arbitrary region of the reference. * Allow the user to pull reference info from any arbitrary region of the reference.
* @param genomeLoc The locus. * @param genomeLoc The locus.

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.StringUtil; import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
@ -52,26 +53,29 @@ public class ReadReferenceView extends ReferenceView {
super(provider); super(provider);
} }
/** protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) {
* Gets the bases of the reference that are aligned to the given read. return new Provider(genomeLoc);
* }
* @param read the read for which to extract reference information.
* public class Provider implements ReferenceContext.ReferenceContextRefProvider {
* @return The bases corresponding to this read, or null if the read is unmapped. GenomeLoc loc;
* If the alignment goes off the end of the contig, return just the portion
* mapped to the reference, followed by X's coresponding to the rest of the read. public Provider( GenomeLoc loc ) {
* This indicates that the rest lies off the end of the contig. this.loc = loc;
*/ }
// public char[] getReferenceBases( SAMRecord read ) {
// if (read.getReadUnmappedFlag()) public byte[] getBases() {
// return null; // System.out.printf("Getting bases for location %s%n", loc);
// return getReferenceBases( GenomeLocParser.createGenomeLoc(read) ); // throw new StingException("x");
// } return getReferenceBases(loc);
}
}
public ReferenceContext getReferenceContext( SAMRecord read ) { public ReferenceContext getReferenceContext( SAMRecord read ) {
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
byte[] bases = super.getReferenceBases(loc); // byte[] bases = super.getReferenceBases(loc);
return new ReferenceContext( loc, loc, bases ); // return new ReferenceContext( loc, loc, bases );
return new ReferenceContext( loc, loc, getReferenceBasesProvider(loc) );
} }
} }

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers; package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
@ -64,14 +65,6 @@ public class ReferenceView implements View {
* @return A list of the bases starting at the start of the locus (inclusive) and ending * @return A list of the bases starting at the start of the locus (inclusive) and ending
* at the end of the locus (inclusive). * at the end of the locus (inclusive).
*/ */
// protected char[] getReferenceBasesAsChars( GenomeLoc genomeLoc ) {
//// SAMSequenceRecord sequenceInfo = reference.getSequenceDictionary().getSequence(genomeLoc.getContig());
//// long stop = Math.min( genomeLoc.getStop(), sequenceInfo.getSequenceLength() );
//// ReferenceSequence subsequence = reference.getSubsequenceAt(genomeLoc.getContig(),genomeLoc.getStart(),stop);
//// return (StringUtil.bytesToString(subsequence.getBases()) + Utils.dupString('X', (int)(genomeLoc.getStop() - stop)) ).toCharArray();
// return new String(getReferenceBases(genomeLoc)).toCharArray();
// }
final static int BUFFER = 10000; final static int BUFFER = 10000;
final static byte[] Xs = new byte[BUFFER]; final static byte[] Xs = new byte[BUFFER];
static { static {

View File

@ -102,6 +102,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.") @Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
private boolean REQUIRE_EOF = false; private boolean REQUIRE_EOF = false;
@Hidden
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false;
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
///////////////////////////// /////////////////////////////
@ -375,7 +380,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals)); read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
} }
if (read.getAttribute(SAMTag.UQ.name()) != null && refBases != null) { if (read.getAttribute(SAMTag.UQ.name()) != null && refBases != null && ! skipUQUpdate ) {
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false)); read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
} }