Lazy reference loading: the engine doesn't fetch the reference bases until you actually call ref.getBases(). With the new hidden --skipUQUpdate argument to the table recalibrator, this is 2-3x faster than before. Enabled for locus, read, and rod walkers.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4042 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9ab647b730
commit
20db00a3e8
|
|
@ -52,34 +52,77 @@ public class ReferenceContext {
|
|||
private GenomeLoc window;
|
||||
|
||||
/**
|
||||
* The bases in the window around the current locus.
|
||||
* The bases in the window around the current locus. If null, then bases haven't been fetched yet
|
||||
*/
|
||||
private byte[] bases;
|
||||
private byte[] basesCache = null;
|
||||
|
||||
/**
|
||||
* Lazy loader to fetch reference bases
|
||||
*/
|
||||
private ReferenceContextRefProvider basesProvider;
|
||||
|
||||
/**
|
||||
* A cache of the bases converted to characters for walkers not yet using byte[] interface
|
||||
*/
|
||||
private char[] basesAsCharCached = null;
|
||||
|
||||
/**
|
||||
* Interface to create byte[] contexts for lazy loading of the reference
|
||||
*/
|
||||
public interface ReferenceContextRefProvider {
    /**
     * Produces the byte[] of bases that would otherwise have been passed directly into the
     * ReferenceContext. The ReferenceContext handles caching of the result. The point of this
     * routine is that it is only invoked when a walker actually asks for the bytes, rather
     * than up front, so a walker that never needs the reference bases incurs no overhead
     * for providing them.
     *
     * @return the reference bases for the context
     */
    byte[] getBases();
}
|
||||
|
||||
private static class ForwardingProvider implements ReferenceContextRefProvider {
|
||||
byte[] bases;
|
||||
|
||||
public ForwardingProvider( byte base ) {
|
||||
this(new byte[] { base });
|
||||
}
|
||||
|
||||
public ForwardingProvider( byte[] bases ) {
|
||||
this.bases = bases;
|
||||
}
|
||||
|
||||
public byte[] getBases() { return bases; }
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for a simple, windowless reference context.
|
||||
* @param locus locus of interest.
|
||||
* @param base reference base at that locus.
|
||||
*/
|
||||
public ReferenceContext( GenomeLoc locus, byte base ) {
|
||||
this( locus, locus, new byte[] { UPPERCASE_REFERENCE ? StringUtil.toUpperCase(base) : base } );
|
||||
this( locus, locus, new ForwardingProvider(base) );
|
||||
}
|
||||
|
||||
// todo -- this really should take the referenceview as an option and only grab the bases if necessary
|
||||
public ReferenceContext( GenomeLoc locus, GenomeLoc window, byte[] bases ) {
|
||||
this( locus, window, new ForwardingProvider(bases) );
|
||||
}
|
||||
|
||||
public ReferenceContext( GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) {
|
||||
// if( !window.containsP(locus) )
|
||||
// throw new StingException("Invalid locus or window; window does not contain locus");
|
||||
|
||||
this.locus = locus;
|
||||
this.window = window;
|
||||
this.bases = bases;
|
||||
this.basesProvider = basesProvider;
|
||||
}
|
||||
|
||||
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(bases);
|
||||
private void fetchBasesFromProvider() {
|
||||
if ( basesCache == null ) {
|
||||
basesCache = basesProvider.getBases();
|
||||
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -99,7 +142,7 @@ public class ReferenceContext {
|
|||
* @return The base at the given locus from the reference.
|
||||
*/
|
||||
public byte getBase() {
|
||||
return bases[(int)(locus.getStart() - window.getStart())];
|
||||
return getBases()[(int)(locus.getStart() - window.getStart())];
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
|
|
@ -121,13 +164,14 @@ public class ReferenceContext {
|
|||
* contain only the base at the given locus.
|
||||
*/
|
||||
public byte[] getBases() {
|
||||
return bases;
|
||||
fetchBasesFromProvider();
|
||||
return basesCache;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public char[] getBasesAsChars() {
|
||||
if ( basesAsCharCached == null )
|
||||
basesAsCharCached = new String(bases).toCharArray();
|
||||
basesAsCharCached = new String(getBases()).toCharArray();
|
||||
return basesAsCharCached;
|
||||
}
|
||||
|
||||
|
|
@ -140,7 +184,7 @@ public class ReferenceContext {
|
|||
* @return
|
||||
*/
|
||||
public byte[] getBasesAtLocus(int n) {
|
||||
|
||||
byte[] bases = getBases();
|
||||
int start = (int)(locus.getStart()-window.getStart());
|
||||
int stop = ( n==(-1) ? bases.length : start+n );
|
||||
|
||||
|
|
@ -153,9 +197,4 @@ public class ReferenceContext {
|
|||
for ( int j = start ; j < stop ; j++) b[i++]=bases[j];
|
||||
return b;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public char[] getBasesAtLocusAsChar(int n) {
|
||||
return new String(getBasesAtLocus(n)).toCharArray();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -165,6 +165,22 @@ public class LocusReferenceView extends ReferenceView {
|
|||
return l;
|
||||
}
|
||||
|
||||
public class Provider implements ReferenceContext.ReferenceContextRefProvider {
|
||||
int refStart, len;
|
||||
|
||||
public Provider( int refStart, int len ) {
|
||||
this.refStart = refStart;
|
||||
this.len = len;
|
||||
}
|
||||
|
||||
public byte[] getBases() {
|
||||
//System.out.printf("Getting bases for location%n");
|
||||
byte[] bases = new byte[len];
|
||||
System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len);
|
||||
return bases;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the reference context associated with this particular point or extended interval on the genome.
|
||||
* @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beyond current bounds, it will be trimmed down.
|
||||
|
|
@ -186,32 +202,10 @@ public class LocusReferenceView extends ReferenceView {
|
|||
refStart = (int)window.getStart()-1;
|
||||
}
|
||||
|
||||
// todo -- how often is this copy unnecessary?
|
||||
int len = (int)window.size();
|
||||
byte[] bases = new byte[len];
|
||||
System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len);
|
||||
return new ReferenceContext( genomeLoc, window, bases );
|
||||
return new ReferenceContext( genomeLoc, window, new Provider(refStart, len));
|
||||
}
|
||||
|
||||
// public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) {
|
||||
// //validateLocation( genomeLoc );
|
||||
//
|
||||
// GenomeLoc window = GenomeLocParser.createGenomeLoc( genomeLoc.getContig(), getWindowStart(genomeLoc), getWindowStop(genomeLoc) );
|
||||
// char[] bases = null;
|
||||
//
|
||||
// if(bounds != null) {
|
||||
// window = trimToBounds(window);
|
||||
// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)(window.getStart() - getWindowStart(bounds)), (int)window.size() ).toCharArray();
|
||||
// }
|
||||
// else {
|
||||
// if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex())
|
||||
// referenceSequence = reference.getSequence(genomeLoc.getContig());
|
||||
// bases = StringUtil.bytesToString( referenceSequence.getBases(), (int)window.getStart()-1, (int)window.size()).toCharArray();
|
||||
// }
|
||||
// return new ReferenceContext( genomeLoc, window, bases );
|
||||
// }
|
||||
|
||||
|
||||
/**
|
||||
* Allow the user to pull reference info from any arbitrary region of the reference.
|
||||
* @param genomeLoc The locus.
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -52,26 +53,29 @@ public class ReadReferenceView extends ReferenceView {
|
|||
super(provider);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the bases of the reference that are aligned to the given read.
|
||||
*
|
||||
* @param read the read for which to extract reference information.
|
||||
*
|
||||
* @return The bases corresponding to this read, or null if the read is unmapped.
|
||||
* If the alignment goes off the end of the contig, return just the portion
|
||||
* mapped to the reference, followed by X's corresponding to the rest of the read.
|
||||
* This indicates that the rest lies off the end of the contig.
|
||||
*/
|
||||
// public char[] getReferenceBases( SAMRecord read ) {
|
||||
// if (read.getReadUnmappedFlag())
|
||||
// return null;
|
||||
// return getReferenceBases( GenomeLocParser.createGenomeLoc(read) );
|
||||
// }
|
||||
protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) {
|
||||
return new Provider(genomeLoc);
|
||||
}
|
||||
|
||||
public class Provider implements ReferenceContext.ReferenceContextRefProvider {
|
||||
GenomeLoc loc;
|
||||
|
||||
public Provider( GenomeLoc loc ) {
|
||||
this.loc = loc;
|
||||
}
|
||||
|
||||
public byte[] getBases() {
|
||||
// System.out.printf("Getting bases for location %s%n", loc);
|
||||
// throw new StingException("x");
|
||||
return getReferenceBases(loc);
|
||||
}
|
||||
}
|
||||
|
||||
public ReferenceContext getReferenceContext( SAMRecord read ) {
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
|
||||
byte[] bases = super.getReferenceBases(loc);
|
||||
return new ReferenceContext( loc, loc, bases );
|
||||
// byte[] bases = super.getReferenceBases(loc);
|
||||
// return new ReferenceContext( loc, loc, bases );
|
||||
return new ReferenceContext( loc, loc, getReferenceBasesProvider(loc) );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
|
@ -64,14 +65,6 @@ public class ReferenceView implements View {
|
|||
* @return A list of the bases starting at the start of the locus (inclusive) and ending
|
||||
* at the end of the locus (inclusive).
|
||||
*/
|
||||
// protected char[] getReferenceBasesAsChars( GenomeLoc genomeLoc ) {
|
||||
//// SAMSequenceRecord sequenceInfo = reference.getSequenceDictionary().getSequence(genomeLoc.getContig());
|
||||
//// long stop = Math.min( genomeLoc.getStop(), sequenceInfo.getSequenceLength() );
|
||||
//// ReferenceSequence subsequence = reference.getSubsequenceAt(genomeLoc.getContig(),genomeLoc.getStart(),stop);
|
||||
//// return (StringUtil.bytesToString(subsequence.getBases()) + Utils.dupString('X', (int)(genomeLoc.getStop() - stop)) ).toCharArray();
|
||||
// return new String(getReferenceBases(genomeLoc)).toCharArray();
|
||||
// }
|
||||
|
||||
final static int BUFFER = 10000;
|
||||
final static byte[] Xs = new byte[BUFFER];
|
||||
static {
|
||||
|
|
|
|||
|
|
@ -102,6 +102,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
|
||||
private boolean REQUIRE_EOF = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
||||
private boolean skipUQUpdate = false;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -375,7 +380,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
if (read.getAttribute(SAMTag.UQ.name()) != null && refBases != null) {
|
||||
if (read.getAttribute(SAMTag.UQ.name()) != null && refBases != null && ! skipUQUpdate ) {
|
||||
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue