Two optimizations. Even more aggressive printProgress meter optimization to only even consider doing work once every 1000 cycles through the engine. Second, GenomeLocParser now uses a single indirection around the contigInfo variable. This class uses a last used cache to retrieve efficiently contig information instead of always returning to the underlying SAMSequenceDictionary hashmap to make genome locs.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5670 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
29857f5ba6
commit
cc78027bd3
|
|
@ -115,6 +115,8 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
||||||
private SimpleTimer timer = new SimpleTimer("Traversal");
|
private SimpleTimer timer = new SimpleTimer("Traversal");
|
||||||
|
|
||||||
// How long can we go without printing some progress info?
|
// How long can we go without printing some progress info?
|
||||||
|
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
|
||||||
|
private int printProgressCheckCounter = 0;
|
||||||
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
|
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
|
||||||
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
||||||
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
|
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
|
||||||
|
|
@ -242,6 +244,10 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
||||||
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
|
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
|
||||||
*/
|
*/
|
||||||
private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) {
|
private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) {
|
||||||
|
if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 )
|
||||||
|
// don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES
|
||||||
|
return;
|
||||||
|
|
||||||
if(!progressMeterInitialized && mustPrint == false ) {
|
if(!progressMeterInitialized && mustPrint == false ) {
|
||||||
logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]");
|
logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]");
|
||||||
logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining",
|
logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining",
|
||||||
|
|
|
||||||
|
|
@ -62,8 +62,82 @@ public class GenomeLocParser {
|
||||||
// Ugly global variable defining the optional ordering of contig elements
|
// Ugly global variable defining the optional ordering of contig elements
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//public static Map<String, Integer> refContigOrdering = null;
|
|
||||||
protected SAMSequenceDictionary contigInfo = null;
|
/**
|
||||||
|
* A wrapper class that provides efficient last used caching for the global
|
||||||
|
* SAMSequenceDictionary underlying all of the GATK engine capabilities
|
||||||
|
*/
|
||||||
|
private final class MasterSequenceDictionary {
|
||||||
|
final private SAMSequenceDictionary dict;
|
||||||
|
|
||||||
|
// cache
|
||||||
|
SAMSequenceRecord lastSSR = null;
|
||||||
|
String lastContig = "";
|
||||||
|
int lastIndex = -1;
|
||||||
|
|
||||||
|
public MasterSequenceDictionary(SAMSequenceDictionary dict) {
|
||||||
|
this.dict = dict;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final int getNSequences() {
|
||||||
|
return dict.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized final SAMSequenceRecord getSequence(final String contig) {
|
||||||
|
if ( isCached(contig) )
|
||||||
|
return lastSSR;
|
||||||
|
else
|
||||||
|
return updateCache(dict.getSequence(contig));
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized final SAMSequenceRecord getSequence(final int index) {
|
||||||
|
if ( isCached(index) )
|
||||||
|
return lastSSR;
|
||||||
|
else
|
||||||
|
return updateCache(dict.getSequence(index));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized final int getSequenceIndex(final String contig) {
|
||||||
|
if ( ! isCached(contig) ) {
|
||||||
|
SAMSequenceRecord rec = dict.getSequence(contig);
|
||||||
|
if ( rec == null )
|
||||||
|
return -1; // not found
|
||||||
|
else
|
||||||
|
updateCache(rec);
|
||||||
|
}
|
||||||
|
|
||||||
|
return lastIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
private synchronized boolean isCached(final String contig) {
|
||||||
|
return lastContig.equals(contig);
|
||||||
|
}
|
||||||
|
|
||||||
|
private synchronized boolean isCached(final int index) {
|
||||||
|
return lastIndex == index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The key algorithm. Given a new record, update the last used record, contig
|
||||||
|
* name, and index.
|
||||||
|
*
|
||||||
|
* @param rec
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private synchronized SAMSequenceRecord updateCache(SAMSequenceRecord rec) {
|
||||||
|
if ( rec == null ) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
lastSSR = rec;
|
||||||
|
lastContig = rec.getSequenceName();
|
||||||
|
lastIndex = rec.getSequenceIndex();
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private MasterSequenceDictionary contigInfo = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set our internal reference contig order
|
* set our internal reference contig order
|
||||||
|
|
@ -78,7 +152,7 @@ public class GenomeLocParser {
|
||||||
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
|
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
|
||||||
throw new UserException.CommandLineException("Failed to load reference dictionary");
|
throw new UserException.CommandLineException("Failed to load reference dictionary");
|
||||||
} else if (contigInfo == null) {
|
} else if (contigInfo == null) {
|
||||||
contigInfo = seqDict;
|
contigInfo = new MasterSequenceDictionary(seqDict);
|
||||||
logger.debug(String.format("Prepared reference sequence contig dictionary"));
|
logger.debug(String.format("Prepared reference sequence contig dictionary"));
|
||||||
for (SAMSequenceRecord contig : seqDict.getSequences()) {
|
for (SAMSequenceRecord contig : seqDict.getSequences()) {
|
||||||
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
|
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
|
||||||
|
|
@ -123,7 +197,6 @@ public class GenomeLocParser {
|
||||||
* @return a GenomeLoc representing the String
|
* @return a GenomeLoc representing the String
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public GenomeLoc parseGenomeInterval(final String str) {
|
public GenomeLoc parseGenomeInterval(final String str) {
|
||||||
GenomeLoc ret = parseGenomeLoc(str);
|
GenomeLoc ret = parseGenomeLoc(str);
|
||||||
exceptionOnInvalidGenomeLocBounds(ret);
|
exceptionOnInvalidGenomeLocBounds(ret);
|
||||||
|
|
@ -263,7 +336,7 @@ public class GenomeLocParser {
|
||||||
*/
|
*/
|
||||||
private boolean isContigValid(String contig) {
|
private boolean isContigValid(String contig) {
|
||||||
int contigIndex = contigInfo.getSequenceIndex(contig);
|
int contigIndex = contigInfo.getSequenceIndex(contig);
|
||||||
return contigIndex >= 0 && contigIndex < contigInfo.size();
|
return contigIndex >= 0 && contigIndex < contigInfo.getNSequences();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -426,7 +499,7 @@ public class GenomeLocParser {
|
||||||
if (toReturn.getContigIndex() < 0) {
|
if (toReturn.getContigIndex() < 0) {
|
||||||
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0");
|
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0");
|
||||||
}
|
}
|
||||||
if (toReturn.getContigIndex() >= contigInfo.getSequences().size()) {
|
if (toReturn.getContigIndex() >= contigInfo.getNSequences()) {
|
||||||
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count");
|
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -468,7 +541,7 @@ public class GenomeLocParser {
|
||||||
public boolean validGenomeLoc(GenomeLoc loc) {
|
public boolean validGenomeLoc(GenomeLoc loc) {
|
||||||
// quick check before we get the contig size, is the contig number valid
|
// quick check before we get the contig size, is the contig number valid
|
||||||
if ((loc.getContigIndex() < 0) || // the contig index has to be positive
|
if ((loc.getContigIndex() < 0) || // the contig index has to be positive
|
||||||
(loc.getContigIndex() >= contigInfo.getSequences().size())) // the contig must be in the integer range of contigs)
|
(loc.getContigIndex() >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
int contigSize = contigInfo.getSequence(loc.getContigIndex()).getSequenceLength();
|
int contigSize = contigInfo.getSequence(loc.getContigIndex()).getSequenceLength();
|
||||||
|
|
@ -510,7 +583,7 @@ public class GenomeLocParser {
|
||||||
* performs interval-style validation: contig is valid and atart and stop less than the end
|
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||||
*/
|
*/
|
||||||
public boolean validGenomeLoc(int contigIndex, int start, int stop) {
|
public boolean validGenomeLoc(int contigIndex, int start, int stop) {
|
||||||
if (contigIndex < 0 || contigIndex >= contigInfo.size()) return false;
|
if (contigIndex < 0 || contigIndex >= contigInfo.getNSequences()) return false;
|
||||||
return validGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contigIndex), contigIndex, start, stop));
|
return validGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contigIndex), contigIndex, start, stop));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue